From 4a4bdfea7cb75b5c8b222932796f64a14027d512 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Wed, 10 Feb 2016 11:10:44 -0600 Subject: powerpc/pseries: Refactor dlpar_add_lmb() code Re-factor dlpar_lmb_add() routine by moving the validation of the lmb flags and the acquireing of the DRC to a wrapper around the work to add the memory to the system. This is done to make handling of errors during the addition of the memory easier and to facilitate the upcoming addition of updating the lmb's affinity prior to adding the memory. Signed-off-by: Nathan Fontenot Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index e9ff44c..294acfd 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -384,43 +384,32 @@ static int dlpar_memory_remove_by_index(u32 drc_index, struct property *prop) #endif /* CONFIG_MEMORY_HOTREMOVE */ -static int dlpar_add_lmb(struct of_drconf_cell *lmb) +static int dlpar_add_lmb_memory(struct of_drconf_cell *lmb) { struct memory_block *mem_block; unsigned long block_sz; int nid, rc; - if (lmb->flags & DRCONF_MEM_ASSIGNED) - return -EINVAL; - block_sz = memory_block_size_bytes(); - rc = dlpar_acquire_drc(lmb->drc_index); - if (rc) - return rc; - /* Find the node id for this address */ nid = memory_add_physaddr_to_nid(lmb->base_addr); /* Add the memory */ rc = add_memory(nid, lmb->base_addr, block_sz); - if (rc) { - dlpar_release_drc(lmb->drc_index); + if (rc) return rc; - } /* Register this block of memory */ rc = memblock_add(lmb->base_addr, block_sz); if (rc) { remove_memory(nid, lmb->base_addr, block_sz); - dlpar_release_drc(lmb->drc_index); return rc; } mem_block = lmb_to_memblock(lmb); if (!mem_block) { remove_memory(nid, lmb->base_addr, block_sz); - dlpar_release_drc(lmb->drc_index); return -EINVAL; } @@ -428,7 +417,6 @@ static int dlpar_add_lmb(struct of_drconf_cell *lmb) put_device(&mem_block->dev); if (rc) { remove_memory(nid, lmb->base_addr, block_sz); - dlpar_release_drc(lmb->drc_index); return rc; } @@ -436,6 +424,24 @@ static int dlpar_add_lmb(struct of_drconf_cell *lmb) return 0; } +static int dlpar_add_lmb(struct of_drconf_cell *lmb) +{ + int rc; + + if (lmb->flags & DRCONF_MEM_ASSIGNED) + return -EINVAL; + + rc = dlpar_acquire_drc(lmb->drc_index); + if (rc) + return rc; + + rc = dlpar_add_lmb_memory(lmb); + if (rc) + dlpar_release_drc(lmb->drc_index); + + return rc; +} + static int dlpar_memory_add_by_count(u32 lmbs_to_add, struct property *prop) { struct of_drconf_cell *lmbs; -- cgit v0.10.2 From bdf5fc6338047cfeba98d2ab629997f3013e610f Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Wed, 10 Feb 2016 11:12:13 -0600 Subject: powerpc/pseries: Update LMB associativity index during DLPAR add/remove The associativity array index specified for a LMB in the device tree, /ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory, needs to be updated prior to DLPAR adding a LMB and after DLPAR removing a LMB. Without doing this step in the DLPAR add process a LMB could be configured with the incorrect affinity. For a LMB that was not present at boot the affinity index is set to 0xffffffff, which defaults to adding the LMB to the first online node since the index is not a valid value. Or, the affinity index could contain a stale value if the LMB was present at boot but later DLPAR removed and is being DLPAR added back to the system. This patch adds a step in the DLPAR add flow to look up the associativity index for a LMB prior to adding a LMB and setting the associativity to 0xffffffff when a LMB is removed. This patch also modifies the DLPAR add/remove flow to no longer do a single update of the device tree property after all of the requested DLPAR operations are complete and now does a property update during the add or remove of each LMB. Signed-off-by: Nathan Fontenot Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 294acfd..2ce1385 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -116,6 +116,155 @@ static struct property *dlpar_clone_drconf_property(struct device_node *dn) return new_prop; } +static void dlpar_update_drconf_property(struct device_node *dn, + struct property *prop) +{ + struct of_drconf_cell *lmbs; + u32 num_lmbs, *p; + int i; + + /* Convert the property back to BE */ + p = prop->value; + num_lmbs = *p; + *p = cpu_to_be32(*p); + p++; + + lmbs = (struct of_drconf_cell *)p; + for (i = 0; i < num_lmbs; i++) { + lmbs[i].base_addr = cpu_to_be64(lmbs[i].base_addr); + lmbs[i].drc_index = cpu_to_be32(lmbs[i].drc_index); + lmbs[i].flags = cpu_to_be32(lmbs[i].flags); + } + + rtas_hp_event = true; + of_update_property(dn, prop); + rtas_hp_event = false; +} + +static int dlpar_update_device_tree_lmb(struct of_drconf_cell *lmb) +{ + struct device_node *dn; + struct property *prop; + struct of_drconf_cell *lmbs; + u32 *p, num_lmbs; + int i; + + dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (!dn) + return -ENODEV; + + prop = dlpar_clone_drconf_property(dn); + if (!prop) { + of_node_put(dn); + return -ENODEV; + } + + p = prop->value; + num_lmbs = *p++; + lmbs = (struct of_drconf_cell *)p; + + for (i = 0; i < num_lmbs; i++) { + if (lmbs[i].drc_index == lmb->drc_index) { + lmbs[i].flags = lmb->flags; + lmbs[i].aa_index = lmb->aa_index; + + dlpar_update_drconf_property(dn, prop); + break; + } + } + + of_node_put(dn); + return 0; +} + +static u32 lookup_lmb_associativity_index(struct of_drconf_cell *lmb) +{ + struct device_node *parent, *lmb_node, *dr_node; + const u32 *lmb_assoc; + const u32 *assoc_arrays; + u32 aa_index; + int aa_arrays, aa_array_entries, aa_array_sz; + int i; + + parent = of_find_node_by_path("/"); + if (!parent) + return -ENODEV; + + lmb_node = dlpar_configure_connector(cpu_to_be32(lmb->drc_index), + parent); + of_node_put(parent); + if (!lmb_node) + return -EINVAL; + + lmb_assoc = of_get_property(lmb_node, "ibm,associativity", NULL); + if (!lmb_assoc) { + dlpar_free_cc_nodes(lmb_node); + return -ENODEV; + } + + dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (!dr_node) { + dlpar_free_cc_nodes(lmb_node); + return -ENODEV; + } + + assoc_arrays = of_get_property(dr_node, + "ibm,associativity-lookup-arrays", + NULL); + of_node_put(dr_node); + if (!assoc_arrays) { + dlpar_free_cc_nodes(lmb_node); + return -ENODEV; + } + + /* The ibm,associativity-lookup-arrays property is defined to be + * a 32-bit value specifying the number of associativity arrays + * followed by a 32-bitvalue specifying the number of entries per + * array, followed by the associativity arrays. + */ + aa_arrays = be32_to_cpu(assoc_arrays[0]); + aa_array_entries = be32_to_cpu(assoc_arrays[1]); + aa_array_sz = aa_array_entries * sizeof(u32); + + aa_index = -1; + for (i = 0; i < aa_arrays; i++) { + int indx = (i * aa_array_entries) + 2; + + if (memcmp(&assoc_arrays[indx], &lmb_assoc[1], aa_array_sz)) + continue; + + aa_index = i; + break; + } + + dlpar_free_cc_nodes(lmb_node); + return aa_index; +} + +static int dlpar_add_device_tree_lmb(struct of_drconf_cell *lmb) +{ + int aa_index; + + lmb->flags |= DRCONF_MEM_ASSIGNED; + + aa_index = lookup_lmb_associativity_index(lmb); + if (aa_index < 0) { + pr_err("Couldn't find associativity index for drc index %x\n", + lmb->drc_index); + return aa_index; + } + + lmb->aa_index = aa_index; + return dlpar_update_device_tree_lmb(lmb); +} + +static int dlpar_remove_device_tree_lmb(struct of_drconf_cell *lmb) +{ + lmb->flags &= ~DRCONF_MEM_ASSIGNED; + lmb->aa_index = 0xffffffff; + return dlpar_update_device_tree_lmb(lmb); +} + static struct memory_block *lmb_to_memblock(struct of_drconf_cell *lmb) { unsigned long section_nr; @@ -243,8 +392,8 @@ static int dlpar_remove_lmb(struct of_drconf_cell *lmb) memblock_remove(lmb->base_addr, block_sz); dlpar_release_drc(lmb->drc_index); + dlpar_remove_device_tree_lmb(lmb); - lmb->flags &= ~DRCONF_MEM_ASSIGNED; return 0; } @@ -435,9 +584,19 @@ static int dlpar_add_lmb(struct of_drconf_cell *lmb) if (rc) return rc; + rc = dlpar_add_device_tree_lmb(lmb); + if (rc) { + pr_err("Couldn't update device tree for drc index %x\n", + lmb->drc_index); + dlpar_release_drc(lmb->drc_index); + return rc; + } + rc = dlpar_add_lmb_memory(lmb); - if (rc) + if (rc) { + dlpar_remove_device_tree_lmb(lmb); dlpar_release_drc(lmb->drc_index); + } return rc; } @@ -542,31 +701,6 @@ static int dlpar_memory_add_by_index(u32 drc_index, struct property *prop) return rc; } -static void dlpar_update_drconf_property(struct device_node *dn, - struct property *prop) -{ - struct of_drconf_cell *lmbs; - u32 num_lmbs, *p; - int i; - - /* Convert the property back to BE */ - p = prop->value; - num_lmbs = *p; - *p = cpu_to_be32(*p); - p++; - - lmbs = (struct of_drconf_cell *)p; - for (i = 0; i < num_lmbs; i++) { - lmbs[i].base_addr = cpu_to_be64(lmbs[i].base_addr); - lmbs[i].drc_index = cpu_to_be32(lmbs[i].drc_index); - lmbs[i].flags = cpu_to_be32(lmbs[i].flags); - } - - rtas_hp_event = true; - of_update_property(dn, prop); - rtas_hp_event = false; -} - int dlpar_memory(struct pseries_hp_errorlog *hp_elog) { struct device_node *dn; @@ -614,10 +748,7 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog) break; } - if (rc) - dlpar_free_drconf_property(prop); - else - dlpar_update_drconf_property(dn, prop); + dlpar_free_drconf_property(prop); dlpar_memory_out: of_node_put(dn); -- cgit v0.10.2 From d88e397f12ba0c8c672952c4711d7280525c54ed Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Wed, 9 Mar 2016 12:53:13 +0100 Subject: cxl: Remove dead code Function cxl_get_phys_dev() was removed from the kernel API by a previous patch, but it's actually dead code. Remove it. Signed-off-by: Frederic Barrat Acked-by: Ian Munsie Reviewed-by: Andrew Donnellan Acked-by: Michael Neuling Signed-off-by: Michael Ellerman diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c index 2107c94..8075823 100644 --- a/drivers/misc/cxl/api.c +++ b/drivers/misc/cxl/api.c @@ -68,15 +68,6 @@ struct cxl_context *cxl_get_context(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(cxl_get_context); -struct device *cxl_get_phys_dev(struct pci_dev *dev) -{ - struct cxl_afu *afu; - - afu = cxl_pci_to_afu(dev); - - return afu->adapter->dev.parent; -} - int cxl_release_context(struct cxl_context *ctx) { if (ctx->status >= STARTED) -- cgit v0.10.2 From f8a25db47ebc11fe228735d916e480d4b2ebe611 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Tue, 15 Mar 2016 21:14:12 +1100 Subject: powerpc/powernv: Use the "unknown" checkstop type as a fallback The HMI code knows about three types of errors: CORE, NX and UNKNOWN. If OPAL were to add a new type, it would not be handled at all since there is no fallback case. Instead of explicitly checking for UNKNOWN, treat any checkstop type without a handler as unknown. Signed-off-by: Russell Currey Reviewed-by: Daniel Axtens Reviewed-by: Andrew Donnellan Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c index d000f4e..c0a8201 100644 --- a/arch/powerpc/platforms/powernv/opal-hmi.c +++ b/arch/powerpc/platforms/powernv/opal-hmi.c @@ -150,15 +150,17 @@ static void print_nx_checkstop_reason(const char *level, static void print_checkstop_reason(const char *level, struct OpalHMIEvent *hmi_evt) { - switch (hmi_evt->u.xstop_error.xstop_type) { + uint8_t type = hmi_evt->u.xstop_error.xstop_type; + switch (type) { case CHECKSTOP_TYPE_CORE: print_core_checkstop_reason(level, hmi_evt); break; case CHECKSTOP_TYPE_NX: print_nx_checkstop_reason(level, hmi_evt); break; - case CHECKSTOP_TYPE_UNKNOWN: - printk("%s Unknown Malfunction Alert.\n", level); + default: + printk("%s Unknown Malfunction Alert of type %d\n", + level, type); break; } } -- cgit v0.10.2 From 1f4c66e80574e85aff25179fc71bdadfb1a701ef Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 16 Mar 2016 21:56:20 +1100 Subject: powerpc/mm: Remove long disabled SLB code We have a bunch of SLB related code in the tree which is there to handle dynamic VSIDs - but currently it's all disabled at compile time. The comments say "Keep that around for when we re-implement dynamic VSIDs". But that was over 10 years ago (commit 3c726f8dee6f ("[PATCH] ppc64: support 64k pages")). The chance that it would still work unchanged is minimal, and in the meantime it's confusing to folks browsing/grepping the code. If we ever want to re-instate it, it's in the git history. Signed-off-by: Michael Ellerman Acked-by: Balbir Singh diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7716ceb..e0e1ff4 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -209,11 +209,6 @@ data_access_slb_pSeries: EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380) std r3,PACA_EXSLB+EX_R3(r13) mfspr r3,SPRN_DAR -#ifdef __DISABLED__ - /* Keep that around for when we re-implement dynamic VSIDs */ - cmpdi r3,0 - bge slb_miss_user_pseries -#endif /* __DISABLED__ */ mfspr r12,SPRN_SRR1 #ifndef CONFIG_RELOCATABLE b slb_miss_realmode @@ -240,11 +235,6 @@ instruction_access_slb_pSeries: EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x480) std r3,PACA_EXSLB+EX_R3(r13) mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ -#ifdef __DISABLED__ - /* Keep that around for when we re-implement dynamic VSIDs */ - cmpdi r3,0 - bge slb_miss_user_pseries -#endif /* __DISABLED__ */ mfspr r12,SPRN_SRR1 #ifndef CONFIG_RELOCATABLE b slb_miss_realmode @@ -709,34 +699,6 @@ system_reset_fwnmi: #endif /* CONFIG_PPC_PSERIES */ -#ifdef __DISABLED__ -/* - * This is used for when the SLB miss handler has to go virtual, - * which doesn't happen for now anymore but will once we re-implement - * dynamic VSIDs for shared page tables - */ -slb_miss_user_pseries: - std r10,PACA_EXGEN+EX_R10(r13) - std r11,PACA_EXGEN+EX_R11(r13) - std r12,PACA_EXGEN+EX_R12(r13) - GET_SCRATCH0(r10) - ld r11,PACA_EXSLB+EX_R9(r13) - ld r12,PACA_EXSLB+EX_R3(r13) - std r10,PACA_EXGEN+EX_R13(r13) - std r11,PACA_EXGEN+EX_R9(r13) - std r12,PACA_EXGEN+EX_R3(r13) - clrrdi r12,r13,32 - mfmsr r10 - mfspr r11,SRR0 /* save SRR0 */ - ori r12,r12,slb_miss_user_common@l /* virt addr of handler */ - ori r10,r10,MSR_IR|MSR_DR|MSR_RI - mtspr SRR0,r12 - mfspr r12,SRR1 /* and SRR1 */ - mtspr SRR1,r10 - rfid - b . /* prevent spec. execution */ -#endif /* __DISABLED__ */ - #ifdef CONFIG_KVM_BOOK3S_64_HANDLER kvmppc_skip_interrupt: /* @@ -1012,70 +974,6 @@ instruction_access_common: STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception) -/* - * Here is the common SLB miss user that is used when going to virtual - * mode for SLB misses, that is currently not used - */ -#ifdef __DISABLED__ - .align 7 - .globl slb_miss_user_common -slb_miss_user_common: - mflr r10 - std r3,PACA_EXGEN+EX_DAR(r13) - stw r9,PACA_EXGEN+EX_CCR(r13) - std r10,PACA_EXGEN+EX_LR(r13) - std r11,PACA_EXGEN+EX_SRR0(r13) - bl slb_allocate_user - - ld r10,PACA_EXGEN+EX_LR(r13) - ld r3,PACA_EXGEN+EX_R3(r13) - lwz r9,PACA_EXGEN+EX_CCR(r13) - ld r11,PACA_EXGEN+EX_SRR0(r13) - mtlr r10 - beq- slb_miss_fault - - andi. r10,r12,MSR_RI /* check for unrecoverable exception */ - beq- unrecov_user_slb - mfmsr r10 - -.machine push -.machine "power4" - mtcrf 0x80,r9 -.machine pop - - clrrdi r10,r10,2 /* clear RI before setting SRR0/1 */ - mtmsrd r10,1 - - mtspr SRR0,r11 - mtspr SRR1,r12 - - ld r9,PACA_EXGEN+EX_R9(r13) - ld r10,PACA_EXGEN+EX_R10(r13) - ld r11,PACA_EXGEN+EX_R11(r13) - ld r12,PACA_EXGEN+EX_R12(r13) - ld r13,PACA_EXGEN+EX_R13(r13) - rfid - b . - -slb_miss_fault: - EXCEPTION_PROLOG_COMMON(0x380, PACA_EXGEN) - ld r4,PACA_EXGEN+EX_DAR(r13) - li r5,0 - std r4,_DAR(r1) - std r5,_DSISR(r1) - b handle_page_fault - -unrecov_user_slb: - EXCEPTION_PROLOG_COMMON(0x4200, PACA_EXGEN) - RECONCILE_IRQ_STATE(r10, r11) - bl save_nvgprs -1: addi r3,r1,STACK_FRAME_OVERHEAD - bl unrecoverable_exception - b 1b - -#endif /* __DISABLED__ */ - - /* * Machine check is different because we use a different * save area: PACA_EXMC instead of PACA_EXGEN. diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 825b687..48fc28b 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -32,7 +32,6 @@ enum slb_index { }; extern void slb_allocate_realmode(unsigned long ea); -extern void slb_allocate_user(unsigned long ea); static void slb_allocate(unsigned long ea) { diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index 736d18b..d337400 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -179,56 +179,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) li r11,SLB_VSID_USER /* flags don't much matter */ b slb_finish_load -#ifdef __DISABLED__ - -/* void slb_allocate_user(unsigned long ea); - * - * Create an SLB entry for the given EA (user or kernel). - * r3 = faulting address, r13 = PACA - * r9, r10, r11 are clobbered by this function - * No other registers are examined or changed. - * - * It is called with translation enabled in order to be able to walk the - * page tables. This is not currently used. - */ -_GLOBAL(slb_allocate_user) - /* r3 = faulting address */ - srdi r10,r3,28 /* get esid */ - - crset 4*cr7+lt /* set "user" flag for later */ - - /* check if we fit in the range covered by the pagetables*/ - srdi. r9,r3,PGTABLE_EADDR_SIZE - crnot 4*cr0+eq,4*cr0+eq - beqlr - - /* now we need to get to the page tables in order to get the page - * size encoding from the PMD. In the future, we'll be able to deal - * with 1T segments too by getting the encoding from the PGD instead - */ - ld r9,PACAPGDIR(r13) - cmpldi cr0,r9,0 - beqlr - rlwinm r11,r10,8,25,28 - ldx r9,r9,r11 /* get pgd_t */ - cmpldi cr0,r9,0 - beqlr - rlwinm r11,r10,3,17,28 - ldx r9,r9,r11 /* get pmd_t */ - cmpldi cr0,r9,0 - beqlr - - /* build vsid flags */ - andi. r11,r9,SLB_VSID_LLP - ori r11,r11,SLB_VSID_USER - - /* get context to calculate proto-VSID */ - ld r9,PACACONTEXTID(r13) - /* fall through slb_finish_load */ - -#endif /* __DISABLED__ */ - - /* * Finish loading of an SLB entry and return * -- cgit v0.10.2 From b05fac783a75c6e9f3af2ce0824f3a6ff12540ab Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 16 Mar 2016 21:36:05 +1100 Subject: powerpc: Remove orphaned asm implementation of abs() This has been unused since ~2004, remove it. Reported-by: Al Viro Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index bf5160f..285ca8c 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -599,12 +599,6 @@ _GLOBAL(__bswapdi2) mr r4,r10 blr -_GLOBAL(abs) - srawi r4,r3,31 - xor r3,r3,r4 - sub r3,r3,r4 - blr - #ifdef CONFIG_SMP _GLOBAL(start_secondary_resume) /* Reset stack */ -- cgit v0.10.2 From b4c6afdc3a1ad73542c8e92220c30b8f953c51de Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 16 Mar 2016 21:36:06 +1100 Subject: powerpc: Make generic_memcpy() private to copy_32.S generic_memcpy() is only called from copy_32.S, so there's no reason for it to be global. Reported-by: Al Viro Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S index c44df2d..99f37f2 100644 --- a/arch/powerpc/lib/copy_32.S +++ b/arch/powerpc/lib/copy_32.S @@ -217,7 +217,7 @@ _GLOBAL(memcpy) bdnz 40b 65: blr -_GLOBAL(generic_memcpy) +generic_memcpy: srwi. r7,r5,3 addi r6,r3,-4 addi r4,r4,-4 -- cgit v0.10.2 From 8ee26530bb189909cca6f4b1f36a7577c840c526 Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Tue, 16 Feb 2016 23:06:05 +1100 Subject: powerpc/eeh: rename EEH from "extended" to "enhanced" error handling IBM online documentation for EEH uses "extended error handling" and "enhanced error handling" to refer to the same thing, in different places. The only place mentioning it as "enhanced error handling" in the kernel is the MAINTAINERS file, and it's "extended" in some documentation. IBM originally defined EEH as "enhanced error handling", so standardise all mentions of EEH to use that term. Signed-off-by: Russell Currey Acked-by: Gavin Shan Signed-off-by: Michael Ellerman diff --git a/Documentation/powerpc/eeh-pci-error-recovery.txt b/Documentation/powerpc/eeh-pci-error-recovery.txt index 9d4e33d..6781892 100644 --- a/Documentation/powerpc/eeh-pci-error-recovery.txt +++ b/Documentation/powerpc/eeh-pci-error-recovery.txt @@ -12,7 +12,7 @@ Overview: The IBM POWER-based pSeries and iSeries computers include PCI bus controller chips that have extended capabilities for detecting and reporting a large variety of PCI bus error conditions. These features -go under the name of "EEH", for "Extended Error Handling". The EEH +go under the name of "EEH", for "Enhanced Error Handling". The EEH hardware features allow PCI bus errors to be cleared and a PCI card to be "rebooted", without also having to reboot the operating system. diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 6544017..c4e41cf 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -48,7 +48,7 @@ /** Overview: - * EEH, or "Extended Error Handling" is a PCI bridge technology for + * EEH, or "Enhanced Error Handling" is a PCI bridge technology for * dealing with PCI bus errors that can't be dealt with within the * usual PCI framework, except by check-stopping the CPU. Systems * that are designed for high-availability/reliability cannot afford -- cgit v0.10.2 From 8038665fb54f1e54785c63be111647abebfe9f20 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 27 Mar 2016 18:08:15 -0400 Subject: powerpc/cell: Make spu_base.c explicitly non-modular The Kconfig currently controlling compilation of this code is: arch/powerpc/platforms/cell/Kconfig:config SPU_BASE arch/powerpc/platforms/cell/Kconfig: bool ...meaning that it currently is not being built as a module by anyone. Lets remove the modular code that is essentially orphaned, so that when reading the driver there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We also delete the MODULE_LICENSE tag etc. since all that information is already contained at the top of the file in the comments. Cc: Arnd Bergmann Signed-off-by: Paul Gortmaker Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index f7af74f..3ede04f 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include @@ -805,7 +805,4 @@ static int __init init_spu_base(void) out: return ret; } -module_init(init_spu_base); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnd Bergmann "); +device_initcall(init_spu_base); -- cgit v0.10.2 From c0c523897d1f83bc8484cb58d1f51d935b23cee5 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 27 Mar 2016 18:08:16 -0400 Subject: powerpc: make kernel/nvram_64.c explicitly non-modular The Makefile/Kconfig currently controlling compilation of this code is: obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o nvram_64.o firmware.o arch/powerpc/platforms/Kconfig.cputype:config PPC64 arch/powerpc/platforms/Kconfig.cputype: bool "64-bit kernel" ...meaning that it currently is not being built as a module by anyone. Lets remove the modular code that is essentially orphaned, so that when reading the driver there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We don't replace module.h with init.h since the file already has that. We delete the MODULE_LICENSE tag since that information is already contained at the top of the file in the comments. Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Hari Bathini Cc: Nathan Fontenot Cc: Andrzej Hajda Cc: Anton Blanchard Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Paul Gortmaker Reviewed-by: Nathan Fontenot Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 0cab9e8..856f9a7 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -15,8 +15,6 @@ * parsing code. */ -#include - #include #include #include @@ -1231,12 +1229,4 @@ static int __init nvram_init(void) return rc; } - -static void __exit nvram_cleanup(void) -{ - misc_deregister( &nvram_dev ); -} - -module_init(nvram_init); -module_exit(nvram_cleanup); -MODULE_LICENSE("GPL"); +device_initcall(nvram_init); -- cgit v0.10.2 From 6a0bcab9c6c337e14689cabd276f00649e980cf1 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 27 Mar 2016 18:08:17 -0400 Subject: drivers/cpufreq: make ppc_cbe_cpufreq_pmi driver explicitly non-modular The Kconfig for this driver is currently: config CPU_FREQ_CBE_PMI bool "CBE frequency scaling using PMI interface" ...meaning that it currently is not being built as a module by anyone. Lets remove the modular and unused code here, so that when reading the driver there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We also delete the MODULE_LICENSE tag etc. since all that information is already contained at the top of the file in the comments. Cc: "Rafael J. Wysocki" Cc: Viresh Kumar Cc: Christian Krafft Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-pm@vger.kernel.org Signed-off-by: Paul Gortmaker Acked-by: Viresh Kumar Signed-off-by: Michael Ellerman diff --git a/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c b/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c index 7969f76..7c4cd5c 100644 --- a/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c +++ b/drivers/cpufreq/ppc_cbe_cpufreq_pmi.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include @@ -142,15 +142,4 @@ static int __init cbe_cpufreq_pmi_init(void) return 0; } - -static void __exit cbe_cpufreq_pmi_exit(void) -{ - cpufreq_unregister_notifier(&pmi_notifier_block, CPUFREQ_POLICY_NOTIFIER); - pmi_unregister_handler(&cbe_pmi_handler); -} - -module_init(cbe_cpufreq_pmi_init); -module_exit(cbe_cpufreq_pmi_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Christian Krafft "); +device_initcall(cbe_cpufreq_pmi_init); -- cgit v0.10.2 From a7ee539584acf4a565b7439ceaea8401ec210353 Mon Sep 17 00:00:00 2001 From: Rashmica Gupta Date: Fri, 19 Feb 2016 16:38:47 +1100 Subject: powerpc/Kconfig: Update config option based on page size. Currently on PPC64 changing kernel pagesize from 4K to 64K leaves FORCE_MAX_ZONEORDER set to 13 - which produces a compile error. The error occurs because of the following constraint (from include/linux/mmzone.h) being violated: MAX_ORDER -1 + PAGESHIFT <= SECTION_SIZE_BITS. Expanding this out, we get: FORCE_MAX_ZONEBITS <= 25 - PAGESHIFT, which requires, for a 64K page, FORCE_MAX_ZONEBITS <= 9. Thus set max value of FORCE_MAX_ZONEORDER for 64K pages to 9, and 4K pages to 13. Also, check the minimum value: In include/linux/huge_mm.h, we have the constraint HPAGE_PMD_ORDER < MAX_ORDER which expands out to: PTE_INDEX_SIZE < FORCE_MAX_ZONEORDER. PTE_INDEX_SIZE is: 9 (4k hash or no hash 4K pgtable) or 8 (64K hash or no hash 64K pgtable). Thus a min value of 8 for 64K pages and 9 for 4K pages is reasonable. So, update the range of FORCE_MAX_ZONEORDER from 9-64 to 8-9 for 64K pages and from 13-64 to 9-13 for 4K pages. Signed-off-by: Rashmica Gupta Reviewed-by: Balbir Singh Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7cd32c0..fbebde0 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -605,9 +605,9 @@ endchoice config FORCE_MAX_ZONEORDER int "Maximum zone order" - range 9 64 if PPC64 && PPC_64K_PAGES + range 8 9 if PPC64 && PPC_64K_PAGES default "9" if PPC64 && PPC_64K_PAGES - range 13 64 if PPC64 && !PPC_64K_PAGES + range 9 13 if PPC64 && !PPC_64K_PAGES default "13" if PPC64 && !PPC_64K_PAGES range 9 64 if PPC32 && PPC_16K_PAGES default "9" if PPC32 && PPC_16K_PAGES -- cgit v0.10.2 From e3824e4281b8e31d55c08a06b20abb4f677a0baf Mon Sep 17 00:00:00 2001 From: Russell Currey Date: Wed, 2 Mar 2016 17:12:45 +1100 Subject: powerpc/swsusp: Only use tlbie in POWER4 mode If CONFIG_HIBERNATION and CONFIG_PPC_BOOK3S_64 are set, code in arch/powerpc/kernel/swsusp_amd64.S which uses the tlbia macro is enabled. tlbia in turn uses tlbie, an instruction which takes more than one operand in newer versions of POWER. As such, the kernel fails to build due to the assembler complaining about missing operands. This can be worked around by assembling the instruction as in POWER4. This fixes the build breakage caused by enabling CONFIG_HIBERNATION. Hibernation is currently only tested on G5 PowerMacs, which should be unaffected by this change. For other platforms it may now build, whether or not it works is a different story. Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 499d9f8..2b31632 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -427,7 +427,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) li r4,1024; \ mtctr r4; \ lis r4,KERNELBASE@h; \ + .machine push; \ + .machine "power4"; \ 0: tlbie r4; \ + .machine pop; \ addi r4,r4,0x1000; \ bdnz 0b #endif -- cgit v0.10.2 From ef69b03dfd321ffa1bcb4248e9b8744f653e3859 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 2 Mar 2016 12:36:56 +1100 Subject: MAINTAINERS: Add powerpc drivers to the powerpc section We'd like folks working on drivers for powerpc to also Cc linuxppc-dev, so we can be aware of what's going on in drivers and/or review the changes. So add patterns to the powerpc MAINTAINERS section to catch some of the drivers we're interested in. Signed-off-by: Michael Ellerman diff --git a/MAINTAINERS b/MAINTAINERS index 61a323a..9a955ea 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6608,6 +6608,19 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git S: Supported F: Documentation/powerpc/ F: arch/powerpc/ +F: drivers/char/tpm/tpm_ibmvtpm* +F: drivers/crypto/nx/ +F: drivers/crypto/vmx/ +F: drivers/net/ethernet/ibm/ibmveth.* +F: drivers/net/ethernet/ibm/ibmvnic.* +F: drivers/pci/hotplug/rpa* +F: drivers/scsi/ibmvscsi/ +N: opal +N: /pmac +N: powermac +N: powernv +N: [^a-z0-9]ps3 +N: pseries LINUX FOR POWER MACINTOSH M: Benjamin Herrenschmidt -- cgit v0.10.2 From b3d79eaa6c97b04965fa479652dd0317253f81d1 Mon Sep 17 00:00:00 2001 From: Vipin K Parashar Date: Tue, 1 Sep 2015 04:52:43 +0530 Subject: powerpc/opal: Assign numbers to OPAL_MSG macros of enum opal_msg_type This patch assigns numbers to OPAL_MSG macros of enum opal_msg_type to prevent accidental insertion of any new value in between and thus break OPAL API. This is also helpful while backporting mainline kernel changes to distros which run downlevel kernel and thus don't have all OPAL messages defined, avoiding unnecessary bugs due to enum values order mismatch. Signed-off-by: Vipin K Parashar Acked-by: Stewart Smith Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index f8faaae..9bb8ddf 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -368,16 +368,16 @@ enum OpalLPCAddressType { }; enum opal_msg_type { - OPAL_MSG_ASYNC_COMP = 0, /* params[0] = token, params[1] = rc, + OPAL_MSG_ASYNC_COMP = 0, /* params[0] = token, params[1] = rc, * additional params function-specific */ - OPAL_MSG_MEM_ERR, - OPAL_MSG_EPOW, - OPAL_MSG_SHUTDOWN, /* params[0] = 1 reboot, 0 shutdown */ - OPAL_MSG_HMI_EVT, - OPAL_MSG_DPO, - OPAL_MSG_PRD, - OPAL_MSG_OCC, + OPAL_MSG_MEM_ERR = 1, + OPAL_MSG_EPOW = 2, + OPAL_MSG_SHUTDOWN = 3, /* params[0] = 1 reboot, 0 shutdown */ + OPAL_MSG_HMI_EVT = 4, + OPAL_MSG_DPO = 5, + OPAL_MSG_PRD = 6, + OPAL_MSG_OCC = 7, OPAL_MSG_TYPE_MAX, }; -- cgit v0.10.2 From 86c9ffcc1ed17497a5df473232a7e654fa8cfa1f Mon Sep 17 00:00:00 2001 From: Philippe Bergheaud Date: Thu, 31 Mar 2016 11:19:27 +0200 Subject: powerpc: Define PVR value for POWER8NVL processor Signed-off-by: Philippe Bergheaud Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index f5f4c66..cf09c6e 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -1182,6 +1182,7 @@ #define PVR_970GX 0x0045 #define PVR_POWER7p 0x004A #define PVR_POWER8E 0x004B +#define PVR_POWER8NVL 0x004C #define PVR_POWER8 0x004D #define PVR_BE 0x0070 #define PVR_PA6T 0x0090 -- cgit v0.10.2 From aa14138a51ca42eada706d4b9635bd32d1e09ced Mon Sep 17 00:00:00 2001 From: Philippe Bergheaud Date: Thu, 31 Mar 2016 11:19:28 +0200 Subject: cxl: Configure the PSL for two CAPI ports on POWER8NVL The POWER8NVL chip has two CAPI ports. Configure the PSL to route data to the port corresponding to the CAPP unit. Signed-off-by: Philippe Bergheaud Signed-off-by: Michael Ellerman diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 2844e97..94fd3f7 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "cxl.h" #include @@ -321,12 +322,43 @@ static void dump_afu_descriptor(struct cxl_afu *afu) #undef show_reg } +#define CAPP_UNIT0_ID 0xBA +#define CAPP_UNIT1_ID 0XBE + +static u64 get_capp_unit_id(struct device_node *np) +{ + u32 phb_index; + + /* + * For chips other than POWER8NVL, we only have CAPP 0, + * irrespective of which PHB is used. + */ + if (!pvr_version_is(PVR_POWER8NVL)) + return CAPP_UNIT0_ID; + + /* + * For POWER8NVL, assume CAPP 0 is attached to PHB0 and + * CAPP 1 is attached to PHB1. + */ + if (of_property_read_u32(np, "ibm,phb-index", &phb_index)) + return 0; + + if (phb_index == 0) + return CAPP_UNIT0_ID; + + if (phb_index == 1) + return CAPP_UNIT1_ID; + + return 0; +} + static int init_implementation_adapter_regs(struct cxl *adapter, struct pci_dev *dev) { struct device_node *np; const __be32 *prop; u64 psl_dsnctl; u64 chipid; + u64 capp_unit_id; if (!(np = pnv_pci_get_phb_node(dev))) return -ENODEV; @@ -336,10 +368,17 @@ static int init_implementation_adapter_regs(struct cxl *adapter, struct pci_dev if (!np) return -ENODEV; chipid = be32_to_cpup(prop); + capp_unit_id = get_capp_unit_id(np); of_node_put(np); + if (!capp_unit_id) { + pr_err("cxl: invalid capp unit id\n"); + return -ENODEV; + } /* Tell PSL where to route data to */ - psl_dsnctl = 0x02E8900002000000ULL | (chipid << (63-5)); + psl_dsnctl = 0x0000900002000000ULL | (chipid << (63-5)); + psl_dsnctl |= (capp_unit_id << (63-13)); + cxl_p1_write(adapter, CXL_PSL_DSNDCTL, psl_dsnctl); cxl_p1_write(adapter, CXL_PSL_RESLCKTO, 0x20000000200ULL); /* snoop write mask */ -- cgit v0.10.2 From 635218c785bef355bc8266a1fdb28f38cdca365d Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Wed, 6 Jan 2016 11:45:50 +1100 Subject: powerpc: sparse: static-ify some things As sparse suggests, these should be made static. Signed-off-by: Daniel Axtens Reviewed-by: Andrew Donnellan Reviewed-by: Stewart Smith Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index 4eefb6e..82e7327 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -36,7 +36,7 @@ static DEFINE_SPINLOCK(eeh_eventlist_lock); static struct semaphore eeh_eventlist_sem; -LIST_HEAD(eeh_eventlist); +static LIST_HEAD(eeh_eventlist); /** * eeh_event_handler - Dispatch EEH events. diff --git a/arch/powerpc/kernel/ibmebus.c b/arch/powerpc/kernel/ibmebus.c index ac86c53..a89f4f7 100644 --- a/arch/powerpc/kernel/ibmebus.c +++ b/arch/powerpc/kernel/ibmebus.c @@ -408,7 +408,7 @@ static ssize_t modalias_show(struct device *dev, return len+1; } -struct device_attribute ibmebus_bus_device_attrs[] = { +static struct device_attribute ibmebus_bus_device_attrs[] = { __ATTR_RO(devspec), __ATTR_RO(name), __ATTR_RO(modalias), diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index b2eb468..3513822 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -37,7 +37,7 @@ static DEFINE_PER_CPU(int, mce_queue_count); static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); static void machine_check_process_queued_event(struct irq_work *work); -struct irq_work mce_event_process_work = { +static struct irq_work mce_event_process_work = { .func = machine_check_process_queued_event, }; diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index aa610ce..c638e24 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -442,7 +442,7 @@ static void do_event_scan(void) } static void rtas_event_scan(struct work_struct *w); -DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan); +static DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan); /* * Delay should be at least one second since some machines have problems if diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 5f8dcda..8d7358f 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -87,7 +87,7 @@ struct vio_cmo_dev_entry { * @curr: bytes currently allocated * @high: high water mark for IO data usage */ -struct vio_cmo { +static struct vio_cmo { spinlock_t lock; struct delayed_work balance_q; struct list_head device_list; @@ -615,7 +615,7 @@ static u64 vio_dma_get_required_mask(struct device *dev) return dma_iommu_ops.get_required_mask(dev); } -struct dma_map_ops vio_dma_mapping_ops = { +static struct dma_map_ops vio_dma_mapping_ops = { .alloc = vio_dma_iommu_alloc_coherent, .free = vio_dma_iommu_free_coherent, .mmap = dma_direct_mmap_coherent, -- cgit v0.10.2 From 7f92bc5694557dee4cefa90df27feec16c7b62da Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Wed, 6 Jan 2016 11:45:51 +1100 Subject: powerpc: sparse: Include headers for __weak symbols Sometimes when sparse warns about undefined symbols, it isn't because they should have 'static' added, it's because they're overriding __weak symbols defined elsewhere, and the header has been missed. Fix a few of them by adding appropriate headers. Signed-off-by: Daniel Axtens Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index b8500b4..4695088 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -55,6 +56,7 @@ #include #endif #include +#include #include #include diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 7030b03..1b082c7 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 81b0900..3ed9a5a 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index 4087705..30611f8 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -26,6 +26,7 @@ #include #include #include +#include /* * Top of mmap area (just below the process stack). -- cgit v0.10.2 From c796d1d97c3035cf54d4d5a9e75abd094db80e76 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Sun, 10 Apr 2016 22:53:47 +0300 Subject: drivers: macintosh: rack-meter: limit idle ticks to total ticks Limit idle ticks to total ticks. This prevents the annoying rackmeter leds fully ON / OFF blinking state that happens on fully idling G5 Xserve systems. Signed-off-by: Aaro Koskinen Signed-off-by: Michael Ellerman diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c index caaec65..ba7e330 100644 --- a/drivers/macintosh/rack-meter.c +++ b/drivers/macintosh/rack-meter.c @@ -227,6 +227,7 @@ static void rackmeter_do_timer(struct work_struct *work) total_idle_ticks = get_cpu_idle_time(cpu); idle_ticks = (unsigned int) (total_idle_ticks - rcpu->prev_idle); + idle_ticks = min(idle_ticks, total_ticks); rcpu->prev_idle = total_idle_ticks; /* We do a very dumb calculation to update the LEDs for now, -- cgit v0.10.2 From 4f7bef7a9f69b1c6fe44b9f249b49056288475a4 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Sun, 10 Apr 2016 22:53:48 +0300 Subject: drivers: macintosh: rack-meter: fix bogus memsets Fix bogus memsets pointed out by sparse: linux-v4.3/drivers/macintosh/rack-meter.c:157:15: warning: memset with byte count of 0 linux-v4.3/drivers/macintosh/rack-meter.c:158:15: warning: memset with byte count of 0 Probably "&" is mistyped "*"; use ARRAY_SIZE to make it more safe. Signed-off-by: Aaro Koskinen Signed-off-by: Michael Ellerman diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c index ba7e330..465c522 100644 --- a/drivers/macintosh/rack-meter.c +++ b/drivers/macintosh/rack-meter.c @@ -154,8 +154,8 @@ static void rackmeter_do_pause(struct rackmeter *rm, int pause) DBDMA_DO_STOP(rm->dma_regs); return; } - memset(rdma->buf1, 0, SAMPLE_COUNT & sizeof(u32)); - memset(rdma->buf2, 0, SAMPLE_COUNT & sizeof(u32)); + memset(rdma->buf1, 0, ARRAY_SIZE(rdma->buf1)); + memset(rdma->buf2, 0, ARRAY_SIZE(rdma->buf2)); rm->dma_buf_v->mark = 0; -- cgit v0.10.2 From 1050e689a63baffdadcd33498c15d859922504c0 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Fri, 6 Nov 2015 11:00:23 +0100 Subject: cxl: Delete an unnecessary check before the function call "kfree" The kfree() function tests whether its argument is NULL and then returns immediately. Thus the test around the call is not needed. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Reviewed-by: Andrew Donnellan Acked-by: Ian Munsie Signed-off-by: Michael Ellerman diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index 10370f2..80203c9 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -290,8 +290,7 @@ static void reclaim_ctx(struct rcu_head *rcu) if (ctx->kernelapi) kfree(ctx->mapping); - if (ctx->irq_bitmap) - kfree(ctx->irq_bitmap); + kfree(ctx->irq_bitmap); /* Drop ref to the afu device taken during cxl_context_init */ cxl_afu_put(ctx->afu); -- cgit v0.10.2 From 8ed8ab40047a570fdd8043a40c104a57248dd3fd Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Fri, 15 Apr 2016 22:48:02 +1000 Subject: powerpc/book3s64: Fix branching to OOL handlers in relocatable kernel Some of the interrupt vectors on 64-bit POWER server processors are only 32 bytes long (8 instructions), which is not enough for the full first-level interrupt handler. For these we need to branch to an out-of-line (OOL) handler. But when we are running a relocatable kernel, interrupt vectors till __end_interrupts marker are copied down to real address 0x100. So, branching to labels (ie. OOL handlers) outside this section must be handled differently (see LOAD_HANDLER()), considering relocatable kernel, which would need at least 4 instructions. However, branching from interrupt vector means that we corrupt the CFAR (come-from address register) on POWER7 and later processors as mentioned in commit 1707dd16. So, EXCEPTION_PROLOG_0 (6 instructions) that contains the part up to the point where the CFAR is saved in the PACA should be part of the short interrupt vectors before we branch out to OOL handlers. But as mentioned already, there are interrupt vectors on 64-bit POWER server processors that are only 32 bytes long (like vectors 0x4f00, 0x4f20, etc.), which cannot accomodate the above two cases at the same time owing to space constraint. Currently, in these interrupt vectors, we simply branch out to OOL handlers, without using LOAD_HANDLER(), which leaves us vulnerable when running a relocatable kernel (eg. kdump case). While this has been the case for sometime now and kdump is used widely, we were fortunate not to see any problems so far, for three reasons: 1. In almost all cases, production kernel (relocatable) is used for kdump as well, which would mean that crashed kernel's OOL handler would be at the same place where we end up branching to, from short interrupt vector of kdump kernel. 2. Also, OOL handler was unlikely the reason for crash in almost all the kdump scenarios, which meant we had a sane OOL handler from crashed kernel that we branched to. 3. On most 64-bit POWER server processors, page size is large enough that marking interrupt vector code as executable (see commit 429d2e83) leads to marking OOL handler code from crashed kernel, that sits right below interrupt vector code from kdump kernel, as executable as well. Let us fix this by moving the __end_interrupts marker down past OOL handlers to make sure that we also copy OOL handlers to real address 0x100 when running a relocatable kernel. This fix has been tested successfully in kdump scenario, on an LPAR with 4K page size by using different default/production kernel and kdump kernel. Also tested by manually corrupting the OOL handlers in the first kernel and then kdump'ing, and then causing the OOL handlers to fire - mpe. Fixes: c1fb6816fb1b ("powerpc: Add relocation on exception vector handlers") Cc: stable@vger.kernel.org Signed-off-by: Hari Bathini Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index e0e1ff4..b952173 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -915,11 +915,6 @@ hv_facility_unavailable_relon_trampoline: #endif STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist) - /* Other future vectors */ - .align 7 - .globl __end_interrupts -__end_interrupts: - .align 7 system_call_entry: b system_call_common @@ -1142,6 +1137,17 @@ __end_handlers: STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable) + /* + * The __end_interrupts marker must be past the out-of-line (OOL) + * handlers, so that they are copied to real address 0x100 when running + * a relocatable kernel. This ensures they can be reached from the short + * trampoline handlers (like 0x4f00, 0x4f20, etc.) which branch + * directly, without using LOAD_HANDLER(). + */ + .align 7 + .globl __end_interrupts +__end_interrupts: + #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* * Data area reserved for FWNMI option. -- cgit v0.10.2 From 057b6d7e62ea4e6b1809e2946929d0d586cad142 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Fri, 8 Apr 2016 03:30:34 +0530 Subject: powerpc/book3s64: Remove __end_handlers marker The __end_handlers marker was intended to mark down upto code that gets called from exception prologs. But that hasn't kept pace with code changes. Case in point, slb_miss_realmode being called from exception prolog code but isn't below __end_handlers marker. So, __end_handlers marker is as good as a comment but could be misleading at times if it isn't in sync with the code, as is the case now. So, let us avoid this confusion by having a better comment and removing __end_handlers marker altogether. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index b952173..52d2d74 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -726,11 +726,10 @@ kvmppc_skip_Hinterrupt: #endif /* - * Code from here down to __end_handlers is invoked from the - * exception prologs above. Because the prologs assemble the - * addresses of these handlers using the LOAD_HANDLER macro, - * which uses an ori instruction, these handlers must be in - * the first 64k of the kernel image. + * Ensure that any handlers that get invoked from the exception prologs + * above are below the first 64KB (0x10000) of the kernel image because + * the prologs assemble the addresses of these handlers using the + * LOAD_HANDLER macro, which uses an ori instruction. */ /*** Common interrupt handlers ***/ @@ -1123,10 +1122,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception) STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception) - .align 7 - .globl __end_handlers -__end_handlers: - /* Equivalents to the above handlers for relocation-on interrupt vectors */ STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) -- cgit v0.10.2 From 1bfadabfebc671a6af0f5008b524b681ce757dec Mon Sep 17 00:00:00 2001 From: Anju T Date: Sat, 20 Feb 2016 10:32:45 +0530 Subject: powerpc/perf: Assign an id to each powerpc register The enum definition assigns an 'id' to each register in "struct pt_regs" of arch/powerpc. The order of these values in the enum definition are based on the order of members in pt_regs. Signed-off-by: Anju T [mpe: Rename LNK to LINK, use _UAPI_ASM for include guards] Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/uapi/asm/perf_regs.h b/arch/powerpc/include/uapi/asm/perf_regs.h new file mode 100644 index 0000000..6a93209 --- /dev/null +++ b/arch/powerpc/include/uapi/asm/perf_regs.h @@ -0,0 +1,50 @@ +#ifndef _UAPI_ASM_POWERPC_PERF_REGS_H +#define _UAPI_ASM_POWERPC_PERF_REGS_H + +enum perf_event_powerpc_regs { + PERF_REG_POWERPC_R0, + PERF_REG_POWERPC_R1, + PERF_REG_POWERPC_R2, + PERF_REG_POWERPC_R3, + PERF_REG_POWERPC_R4, + PERF_REG_POWERPC_R5, + PERF_REG_POWERPC_R6, + PERF_REG_POWERPC_R7, + PERF_REG_POWERPC_R8, + PERF_REG_POWERPC_R9, + PERF_REG_POWERPC_R10, + PERF_REG_POWERPC_R11, + PERF_REG_POWERPC_R12, + PERF_REG_POWERPC_R13, + PERF_REG_POWERPC_R14, + PERF_REG_POWERPC_R15, + PERF_REG_POWERPC_R16, + PERF_REG_POWERPC_R17, + PERF_REG_POWERPC_R18, + PERF_REG_POWERPC_R19, + PERF_REG_POWERPC_R20, + PERF_REG_POWERPC_R21, + PERF_REG_POWERPC_R22, + PERF_REG_POWERPC_R23, + PERF_REG_POWERPC_R24, + PERF_REG_POWERPC_R25, + PERF_REG_POWERPC_R26, + PERF_REG_POWERPC_R27, + PERF_REG_POWERPC_R28, + PERF_REG_POWERPC_R29, + PERF_REG_POWERPC_R30, + PERF_REG_POWERPC_R31, + PERF_REG_POWERPC_NIP, + PERF_REG_POWERPC_MSR, + PERF_REG_POWERPC_ORIG_R3, + PERF_REG_POWERPC_CTR, + PERF_REG_POWERPC_LINK, + PERF_REG_POWERPC_XER, + PERF_REG_POWERPC_CCR, + PERF_REG_POWERPC_SOFTE, + PERF_REG_POWERPC_TRAP, + PERF_REG_POWERPC_DAR, + PERF_REG_POWERPC_DSISR, + PERF_REG_POWERPC_MAX, +}; +#endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */ -- cgit v0.10.2 From ed4a4ef85cf5b75231079501030db28dd59f920a Mon Sep 17 00:00:00 2001 From: Anju T Date: Sat, 20 Feb 2016 10:32:46 +0530 Subject: powerpc/perf: Add support for sampling interrupt register state The perf infrastructure uses a bit mask to find out valid registers to display. Define a register mask for supported registers defined in uapi/asm/perf_regs.h. The bit positions also correspond to register IDs which is used by perf infrastructure to fetch the register values. CONFIG_HAVE_PERF_REGS enables sampling of the interrupted machine state. Signed-off-by: Anju T [mpe: Add license, use CONFIG_PPC64, fix 32-bit build] Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 9b5c7bc..b14966d 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -116,6 +116,7 @@ config PPC select GENERIC_ATOMIC64 if PPC32 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select HAVE_PERF_EVENTS + select HAVE_PERF_REGS select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64 select ARCH_WANT_IPC_PARSE_VERSION diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile index f9c083a..77b6394 100644 --- a/arch/powerpc/perf/Makefile +++ b/arch/powerpc/perf/Makefile @@ -1,6 +1,6 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror -obj-$(CONFIG_PERF_EVENTS) += callchain.o +obj-$(CONFIG_PERF_EVENTS) += callchain.o perf_regs.o obj-$(CONFIG_PPC_PERF_CTRS) += core-book3s.o bhrb.o obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \ diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c new file mode 100644 index 0000000..d24a8a3 --- /dev/null +++ b/arch/powerpc/perf/perf_regs.c @@ -0,0 +1,104 @@ +/* + * Copyright 2016 Anju T, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r) + +#define REG_RESERVED (~((1ULL << PERF_REG_POWERPC_MAX) - 1)) + +static unsigned int pt_regs_offset[PERF_REG_POWERPC_MAX] = { + PT_REGS_OFFSET(PERF_REG_POWERPC_R0, gpr[0]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R1, gpr[1]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R2, gpr[2]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R3, gpr[3]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R4, gpr[4]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R5, gpr[5]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R6, gpr[6]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R7, gpr[7]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R8, gpr[8]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R9, gpr[9]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R10, gpr[10]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R11, gpr[11]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R12, gpr[12]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R13, gpr[13]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R14, gpr[14]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R15, gpr[15]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R16, gpr[16]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R17, gpr[17]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R18, gpr[18]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R19, gpr[19]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R20, gpr[20]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R21, gpr[21]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R22, gpr[22]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R23, gpr[23]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R24, gpr[24]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R25, gpr[25]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R26, gpr[26]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R27, gpr[27]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R28, gpr[28]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R29, gpr[29]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R30, gpr[30]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R31, gpr[31]), + PT_REGS_OFFSET(PERF_REG_POWERPC_NIP, nip), + PT_REGS_OFFSET(PERF_REG_POWERPC_MSR, msr), + PT_REGS_OFFSET(PERF_REG_POWERPC_ORIG_R3, orig_gpr3), + PT_REGS_OFFSET(PERF_REG_POWERPC_CTR, ctr), + PT_REGS_OFFSET(PERF_REG_POWERPC_LINK, link), + PT_REGS_OFFSET(PERF_REG_POWERPC_XER, xer), + PT_REGS_OFFSET(PERF_REG_POWERPC_CCR, ccr), +#ifdef CONFIG_PPC64 + PT_REGS_OFFSET(PERF_REG_POWERPC_SOFTE, softe), +#else + PT_REGS_OFFSET(PERF_REG_POWERPC_SOFTE, mq), +#endif + PT_REGS_OFFSET(PERF_REG_POWERPC_TRAP, trap), + PT_REGS_OFFSET(PERF_REG_POWERPC_DAR, dar), + PT_REGS_OFFSET(PERF_REG_POWERPC_DSISR, dsisr), +}; + +u64 perf_reg_value(struct pt_regs *regs, int idx) +{ + if (WARN_ON_ONCE(idx >= PERF_REG_POWERPC_MAX)) + return 0; + + return regs_get_register(regs, pt_regs_offset[idx]); +} + +int perf_reg_validate(u64 mask) +{ + if (!mask || mask & REG_RESERVED) + return -EINVAL; + return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ +#ifdef CONFIG_PPC64 + if (!test_tsk_thread_flag(task, TIF_32BIT)) + return PERF_SAMPLE_REGS_ABI_64; + else +#endif + return PERF_SAMPLE_REGS_ABI_32; +} + +void perf_get_regs_user(struct perf_regs *regs_user, + struct pt_regs *regs, + struct pt_regs *regs_user_copy) +{ + regs_user->regs = task_pt_regs(current); + regs_user->abi = perf_reg_abi(current); +} -- cgit v0.10.2 From dc642e8388b63a9a221903584bb6b562e5055d0d Mon Sep 17 00:00:00 2001 From: Anju T Date: Sat, 20 Feb 2016 10:32:47 +0530 Subject: tools/perf: Map the ID values with register names Map ID values with corresponding register names. These names are then displayed when user issues perf record with the -I option followed by perf report/script with -D option. To test this patchset, Eg: $ perf record -I ls # record machine state at interrupt $ perf script -D # read the perf.data file Sample output obtained for this patch / output looks like as follows: 496768515470 0x1988 [0x188]: PERF_RECORD_SAMPLE(IP, 0x1): 4522/4522: 0xc0000000001e538c period: 1 addr: 0 ... intr regs: mask 0x7ffffffffff ABI 64-bit .... r0 0xc0000000001e5e34 .... r1 0xc000000fe733f9a0 .... r2 0xc000000001523100 .... r3 0xc000000ffaadeb60 .... r4 0xc000000003456800 .... r5 0x73a9b5e000 .... r6 0x1e000000 .... r7 0x0 .... r8 0x0 .... r9 0x0 .... r10 0x1 .... r11 0x0 .... r12 0x24022822 .... r13 0xc00000000feec180 .... r14 0x0 .... r15 0xc000001e4be18800 .... r16 0x0 .... r17 0xc000000ffaac5000 .... r18 0xc000000fe733f8a0 .... r19 0xc000000001523100 .... r20 0xc00000000009fd1c .... r21 0xc000000fcaa69000 .... r22 0xc0000000001e4968 .... r23 0xc000000001523100 .... r24 0xc000000fe733f850 .... r25 0xc000000fcaa69000 .... r26 0xc000000003b8fcf0 .... r27 0xfffffffffffffead .... r28 0x0 .... r29 0xc000000fcaa69000 .... r30 0x1 .... r31 0x0 .... nip 0xc0000000001dd320 .... msr 0x9000000000009032 .... orig_r3 0xc0000000001e538c .... ctr 0xc00000000009d550 .... link 0xc0000000001e5e34 .... xer 0x0 .... ccr 0x84022882 .... softe 0x0 .... trap 0xf01 .... dar 0x0 .... dsisr 0xf00040060000004 ... thread: :4522:4522 ...... dso: /root/.debug/.build-id/b0/ef11b1a1629e62ac9de75199117ee5ef9469e9 :4522 4522 496.768515: 1 cycles: c0000000001e538c .perf_event_context_sched_in (/boot/vmlinux) Signed-off-by: Anju T Acked-by: Arnaldo Carvalho de Melo Signed-off-by: Michael Ellerman diff --git a/tools/perf/arch/powerpc/include/perf_regs.h b/tools/perf/arch/powerpc/include/perf_regs.h new file mode 100644 index 0000000..75de0e9 --- /dev/null +++ b/tools/perf/arch/powerpc/include/perf_regs.h @@ -0,0 +1,69 @@ +#ifndef ARCH_PERF_REGS_H +#define ARCH_PERF_REGS_H + +#include +#include +#include + +#define PERF_REGS_MASK ((1ULL << PERF_REG_POWERPC_MAX) - 1) +#define PERF_REGS_MAX PERF_REG_POWERPC_MAX +#ifdef __powerpc64__ + #define PERF_SAMPLE_REGS_ABI PERF_SAMPLE_REGS_ABI_64 +#else + #define PERF_SAMPLE_REGS_ABI PERF_SAMPLE_REGS_ABI_32 +#endif + +#define PERF_REG_IP PERF_REG_POWERPC_NIP +#define PERF_REG_SP PERF_REG_POWERPC_R1 + +static const char *reg_names[] = { + [PERF_REG_POWERPC_R0] = "r0", + [PERF_REG_POWERPC_R1] = "r1", + [PERF_REG_POWERPC_R2] = "r2", + [PERF_REG_POWERPC_R3] = "r3", + [PERF_REG_POWERPC_R4] = "r4", + [PERF_REG_POWERPC_R5] = "r5", + [PERF_REG_POWERPC_R6] = "r6", + [PERF_REG_POWERPC_R7] = "r7", + [PERF_REG_POWERPC_R8] = "r8", + [PERF_REG_POWERPC_R9] = "r9", + [PERF_REG_POWERPC_R10] = "r10", + [PERF_REG_POWERPC_R11] = "r11", + [PERF_REG_POWERPC_R12] = "r12", + [PERF_REG_POWERPC_R13] = "r13", + [PERF_REG_POWERPC_R14] = "r14", + [PERF_REG_POWERPC_R15] = "r15", + [PERF_REG_POWERPC_R16] = "r16", + [PERF_REG_POWERPC_R17] = "r17", + [PERF_REG_POWERPC_R18] = "r18", + [PERF_REG_POWERPC_R19] = "r19", + [PERF_REG_POWERPC_R20] = "r20", + [PERF_REG_POWERPC_R21] = "r21", + [PERF_REG_POWERPC_R22] = "r22", + [PERF_REG_POWERPC_R23] = "r23", + [PERF_REG_POWERPC_R24] = "r24", + [PERF_REG_POWERPC_R25] = "r25", + [PERF_REG_POWERPC_R26] = "r26", + [PERF_REG_POWERPC_R27] = "r27", + [PERF_REG_POWERPC_R28] = "r28", + [PERF_REG_POWERPC_R29] = "r29", + [PERF_REG_POWERPC_R30] = "r30", + [PERF_REG_POWERPC_R31] = "r31", + [PERF_REG_POWERPC_NIP] = "nip", + [PERF_REG_POWERPC_MSR] = "msr", + [PERF_REG_POWERPC_ORIG_R3] = "orig_r3", + [PERF_REG_POWERPC_CTR] = "ctr", + [PERF_REG_POWERPC_LINK] = "link", + [PERF_REG_POWERPC_XER] = "xer", + [PERF_REG_POWERPC_CCR] = "ccr", + [PERF_REG_POWERPC_SOFTE] = "softe", + [PERF_REG_POWERPC_TRAP] = "trap", + [PERF_REG_POWERPC_DAR] = "dar", + [PERF_REG_POWERPC_DSISR] = "dsisr" +}; + +static inline const char *perf_reg_name(int id) +{ + return reg_names[id]; +} +#endif /* ARCH_PERF_REGS_H */ diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index f7d7f5a..eda1cfb 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -23,6 +23,11 @@ $(call detected_var,ARCH) NO_PERF_REGS := 1 +# Additional ARCH settings for ppc +ifeq ($(ARCH),powerpc) + NO_PERF_REGS := 0 +endif + # Additional ARCH settings for x86 ifeq ($(ARCH),x86) $(call detected,CONFIG_X86) -- cgit v0.10.2 From bb62bad623deb7952aef13fcbe065fde5d1f03e5 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Sat, 20 Feb 2016 10:32:48 +0530 Subject: tool/perf: Add sample_reg_mask to include all perf_regs Add sample_reg_mask array with pt_regs registers. This is needed for printing supported regs ( -I? option). Signed-off-by: Madhavan Srinivasan Acked-by: Arnaldo Carvalho de Melo Signed-off-by: Michael Ellerman diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build index c8fe207..9ee3350 100644 --- a/tools/perf/arch/powerpc/util/Build +++ b/tools/perf/arch/powerpc/util/Build @@ -1,6 +1,7 @@ libperf-y += header.o libperf-y += sym-handling.o libperf-y += kvm-stat.o +libperf-y += perf_regs.o libperf-$(CONFIG_DWARF) += dwarf-regs.o libperf-$(CONFIG_DWARF) += skip-callchain-idx.o diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c new file mode 100644 index 0000000..a3c3e1c --- /dev/null +++ b/tools/perf/arch/powerpc/util/perf_regs.c @@ -0,0 +1,49 @@ +#include "../../perf.h" +#include "../../util/perf_regs.h" + +const struct sample_reg sample_reg_masks[] = { + SMPL_REG(r0, PERF_REG_POWERPC_R0), + SMPL_REG(r1, PERF_REG_POWERPC_R1), + SMPL_REG(r2, PERF_REG_POWERPC_R2), + SMPL_REG(r3, PERF_REG_POWERPC_R3), + SMPL_REG(r4, PERF_REG_POWERPC_R4), + SMPL_REG(r5, PERF_REG_POWERPC_R5), + SMPL_REG(r6, PERF_REG_POWERPC_R6), + SMPL_REG(r7, PERF_REG_POWERPC_R7), + SMPL_REG(r8, PERF_REG_POWERPC_R8), + SMPL_REG(r9, PERF_REG_POWERPC_R9), + SMPL_REG(r10, PERF_REG_POWERPC_R10), + SMPL_REG(r11, PERF_REG_POWERPC_R11), + SMPL_REG(r12, PERF_REG_POWERPC_R12), + SMPL_REG(r13, PERF_REG_POWERPC_R13), + SMPL_REG(r14, PERF_REG_POWERPC_R14), + SMPL_REG(r15, PERF_REG_POWERPC_R15), + SMPL_REG(r16, PERF_REG_POWERPC_R16), + SMPL_REG(r17, PERF_REG_POWERPC_R17), + SMPL_REG(r18, PERF_REG_POWERPC_R18), + SMPL_REG(r19, PERF_REG_POWERPC_R19), + SMPL_REG(r20, PERF_REG_POWERPC_R20), + SMPL_REG(r21, PERF_REG_POWERPC_R21), + SMPL_REG(r22, PERF_REG_POWERPC_R22), + SMPL_REG(r23, PERF_REG_POWERPC_R23), + SMPL_REG(r24, PERF_REG_POWERPC_R24), + SMPL_REG(r25, PERF_REG_POWERPC_R25), + SMPL_REG(r26, PERF_REG_POWERPC_R26), + SMPL_REG(r27, PERF_REG_POWERPC_R27), + SMPL_REG(r28, PERF_REG_POWERPC_R28), + SMPL_REG(r29, PERF_REG_POWERPC_R29), + SMPL_REG(r30, PERF_REG_POWERPC_R30), + SMPL_REG(r31, PERF_REG_POWERPC_R31), + SMPL_REG(nip, PERF_REG_POWERPC_NIP), + SMPL_REG(msr, PERF_REG_POWERPC_MSR), + SMPL_REG(orig_r3, PERF_REG_POWERPC_ORIG_R3), + SMPL_REG(ctr, PERF_REG_POWERPC_CTR), + SMPL_REG(link, PERF_REG_POWERPC_LINK), + SMPL_REG(xer, PERF_REG_POWERPC_XER), + SMPL_REG(ccr, PERF_REG_POWERPC_CCR), + SMPL_REG(softe, PERF_REG_POWERPC_SOFTE), + SMPL_REG(trap, PERF_REG_POWERPC_TRAP), + SMPL_REG(dar, PERF_REG_POWERPC_DAR), + SMPL_REG(dsisr, PERF_REG_POWERPC_DSISR), + SMPL_REG_END +}; -- cgit v0.10.2 From e009a7e858fed215cb4eed5174a31cadd42d8797 Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Mon, 21 Mar 2016 14:32:48 -0500 Subject: cxl: Allow initialization on timebase sync failures Failure to synchronize the PSL timebase currently prevents the initialization of the cxl card, thus rendering the card useless. This is too extreme for a feature which is rarely used, if at all. No hardware AFUs or software is currently using PSL timebase. This patch still tries to synchronize the PSL timebase when the card is initialized, but ignores the error if it can't. Instead, it reports a status via /sys. Signed-off-by: Frederic Barrat Acked-by: Ian Munsie Signed-off-by: Michael Ellerman diff --git a/Documentation/ABI/testing/sysfs-class-cxl b/Documentation/ABI/testing/sysfs-class-cxl index 7fd737e..4ba0a2a 100644 --- a/Documentation/ABI/testing/sysfs-class-cxl +++ b/Documentation/ABI/testing/sysfs-class-cxl @@ -233,3 +233,11 @@ Description: read/write 0 = don't trust, the image may be different (default) 1 = trust that the image will not change. Users: https://github.com/ibm-capi/libcxl + +What: /sys/class/cxl//psl_timebase_synced +Date: March 2016 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read only + Returns 1 if the psl timebase register is synchronized + with the core timebase register, 0 otherwise. +Users: https://github.com/ibm-capi/libcxl diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index 38e21cf..dfdbfb0 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -579,6 +579,7 @@ struct cxl { bool perst_loads_image; bool perst_select_user; bool perst_same_image; + bool psl_timebase_synced; }; int cxl_pci_alloc_one_irq(struct cxl *adapter); diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c index 8213372..a83acf9 100644 --- a/drivers/misc/cxl/guest.c +++ b/drivers/misc/cxl/guest.c @@ -1101,6 +1101,12 @@ struct cxl *cxl_guest_init_adapter(struct device_node *np, struct platform_devic adapter->dev.release = release_adapter; dev_set_drvdata(&pdev->dev, adapter); + /* + * Hypervisor controls PSL timebase initialization (p1 register). + * On FW840, PSL is initialized. + */ + adapter->psl_timebase_synced = true; + if ((rc = cxl_of_read_adapter_handle(adapter, np))) goto err1; diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 94fd3f7..c6d5cf5 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -394,22 +394,24 @@ static int init_implementation_adapter_regs(struct cxl *adapter, struct pci_dev #define TBSYNC_CNT(n) (((u64)n & 0x7) << (63-6)) #define _2048_250MHZ_CYCLES 1 -static int cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev) +static void cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev) { u64 psl_tb; int delta; unsigned int retry = 0; struct device_node *np; + adapter->psl_timebase_synced = false; + if (!(np = pnv_pci_get_phb_node(dev))) - return -ENODEV; + return; /* Do not fail when CAPP timebase sync is not supported by OPAL */ of_node_get(np); if (! of_get_property(np, "ibm,capp-timebase-sync", NULL)) { of_node_put(np); - pr_err("PSL: Timebase sync: OPAL support missing\n"); - return 0; + dev_info(&dev->dev, "PSL timebase inactive: OPAL support missing\n"); + return; } of_node_put(np); @@ -428,8 +430,8 @@ static int cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev) do { msleep(1); if (retry++ > 5) { - pr_err("PSL: Timebase sync: giving up!\n"); - return -EIO; + dev_info(&dev->dev, "PSL timebase can't synchronize\n"); + return; } psl_tb = cxl_p1_read(adapter, CXL_PSL_Timebase); delta = mftb() - psl_tb; @@ -437,7 +439,8 @@ static int cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev) delta = -delta; } while (tb_to_ns(delta) > 16000); - return 0; + adapter->psl_timebase_synced = true; + return; } static int init_implementation_afu_regs(struct cxl_afu *afu) @@ -1183,8 +1186,8 @@ static int cxl_configure_adapter(struct cxl *adapter, struct pci_dev *dev) if ((rc = pnv_phb_to_cxl_mode(dev, OPAL_PHB_CAPI_MODE_SNOOP_ON))) goto err; - if ((rc = cxl_setup_psl_timebase(adapter, dev))) - goto err; + /* Ignore error, adapter init is not dependant on timebase sync */ + cxl_setup_psl_timebase(adapter, dev); if ((rc = cxl_native_register_psl_err_irq(adapter))) goto err; diff --git a/drivers/misc/cxl/sysfs.c b/drivers/misc/cxl/sysfs.c index 25913c0..b043c20 100644 --- a/drivers/misc/cxl/sysfs.c +++ b/drivers/misc/cxl/sysfs.c @@ -57,6 +57,15 @@ static ssize_t image_loaded_show(struct device *device, return scnprintf(buf, PAGE_SIZE, "factory\n"); } +static ssize_t psl_timebase_synced_show(struct device *device, + struct device_attribute *attr, + char *buf) +{ + struct cxl *adapter = to_cxl_adapter(device); + + return scnprintf(buf, PAGE_SIZE, "%i\n", adapter->psl_timebase_synced); +} + static ssize_t reset_adapter_store(struct device *device, struct device_attribute *attr, const char *buf, size_t count) @@ -142,6 +151,7 @@ static struct device_attribute adapter_attrs[] = { __ATTR_RO(psl_revision), __ATTR_RO(base_image), __ATTR_RO(image_loaded), + __ATTR_RO(psl_timebase_synced), __ATTR_RW(load_image_on_perst), __ATTR_RW(perst_reloads_same_image), __ATTR(reset, S_IWUSR, NULL, reset_adapter_store), -- cgit v0.10.2 From 4aec6ec0da9c72c0fa1a5b0d1133707481347bb3 Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Tue, 19 Apr 2016 18:34:24 +0200 Subject: cxl: Increase timeout for detection of AFU mmio hang PSL designers recommend a larger value for the mmio hang pulse, 256 us instead of 1 us. The CAIA architecture states that it needs to be smaller than 1/2 of the RTOS timeout set in the PHB for outbound non-posted transactions, which is still (easily) the case here. Signed-off-by: Frederic Barrat Acked-by: Ian Munsie Tested-by: Frank Haverkamp Tested-by: Manoj Kumar Signed-off-by: Michael Ellerman diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index c6d5cf5..a08fcc8 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -375,8 +375,10 @@ static int init_implementation_adapter_regs(struct cxl *adapter, struct pci_dev return -ENODEV; } + psl_dsnctl = 0x0000900000000000ULL; /* pteupd ttype, scdone */ + psl_dsnctl |= (0x2ULL << (63-38)); /* MMIO hang pulse: 256 us */ /* Tell PSL where to route data to */ - psl_dsnctl = 0x0000900002000000ULL | (chipid << (63-5)); + psl_dsnctl |= (chipid << (63-5)); psl_dsnctl |= (capp_unit_id << (63-13)); cxl_p1_write(adapter, CXL_PSL_DSNDCTL, psl_dsnctl); -- cgit v0.10.2 From 3b1dbfa14f97188ec33fdfc7acb66bea59a3bb21 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 20 Apr 2016 03:59:47 -0400 Subject: cxl: Fix DAR check & use REGION_ID instead of opencoding The current code will set _PAGE_USER to the access flags for any fault address, because the ~ operation will be true for all address we take a fault on. But setting _PAGE_USER also means that the fault will be handled only if the page table have _PAGE_USER set. Hence there is no security hole with the current code. Now if it is an user space access, then the change in this patch really don't have an impact because we have (!ctx->kernel) set true and we take the if condition true. Now kernel context created fault on an address in the kernel range will result in a fault loop because we will not insert the hash pte due to access and pte permission mismatch. This patch fix the above issue. Fixes: f204e0b8cedd ("cxl: Driver code for powernv PCIe based cards for userspace access") Reviewed-by: Andrew Donnellan Acked-by: Ian Munsie Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index 9a8650b..9a23654 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -152,7 +152,7 @@ static void cxl_handle_page_fault(struct cxl_context *ctx, access = _PAGE_PRESENT; if (dsisr & CXL_PSL_DSISR_An_S) access |= _PAGE_RW; - if ((!ctx->kernel) || ~(dar & (1ULL << 63))) + if ((!ctx->kernel) || (REGION_ID(dar) == USER_REGION_ID)) access |= _PAGE_USER; if (dsisr & DSISR_NOHPTE) -- cgit v0.10.2 From 2d5217840ff1d0cf0f88201a922ab35bd083f7fb Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Tue, 26 Apr 2016 15:02:50 +1000 Subject: powerpc/eeh: fix misleading indentation Found by smatch. Signed-off-by: Andrew Donnellan Acked-by: Russell Currey Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index eea48d8..f0520da 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -249,7 +249,7 @@ static void *__eeh_pe_get(void *data, void *flag) } else { if (edev->pe_config_addr && (edev->pe_config_addr == pe->addr)) - return pe; + return pe; } /* Try BDF address */ -- cgit v0.10.2 From 4ad5e8831e1cb663f17112e44406b5ca9649ba1f Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Tue, 26 Apr 2016 17:55:04 +1000 Subject: powerpc/mpic: handle subsys_system_register() failure mpic_init_sys() currently doesn't check whether subsys_system_register() succeeded or not. Check the return code of subsys_system_register() and clean up if there's an error. Signed-off-by: Andrew Donnellan Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index afe3c7c..7de45b2 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -2004,8 +2004,15 @@ static struct syscore_ops mpic_syscore_ops = { static int mpic_init_sys(void) { + int rc; + register_syscore_ops(&mpic_syscore_ops); - subsys_system_register(&mpic_subsys, NULL); + rc = subsys_system_register(&mpic_subsys, NULL); + if (rc) { + unregister_syscore_ops(&mpic_syscore_ops); + pr_err("mpic: Failed to register subsystem!\n"); + return rc; + } return 0; } -- cgit v0.10.2 From 8a649045e75a4b9091ea9d041f5bb599f8ec1f8a Mon Sep 17 00:00:00 2001 From: Chris Smart Date: Tue, 26 Apr 2016 10:28:50 +1000 Subject: powerpc: Add support for userspace P9 copy paste The copy paste facility introduced in POWER9 provides an optimised mechanism for a userspace application to copy a cacheline. This is provided by a pair of instructions, copy and paste, while a third, cp_abort (copy paste abort), provides a clean up of the state in case of a failure. The copy instruction will read a 128 byte cacheline and store it in an internal buffer. The subsequent paste instruction will store this internal buffer to memory and set a CR field if the paste succeeds. Since the state of the copy paste buffer is internal (and not architecturally visible), in the unlikely event of a context switch, the state cannot be stored and the paste should therefore fail. The cp_abort instruction exists to fail and clean up any such interrupted copy paste sequence and is to be called by the kernel as part of the context switch. Doing so prevents data from a preceding copy in one process leaking into the paste of another. This code enables use of the cp_abort instruction if a supported processor is detected. NOTE: this is for userspace only, not in kernel, and does not deal with KVM guests. Patch created with much assistance from Michael Neuling Signed-off-by: Chris Smart Reviewed-by: Cyril Bur Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 7ab04fc..1d035c1 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -131,6 +131,7 @@ /* sorted alphabetically */ #define PPC_INST_BHRBE 0x7c00025c #define PPC_INST_CLRBHRB 0x7c00035c +#define PPC_INST_CP_ABORT 0x7c00068c #define PPC_INST_DCBA 0x7c0005ec #define PPC_INST_DCBA_MASK 0xfc0007fe #define PPC_INST_DCBAL 0x7c2005ec @@ -285,6 +286,7 @@ #endif /* Deal with instructions that older assemblers aren't aware of */ +#define PPC_CP_ABORT stringify_in_c(.long PPC_INST_CP_ABORT) #define PPC_DCBAL(a, b) stringify_in_c(.long PPC_INST_DCBAL | \ __PPC_RA(a) | __PPC_RB(b)) #define PPC_DCBZL(a, b) stringify_in_c(.long PPC_INST_DCBZL | \ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 39a79c8..4fd4922 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -37,6 +37,7 @@ #include #include #include +#include /* * System calls. @@ -509,6 +510,14 @@ BEGIN_FTR_SECTION ldarx r6,0,r1 END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS) +BEGIN_FTR_SECTION +/* + * A cp_abort (copy paste abort) here ensures that when context switching, a + * copy from one process can't leak into the paste of another. + */ + PPC_CP_ABORT +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + #ifdef CONFIG_PPC_BOOK3S /* Cancel all explict user streams as they will have no use after context * switch and will stop the HW from creating streams itself -- cgit v0.10.2 From 8fe088850f13eabf8197df5b0bf38668fa6c1294 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Tue, 26 Apr 2016 23:49:09 +1000 Subject: powerpc: rework sparse for lib/xor_vmx.c Sparse doesn't seem to be passing -maltivec around properly, leading to lots of errors: .../include/altivec.h:34:2: error: Use the "-maltivec" flag to enable PowerPC AltiVec support arch/powerpc/lib/xor_vmx.c:27:16: error: Expected ; at end of declaration arch/powerpc/lib/xor_vmx.c:27:16: error: got signed arch/powerpc/lib/xor_vmx.c:60:9: error: No right hand side of '*'-expression arch/powerpc/lib/xor_vmx.c:60:9: error: Expected ; at end of statement arch/powerpc/lib/xor_vmx.c:60:9: error: got v1_in ... arch/powerpc/lib/xor_vmx.c:87:9: error: too many errors Only include the altivec.h header for non-__CHECKER__ builds. For builds with __CHECKER__, make up some stubs instead, as suggested by Balbir. (The vector size of 16 is arbitrary.) Suggested-by: Balbir Singh Signed-off-by: Daniel Axtens Tested-by: Balbir Singh Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/lib/xor_vmx.c b/arch/powerpc/lib/xor_vmx.c index 07f49f1..f9de69a 100644 --- a/arch/powerpc/lib/xor_vmx.c +++ b/arch/powerpc/lib/xor_vmx.c @@ -17,7 +17,17 @@ * * Author: Anton Blanchard */ + +/* + * Sparse (as at v0.5.0) gets very, very confused by this file. + * Make it a bit simpler for it. + */ +#if !defined(__CHECKER__) #include +#else +#define vec_xor(a, b) a ^ b +#define vector __attribute__((vector_size(16))) +#endif #include #include -- cgit v0.10.2 From 7132e2d669bd42c3783327f301aaac5f4463299b Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Mon, 25 Apr 2016 18:56:14 -0300 Subject: ftrace: Match dot symbols when searching functions on ppc64 In the ppc64 big endian ABI, function symbols point to function descriptors. The symbols which point to the function entry points have a dot in front of the function name. Consequently, when the ftrace filter mechanism searches for the symbol corresponding to an entry point address, it gets the dot symbol. As a result, ftrace filter users have to be aware of this ABI detail on ppc64 and prepend a dot to the function name when setting the filter. The perf probe command insulates the user from this by ignoring the dot in front of the symbol name when matching function names to symbols, but the sysfs interface does not. This patch makes the ftrace filter mechanism do the same when searching symbols. Fixes the following failure in ftracetest's kprobe_ftrace.tc: .../kprobe_ftrace.tc: line 9: echo: write error: Invalid argument That failure is on this line of kprobe_ftrace.tc: echo _do_fork > set_ftrace_filter This is because there's no _do_fork entry in the functions list: # cat available_filter_functions | grep _do_fork ._do_fork This change introduces no regressions on the perf and ftracetest testsuite results. Cc: Steven Rostedt Cc: Ingo Molnar Cc: Michael Ellerman Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Thiago Jung Bauermann Acked-by: Steven Rostedt Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 9dac18da..1123a4d 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -607,3 +607,13 @@ unsigned long __init arch_syscall_addr(int nr) return sys_call_table[nr*2]; } #endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_PPC64 */ + +#if defined(CONFIG_PPC64) && (!defined(_CALL_ELF) || _CALL_ELF != 2) +char *arch_ftrace_match_adjust(char *str, const char *search) +{ + if (str[0] == '.' && search[0] != '.') + return str + 1; + else + return str; +} +#endif /* defined(CONFIG_PPC64) && (!defined(_CALL_ELF) || _CALL_ELF != 2) */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7e8d792..a6c8252 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3456,11 +3456,23 @@ struct ftrace_glob { int type; }; +/* + * If symbols in an architecture don't correspond exactly to the user-visible + * name of what they represent, it is possible to define this function to + * perform the necessary adjustments. +*/ +char * __weak arch_ftrace_match_adjust(char *str, const char *search) +{ + return str; +} + static int ftrace_match(char *str, struct ftrace_glob *g) { int matched = 0; int slen; + str = arch_ftrace_match_adjust(str, g->search); + switch (g->type) { case MATCH_FULL: if (strcmp(str, g->search) == 0) -- cgit v0.10.2 From 5bcca743cb6e3ab97ff400ef92c7f57c787020f6 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Thu, 21 Apr 2016 15:46:34 +0530 Subject: powerpc/perf: Replace raw event hex values with #defines Minor cleanup patch to replace the raw event hex values in power8-pmu.c with #defines. Signed-off-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/perf/power8-events-list.h b/arch/powerpc/perf/power8-events-list.h index 741b77e..3a2e6e8 100644 --- a/arch/powerpc/perf/power8-events-list.h +++ b/arch/powerpc/perf/power8-events-list.h @@ -49,3 +49,43 @@ EVENT(PM_L3_PREF_ALL, 0x4e052) EVENT(PM_DTLB_MISS, 0x300fc) /* ITLB Reloaded */ EVENT(PM_ITLB_MISS, 0x400fc) +/* Run_Instructions */ +EVENT(PM_RUN_INST_CMPL, 0x500fa) +/* Alternate event code for PM_RUN_INST_CMPL */ +EVENT(PM_RUN_INST_CMPL_ALT, 0x400fa) +/* Run_cycles */ +EVENT(PM_RUN_CYC, 0x600f4) +/* Alternate event code for Run_cycles */ +EVENT(PM_RUN_CYC_ALT, 0x200f4) +/* Marked store completed */ +EVENT(PM_MRK_ST_CMPL, 0x10134) +/* Alternate event code for Marked store completed */ +EVENT(PM_MRK_ST_CMPL_ALT, 0x301e2) +/* Marked two path branch */ +EVENT(PM_BR_MRK_2PATH, 0x10138) +/* Alternate event code for PM_BR_MRK_2PATH */ +EVENT(PM_BR_MRK_2PATH_ALT, 0x40138) +/* L3 castouts in Mepf state */ +EVENT(PM_L3_CO_MEPF, 0x18082) +/* Alternate event code for PM_L3_CO_MEPF */ +EVENT(PM_L3_CO_MEPF_ALT, 0x3e05e) +/* Data cache was reloaded from a location other than L2 due to a marked load */ +EVENT(PM_MRK_DATA_FROM_L2MISS, 0x1d14e) +/* Alternate event code for PM_MRK_DATA_FROM_L2MISS */ +EVENT(PM_MRK_DATA_FROM_L2MISS_ALT, 0x401e8) +/* Alternate event code for PM_CMPLU_STALL */ +EVENT(PM_CMPLU_STALL_ALT, 0x1e054) +/* Two path branch */ +EVENT(PM_BR_2PATH, 0x20036) +/* Alternate event code for PM_BR_2PATH */ +EVENT(PM_BR_2PATH_ALT, 0x40036) +/* # PPC Dispatched */ +EVENT(PM_INST_DISP, 0x200f2) +/* Alternate event code for PM_INST_DISP */ +EVENT(PM_INST_DISP_ALT, 0x300f2) +/* Marked filter Match */ +EVENT(PM_MRK_FILT_MATCH, 0x2013c) +/* Alternate event code for PM_MRK_FILT_MATCH */ +EVENT(PM_MRK_FILT_MATCH_ALT, 0x3012e) +/* Alternate event code for PM_LD_MISS_L1 */ +EVENT(PM_LD_MISS_L1_ALT, 0x400f0) diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c index 690d918..7cf3b43 100644 --- a/arch/powerpc/perf/power8-pmu.c +++ b/arch/powerpc/perf/power8-pmu.c @@ -274,7 +274,8 @@ static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long /* Ignore Linux defined bits when checking event below */ base_event = event & ~EVENT_LINUX_MASK; - if (pmc >= 5 && base_event != 0x500fa && base_event != 0x600f4) + if (pmc >= 5 && base_event != PM_RUN_INST_CMPL && + base_event != PM_RUN_CYC) return -1; mask |= CNST_PMC_MASK(pmc); @@ -488,17 +489,17 @@ static int power8_compute_mmcr(u64 event[], int n_ev, /* Table of alternatives, sorted by column 0 */ static const unsigned int event_alternatives[][MAX_ALT] = { - { 0x10134, 0x301e2 }, /* PM_MRK_ST_CMPL */ - { 0x10138, 0x40138 }, /* PM_BR_MRK_2PATH */ - { 0x18082, 0x3e05e }, /* PM_L3_CO_MEPF */ - { 0x1d14e, 0x401e8 }, /* PM_MRK_DATA_FROM_L2MISS */ - { 0x1e054, 0x4000a }, /* PM_CMPLU_STALL */ - { 0x20036, 0x40036 }, /* PM_BR_2PATH */ - { 0x200f2, 0x300f2 }, /* PM_INST_DISP */ - { 0x200f4, 0x600f4 }, /* PM_RUN_CYC */ - { 0x2013c, 0x3012e }, /* PM_MRK_FILT_MATCH */ - { 0x3e054, 0x400f0 }, /* PM_LD_MISS_L1 */ - { 0x400fa, 0x500fa }, /* PM_RUN_INST_CMPL */ + { PM_MRK_ST_CMPL, PM_MRK_ST_CMPL_ALT }, + { PM_BR_MRK_2PATH, PM_BR_MRK_2PATH_ALT }, + { PM_L3_CO_MEPF, PM_L3_CO_MEPF_ALT }, + { PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L2MISS_ALT }, + { PM_CMPLU_STALL_ALT, PM_CMPLU_STALL }, + { PM_BR_2PATH, PM_BR_2PATH_ALT }, + { PM_INST_DISP, PM_INST_DISP_ALT }, + { PM_RUN_CYC_ALT, PM_RUN_CYC }, + { PM_MRK_FILT_MATCH, PM_MRK_FILT_MATCH_ALT }, + { PM_LD_MISS_L1, PM_LD_MISS_L1_ALT }, + { PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL }, }; /* @@ -546,17 +547,17 @@ static int power8_get_alternatives(u64 event, unsigned int flags, u64 alt[]) j = num_alt; for (i = 0; i < num_alt; ++i) { switch (alt[i]) { - case 0x1e: /* PM_CYC */ - alt[j++] = 0x600f4; /* PM_RUN_CYC */ + case PM_CYC: + alt[j++] = PM_RUN_CYC; break; - case 0x600f4: /* PM_RUN_CYC */ - alt[j++] = 0x1e; + case PM_RUN_CYC: + alt[j++] = PM_CYC; break; - case 0x2: /* PM_PPC_CMPL */ - alt[j++] = 0x500fa; /* PM_RUN_INST_CMPL */ + case PM_INST_CMPL: + alt[j++] = PM_RUN_INST_CMPL; break; - case 0x500fa: /* PM_RUN_INST_CMPL */ - alt[j++] = 0x2; /* PM_PPC_CMPL */ + case PM_RUN_INST_CMPL: + alt[j++] = PM_INST_CMPL; break; } } -- cgit v0.10.2 From 8ffb4103f5e28d7e7890ed4774d8e009f253f56e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:24 +1000 Subject: IB/qib: Use cache inhibitted and guarded mapping on powerpc The driver was requesting for a writethrough mapping. But with those flags we will end up with an SAO mapping because we now have memory conherence always enabled. ie, the existing mapping will end up with a WIMG value 0b1110 which is Strong Access Order. Update this to use cache inhibitted guarded mapping. Cc: Doug Ledford Cc: Sean Hefty Cc: Hal Rosenstock Cc: linux-rdma@vger.kernel.org Signed-off-by: Aneesh Kumar K.V Acked-by: Mike Marciniszyn Signed-off-by: Michael Ellerman diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index e449e39..a3d593f 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -822,10 +822,7 @@ static int mmap_piobufs(struct vm_area_struct *vma, phys = dd->physaddr + piobufs; #if defined(__powerpc__) - /* There isn't a generic way to specify writethrough mappings */ - pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE; - pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU; - pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); #endif /* diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c index 4758a38..6abe1c6 100644 --- a/drivers/infiniband/hw/qib/qib_pcie.c +++ b/drivers/infiniband/hw/qib/qib_pcie.c @@ -144,13 +144,7 @@ int qib_pcie_ddinit(struct qib_devdata *dd, struct pci_dev *pdev, addr = pci_resource_start(pdev, 0); len = pci_resource_len(pdev, 0); -#if defined(__powerpc__) - /* There isn't a generic way to specify writethrough mappings */ - dd->kregbase = __ioremap(addr, len, _PAGE_NO_CACHE | _PAGE_WRITETHRU); -#else dd->kregbase = ioremap_nocache(addr, len); -#endif - if (!dd->kregbase) return -ENOMEM; -- cgit v0.10.2 From 670eea924198456873f3346bb247b662d7ddffd7 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 29 Apr 2016 23:25:25 +1000 Subject: powerpc/mm: Always use STRICT_MM_TYPECHECKS Testing done by Paul Mackerras has shown that with a modern compiler there is no negative effect on code generation from enabling STRICT_MM_TYPECHECKS. So remove the option, and always use the strict type definitions. Acked-by: Paul Mackerras Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 638f9ce..d3fcf7e 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -19,14 +19,6 @@ config PPC_WERROR depends on !PPC_DISABLE_WERROR default y -config STRICT_MM_TYPECHECKS - bool "Do extra type checking on mm types" - default n - help - This option turns on extra type checking for some mm related types. - - If you don't know what this means, say N. - config PRINT_STACK_DEPTH int "Stack depth to print" if DEBUG_KERNEL default 64 diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h index 43140f8..1464e74 100644 --- a/arch/powerpc/include/asm/pgtable-types.h +++ b/arch/powerpc/include/asm/pgtable-types.h @@ -1,9 +1,6 @@ #ifndef _ASM_POWERPC_PGTABLE_TYPES_H #define _ASM_POWERPC_PGTABLE_TYPES_H -#ifdef CONFIG_STRICT_MM_TYPECHECKS -/* These are used to make use of C type-checking. */ - /* PTE level */ typedef struct { pte_basic_t pte; } pte_t; #define __pte(x) ((pte_t) { (x) }) @@ -48,49 +45,6 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define pgprot_val(x) ((x).pgprot) #define __pgprot(x) ((pgprot_t) { (x) }) -#else - -/* - * .. while these make it easier on the compiler - */ - -typedef pte_basic_t pte_t; -#define __pte(x) (x) -static inline pte_basic_t pte_val(pte_t pte) -{ - return pte; -} - -#ifdef CONFIG_PPC64 -typedef unsigned long pmd_t; -#define __pmd(x) (x) -static inline unsigned long pmd_val(pmd_t pmd) -{ - return pmd; -} - -#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES) -typedef unsigned long pud_t; -#define __pud(x) (x) -static inline unsigned long pud_val(pud_t pud) -{ - return pud; -} -#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */ -#endif /* CONFIG_PPC64 */ - -typedef unsigned long pgd_t; -#define __pgd(x) (x) -static inline unsigned long pgd_val(pgd_t pgd) -{ - return pgd; -} - -typedef unsigned long pgprot_t; -#define pgprot_val(x) (x) -#define __pgprot(x) (x) - -#endif /* CONFIG_STRICT_MM_TYPECHECKS */ /* * With hash config 64k pages additionally define a bigger "real PTE" type that * gathers the "second half" part of the PTE for pseudo 64k pages -- cgit v0.10.2 From 4bece39b503c2340de154c82bf6d928e72d74d4e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:26 +1000 Subject: powerpc/mm: Drop PTE_ATOMIC_UPDATES from pmd_hugepage_update() pmd_hugepage_update() is inside #ifdef CONFIG_TRANSPARENT_HUGEPAGE. THP can only be enabled if PPC_BOOK3S_64=y && PPC_64K_PAGES=y, aka. hash64. On hash64 we always define PTE_ATOMIC_UPDATES to 1, meaning the #ifdef in pmd_hugepage_update() is unnecessary, so drop it. That is also the only use of PTE_ATOMIC_UPDATES in any of the hash code, meaning we no longer need to #define it at all in the hash headers. Note it's still #defined and used in the nohash code. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/32/hash.h b/arch/powerpc/include/asm/book3s/32/hash.h index 264b754..880db13 100644 --- a/arch/powerpc/include/asm/book3s/32/hash.h +++ b/arch/powerpc/include/asm/book3s/32/hash.h @@ -39,8 +39,5 @@ #define _PMD_PRESENT_MASK (PAGE_MASK) #define _PMD_BAD (~PAGE_MASK) -/* Hash table based platforms need atomic updates of the linux PTE */ -#define PTE_ATOMIC_UPDATES 1 - #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_BOOK3S_32_HASH_H */ diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index d0ee6fc..8f2ae067 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -127,8 +127,6 @@ #define _PTEIDX_SECONDARY 0x8 #define _PTEIDX_GROUP_IX 0x7 -/* Hash table based platforms need atomic updates of the linux PTE */ -#define PTE_ATOMIC_UPDATES 1 #define _PTE_NONE_MASK _PAGE_HPTEFLAGS /* * The mask convered by the RPN must be a ULL on 32-bit platforms with diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 3471060..cda88b8 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -522,7 +522,6 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, assert_spin_locked(&mm->page_table_lock); #endif -#ifdef PTE_ATOMIC_UPDATES __asm__ __volatile__( "1: ldarx %0,0,%3\n\ andi. %1,%0,%6\n\ @@ -534,10 +533,7 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, : "=&r" (old), "=&r" (tmp), "=m" (*pmdp) : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set) : "cc" ); -#else - old = pmd_val(*pmdp); - *pmdp = __pmd((old & ~clr) | set); -#endif + trace_hugepage_update(addr, old, clr, set); if (old & _PAGE_HASHPTE) hpte_do_hugepage_flush(mm, addr, pmdp, old); -- cgit v0.10.2 From 3910a7f485e1222de2176ef5bf3a1d8de3152920 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 29 Apr 2016 23:25:27 +1000 Subject: powerpc/mm: Add pte_xchg() helper We have five locations in 64-bit hash MMU code that do a cmpxchg() of a PTE. Currently doing it inline OK, but in a future patch we will be converting the PTEs to __be64 in some configs. In that case we will need casts at every cmpxchg() site in order to keep sparse happy. So move the logic into a helper, this is a reasonably nice cleanup on its own. Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 7529aab..42da513 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -319,11 +319,8 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing) if (writing && pte_write(old_pte)) new_pte = pte_mkdirty(new_pte); - if (pte_val(old_pte) == __cmpxchg_u64((unsigned long *)ptep, - pte_val(old_pte), - pte_val(new_pte))) { + if (pte_xchg(ptep, old_pte, new_pte)) break; - } } return new_pte; } diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h index 1464e74..e7f4f3e 100644 --- a/arch/powerpc/include/asm/pgtable-types.h +++ b/arch/powerpc/include/asm/pgtable-types.h @@ -54,4 +54,16 @@ typedef struct { pte_t pte; unsigned long hidx; } real_pte_t; #else typedef struct { pte_t pte; } real_pte_t; #endif + +#ifdef CONFIG_PPC_STD_MMU_64 +#include + +static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new) +{ + unsigned long *p = (unsigned long *)ptep; + + return pte_val(old) == __cmpxchg_u64(p, pte_val(old), pte_val(new)); +} +#endif + #endif /* _ASM_POWERPC_PGTABLE_TYPES_H */ diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c index 47d1b26..dbd3133 100644 --- a/arch/powerpc/mm/hash64_4k.c +++ b/arch/powerpc/mm/hash64_4k.c @@ -47,8 +47,8 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; if (access & _PAGE_RW) new_pte |= _PAGE_DIRTY; - } while (old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + /* * PP bits. _PAGE_USER is already PP bit 0x2, so we only * need to add in 0x1 if it's a read-only user page diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c index b2d659c..6fbf983 100644 --- a/arch/powerpc/mm/hash64_64k.c +++ b/arch/powerpc/mm/hash64_64k.c @@ -79,8 +79,8 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED | _PAGE_COMBO; if (access & _PAGE_RW) new_pte |= _PAGE_DIRTY; - } while (old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + /* * Handle the subpage protection bits */ @@ -254,8 +254,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access, new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; if (access & _PAGE_RW) new_pte |= _PAGE_DIRTY; - } while (old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); rflags = htab_convert_pte_flags(new_pte); diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index 8555fce..9c05187 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -57,8 +57,8 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; if (access & _PAGE_RW) new_pte |= _PAGE_DIRTY; - } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); + } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + rflags = htab_convert_pte_flags(new_pte); sz = ((1UL) << shift); -- cgit v0.10.2 From 5dc1ef858c12f865e3676727e03519bece4ce6c1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:28 +1000 Subject: powerpc/mm: Use big endian Linux page tables for book3s 64 Traditionally Power server machines have used the Hashed Page Table MMU mode. In this mode Linux manages its own tree of nested page tables, aka. "the Linux page tables", which are not used by the hardware directly, and software loads translations into the hash page table for use by the hardware. Power ISA 3.0 defines a new MMU mode, known as Radix Tree Translation, where the hardware can directly operate on the Linux page tables. However the hardware requires that the page tables be in big endian format. To accommodate this, switch the pgtable types to __be64 and add appropriate endian conversions. Because we will be supporting a single kernel binary that boots using either radix or hash mode, we always store the Linux page tables big endian, even in hash mode where they are not actually used by the hardware. Signed-off-by: Aneesh Kumar K.V [mpe: Fix sparse errors, flesh out change log] Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 8f2ae067..7fc1a9a 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -247,23 +247,26 @@ static inline unsigned long pte_update(struct mm_struct *mm, unsigned long set, int huge) { - unsigned long old, tmp; + __be64 old_be, tmp_be; + unsigned long old; __asm__ __volatile__( "1: ldarx %0,0,%3 # pte_update\n\ - andi. %1,%0,%6\n\ + and. %1,%0,%6\n\ bne- 1b \n\ andc %1,%0,%4 \n\ or %1,%1,%7\n\ stdcx. %1,0,%3 \n\ bne- 1b" - : "=&r" (old), "=&r" (tmp), "=m" (*ptep) - : "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY), "r" (set) + : "=&r" (old_be), "=&r" (tmp_be), "=m" (*ptep) + : "r" (ptep), "r" (cpu_to_be64(clr)), "m" (*ptep), + "r" (cpu_to_be64(_PAGE_BUSY)), "r" (cpu_to_be64(set)) : "cc" ); /* huge pages use the old page table lock */ if (!huge) assert_pte_locked(mm, addr); + old = be64_to_cpu(old_be); if (old & _PAGE_HASHPTE) hpte_need_flush(mm, addr, ptep, old, huge); @@ -344,21 +347,22 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr, */ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) { - unsigned long bits = pte_val(entry) & - (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC | - _PAGE_SOFT_DIRTY); + __be64 old, tmp, val, mask; + + mask = cpu_to_be64(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | + _PAGE_EXEC | _PAGE_SOFT_DIRTY); - unsigned long old, tmp; + val = pte_raw(entry) & mask; __asm__ __volatile__( "1: ldarx %0,0,%4\n\ - andi. %1,%0,%6\n\ + and. %1,%0,%6\n\ bne- 1b \n\ or %0,%3,%0\n\ stdcx. %0,0,%4\n\ bne- 1b" :"=&r" (old), "=&r" (tmp), "=m" (*ptep) - :"r" (bits), "r" (ptep), "m" (*ptep), "i" (_PAGE_BUSY) + :"r" (val), "r" (ptep), "m" (*ptep), "r" (cpu_to_be64(_PAGE_BUSY)) :"cc"); } diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index ab3d897..158574d 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -288,7 +288,11 @@ extern long long virt_phys_offset; #ifndef __ASSEMBLY__ +#ifdef CONFIG_PPC_BOOK3S_64 +#include +#else #include +#endif typedef struct { signed long pd; } hugepd_t; diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h new file mode 100644 index 0000000..e2bf208 --- /dev/null +++ b/arch/powerpc/include/asm/pgtable-be-types.h @@ -0,0 +1,92 @@ +#ifndef _ASM_POWERPC_PGTABLE_BE_TYPES_H +#define _ASM_POWERPC_PGTABLE_BE_TYPES_H + +#include + +/* PTE level */ +typedef struct { __be64 pte; } pte_t; +#define __pte(x) ((pte_t) { cpu_to_be64(x) }) +static inline unsigned long pte_val(pte_t x) +{ + return be64_to_cpu(x.pte); +} + +static inline __be64 pte_raw(pte_t x) +{ + return x.pte; +} + +/* PMD level */ +#ifdef CONFIG_PPC64 +typedef struct { __be64 pmd; } pmd_t; +#define __pmd(x) ((pmd_t) { cpu_to_be64(x) }) +static inline unsigned long pmd_val(pmd_t x) +{ + return be64_to_cpu(x.pmd); +} + +static inline __be64 pmd_raw(pmd_t x) +{ + return x.pmd; +} + +/* + * 64 bit hash always use 4 level table. Everybody else use 4 level + * only for 4K page size. + */ +#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES) +typedef struct { __be64 pud; } pud_t; +#define __pud(x) ((pud_t) { cpu_to_be64(x) }) +static inline unsigned long pud_val(pud_t x) +{ + return be64_to_cpu(x.pud); +} +#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */ +#endif /* CONFIG_PPC64 */ + +/* PGD level */ +typedef struct { __be64 pgd; } pgd_t; +#define __pgd(x) ((pgd_t) { cpu_to_be64(x) }) +static inline unsigned long pgd_val(pgd_t x) +{ + return be64_to_cpu(x.pgd); +} + +/* Page protection bits */ +typedef struct { unsigned long pgprot; } pgprot_t; +#define pgprot_val(x) ((x).pgprot) +#define __pgprot(x) ((pgprot_t) { (x) }) + +/* + * With hash config 64k pages additionally define a bigger "real PTE" type that + * gathers the "second half" part of the PTE for pseudo 64k pages + */ +#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64) +typedef struct { pte_t pte; unsigned long hidx; } real_pte_t; +#else +typedef struct { pte_t pte; } real_pte_t; +#endif + +static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new) +{ + unsigned long *p = (unsigned long *)ptep; + __be64 prev; + + prev = (__force __be64)__cmpxchg_u64(p, (__force unsigned long)pte_raw(old), + (__force unsigned long)pte_raw(new)); + + return pte_raw(old) == prev; +} + +static inline bool pmd_xchg(pmd_t *pmdp, pmd_t old, pmd_t new) +{ + unsigned long *p = (unsigned long *)pmdp; + __be64 prev; + + prev = (__force __be64)__cmpxchg_u64(p, (__force unsigned long)pmd_raw(old), + (__force unsigned long)pmd_raw(new)); + + return pmd_raw(old) == prev; +} + +#endif /* _ASM_POWERPC_PGTABLE_BE_TYPES_H */ diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c index 6fbf983..292f407 100644 --- a/arch/powerpc/mm/hash64_64k.c +++ b/arch/powerpc/mm/hash64_64k.c @@ -220,7 +220,6 @@ int __hash_page_64K(unsigned long ea, unsigned long access, unsigned long vsid, pte_t *ptep, unsigned long trap, unsigned long flags, int ssize) { - unsigned long hpte_group; unsigned long rflags, pa; unsigned long old_pte, new_pte; diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index eb2accd..92c3c18 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -49,8 +49,8 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED; if (access & _PAGE_RW) new_pmd |= _PAGE_DIRTY; - } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp, - old_pmd, new_pmd)); + } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd))); + rflags = htab_convert_pte_flags(new_pmd); #if 0 diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index cda88b8..494fc11 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -515,7 +515,8 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, unsigned long set) { - unsigned long old, tmp; + __be64 old_be, tmp; + unsigned long old; #ifdef CONFIG_DEBUG_VM WARN_ON(!pmd_trans_huge(*pmdp)); @@ -524,16 +525,19 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, __asm__ __volatile__( "1: ldarx %0,0,%3\n\ - andi. %1,%0,%6\n\ + and. %1,%0,%6\n\ bne- 1b \n\ andc %1,%0,%4 \n\ or %1,%1,%7\n\ stdcx. %1,0,%3 \n\ bne- 1b" - : "=&r" (old), "=&r" (tmp), "=m" (*pmdp) - : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set) + : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp) + : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp), + "r" (cpu_to_be64(_PAGE_BUSY)), "r" (cpu_to_be64(set)) : "cc" ); + old = be64_to_cpu(old_be); + trace_hugepage_update(addr, old, clr, set); if (old & _PAGE_HASHPTE) hpte_do_hugepage_flush(mm, addr, pmdp, old); -- cgit v0.10.2 From ee3caed37d763ab28f8ac55563f38fb7530c6252 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 29 Apr 2016 23:25:29 +1000 Subject: powerpc/mm: Use pte_raw() in pte_same()/pmd_same() We can avoid doing endian conversions by using pte_raw() in pxx_same(). The swap of the constant (_PAGE_HPTEFLAGS) should be done at compile time by the compiler. Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 0a7956a..f4949f1 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -272,7 +272,7 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd) #define __HAVE_ARCH_PMD_SAME static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { - return (((pmd_val(pmd_a) ^ pmd_val(pmd_b)) & ~_PAGE_HPTEFLAGS) == 0); + return (((pmd_raw(pmd_a) ^ pmd_raw(pmd_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0); } static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 7fc1a9a..22064bd 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -372,7 +372,11 @@ static inline int pgd_bad(pgd_t pgd) } #define __HAVE_ARCH_PTE_SAME -#define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0) +static inline int pte_same(pte_t pte_a, pte_t pte_b) +{ + return (((pte_raw(pte_a) ^ pte_raw(pte_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0); +} + static inline unsigned long pgd_page_vaddr(pgd_t pgd) { return (unsigned long)__va(pgd_val(pgd) & ~PGD_MASKED_BITS); -- cgit v0.10.2 From c7d54842deb1fa357cff75b988275a1c9f259140 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:30 +1000 Subject: powerpc/mm: Use _PAGE_READ to indicate Read access This splits the _PAGE_RW bit into _PAGE_READ and _PAGE_WRITE. It also removes the dependency on _PAGE_USER for implying read only. Few things to note here is that, we have read implied with write and execute permission. Hence we should always find _PAGE_READ set on hash pte fault. We still can't switch PROT_NONE to !(_PAGE_RWX). Auto numa depends on marking a prot none pte _PAGE_WRITE. (For more details look at b191f9b106ea "mm: numa: preserve PTE write permissions across a NUMA hinting fault") Cc: Arnd Bergmann Cc: Jeremy Kerr Cc: Frederic Barrat Acked-by: Ian Munsie Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index f4949f1..f465050 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -291,10 +291,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - if ((pmd_val(*pmdp) & _PAGE_RW) == 0) + if ((pmd_val(*pmdp) & _PAGE_WRITE) == 0) return; - pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0); + pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 22064bd..fb76f9c 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -16,8 +16,10 @@ #define _PAGE_BIT_SWAP_TYPE 0 #define _PAGE_EXEC 0x00001 /* execute permission */ -#define _PAGE_RW 0x00002 /* read & write access allowed */ +#define _PAGE_WRITE 0x00002 /* write access allowed */ #define _PAGE_READ 0x00004 /* read access allowed */ +#define _PAGE_RW (_PAGE_READ | _PAGE_WRITE) +#define _PAGE_RWX (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC) #define _PAGE_USER 0x00008 /* page may be accessed by userspace */ #define _PAGE_GUARDED 0x00010 /* G: guarded (side-effect) page */ /* M (memory coherence) is always set in the HPTE, so we don't need it here */ @@ -145,8 +147,8 @@ */ #define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ _PAGE_WRITETHRU | _PAGE_4K_PFN | \ - _PAGE_USER | _PAGE_ACCESSED | \ - _PAGE_RW | _PAGE_DIRTY | _PAGE_EXEC | \ + _PAGE_USER | _PAGE_ACCESSED | _PAGE_READ |\ + _PAGE_WRITE | _PAGE_DIRTY | _PAGE_EXEC | \ _PAGE_SOFT_DIRTY) /* * We define 2 sets of base prot bits, one for basic pages (ie, @@ -171,10 +173,12 @@ #define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) #define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | \ _PAGE_EXEC) -#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER ) -#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) -#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER ) -#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) +#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_READ) +#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_READ| \ + _PAGE_EXEC) +#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_READ) +#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_READ| \ + _PAGE_EXEC) #define __P000 PAGE_NONE #define __P001 PAGE_READONLY @@ -296,19 +300,19 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - if ((pte_val(*ptep) & _PAGE_RW) == 0) + if ((pte_val(*ptep) & _PAGE_WRITE) == 0) return; - pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); + pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0); } static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - if ((pte_val(*ptep) & _PAGE_RW) == 0) + if ((pte_val(*ptep) & _PAGE_WRITE) == 0) return; - pte_update(mm, addr, ptep, _PAGE_RW, 0, 1); + pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 1); } /* @@ -349,7 +353,7 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) { __be64 old, tmp, val, mask; - mask = cpu_to_be64(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | + mask = cpu_to_be64(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_READ | _PAGE_WRITE | _PAGE_EXEC | _PAGE_SOFT_DIRTY); val = pte_raw(entry) & mask; @@ -384,7 +388,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) /* Generic accessors to PTE bits */ -static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_RW);} +static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_WRITE);} static inline int pte_dirty(pte_t pte) { return !!(pte_val(pte) & _PAGE_DIRTY); } static inline int pte_young(pte_t pte) { return !!(pte_val(pte) & _PAGE_ACCESSED); } static inline int pte_special(pte_t pte) { return !!(pte_val(pte) & _PAGE_SPECIAL); } @@ -445,7 +449,7 @@ static inline unsigned long pte_pfn(pte_t pte) /* Generic modifiers for PTE bits */ static inline pte_t pte_wrprotect(pte_t pte) { - return __pte(pte_val(pte) & ~_PAGE_RW); + return __pte(pte_val(pte) & ~_PAGE_WRITE); } static inline pte_t pte_mkclean(pte_t pte) @@ -460,6 +464,9 @@ static inline pte_t pte_mkold(pte_t pte) static inline pte_t pte_mkwrite(pte_t pte) { + /* + * write implies read, hence set both + */ return __pte(pte_val(pte) | _PAGE_RW); } diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 1ec67b0..9f5dea5 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -198,3 +198,8 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); /* Advertise support for _PAGE_SPECIAL */ #define __HAVE_ARCH_PTE_SPECIAL +#ifndef _PAGE_READ +/* if not defined, we should not find _PAGE_WRITE too */ +#define _PAGE_READ 0 +#define _PAGE_WRITE _PAGE_RW +#endif diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c index dbd3133..491b7d1 100644 --- a/arch/powerpc/mm/hash64_4k.c +++ b/arch/powerpc/mm/hash64_4k.c @@ -45,7 +45,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, * also add _PAGE_COMBO */ new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_RW) + if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c index 292f407..2d34721 100644 --- a/arch/powerpc/mm/hash64_64k.c +++ b/arch/powerpc/mm/hash64_64k.c @@ -77,7 +77,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, * also add _PAGE_COMBO */ new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED | _PAGE_COMBO; - if (access & _PAGE_RW) + if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); @@ -251,7 +251,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access, * a write access. */ new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_RW) + if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 7635b1c..ed190e4 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -175,8 +175,9 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags) * or PP=0x3 for read-only (including writeable but clean pages). */ if (pteflags & _PAGE_USER) { - rflags |= 0x2; - if (!((pteflags & _PAGE_RW) && (pteflags & _PAGE_DIRTY))) + if (pteflags & _PAGE_RWX) + rflags |= 0x2; + if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY))) rflags |= 0x1; } /* @@ -1209,7 +1210,7 @@ EXPORT_SYMBOL_GPL(hash_page); int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, unsigned long dsisr) { - unsigned long access = _PAGE_PRESENT; + unsigned long access = _PAGE_PRESENT | _PAGE_READ; unsigned long flags = 0; struct mm_struct *mm = current->mm; @@ -1220,7 +1221,7 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, flags |= HPTE_NOHPTE_UPDATE; if (dsisr & DSISR_ISSTORE) - access |= _PAGE_RW; + access |= _PAGE_WRITE; /* * We need to set the _PAGE_USER bit if MSR_PR is set or if we are * accessing a userspace segment (even from the kernel). We assume diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index 92c3c18..b4b6668 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -47,7 +47,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, * a write access */ new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_RW) + if (access & _PAGE_WRITE) new_pmd |= _PAGE_DIRTY; } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd))); diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index 9c05187..cdca743 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -55,7 +55,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, /* Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access */ new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_RW) + if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index d991b9e..0bf269b 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -1003,9 +1003,9 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, end = pte_end; pte = READ_ONCE(*ptep); - mask = _PAGE_PRESENT | _PAGE_USER; + mask = _PAGE_PRESENT | _PAGE_USER | _PAGE_READ; if (write) - mask |= _PAGE_RW; + mask |= _PAGE_WRITE; if ((pte_val(pte) & mask) != mask) return 0; diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index de37ff4..ef7b922 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -177,8 +177,8 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, * _PAGE_PRESENT, but we can be sure that it is not in hpte. * Hence we can use set_pte_at for them. */ - VM_WARN_ON((pte_val(*ptep) & (_PAGE_PRESENT | _PAGE_USER)) == - (_PAGE_PRESENT | _PAGE_USER)); + VM_WARN_ON(pte_present(*ptep) && !pte_protnone(*ptep)); + /* * Add the pte bit when tryint set a pte */ diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 494fc11..16bc751 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -277,7 +277,7 @@ void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, void *caller = __builtin_return_address(0); /* writeable implies dirty for kernel addresses */ - if (flags & _PAGE_RW) + if (flags & _PAGE_WRITE) flags |= _PAGE_DIRTY; /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ @@ -676,8 +676,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { #ifdef CONFIG_DEBUG_VM - WARN_ON((pmd_val(*pmdp) & (_PAGE_PRESENT | _PAGE_USER)) == - (_PAGE_PRESENT | _PAGE_USER)); + WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); assert_spin_locked(&mm->page_table_lock); WARN_ON(!pmd_trans_huge(pmd)); #endif diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index 3ede04f..3cbe38f 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -197,7 +197,7 @@ static int __spu_trap_data_map(struct spu *spu, unsigned long ea, u64 dsisr) (REGION_ID(ea) != USER_REGION_ID)) { spin_unlock(&spu->register_lock); - ret = hash_page(ea, _PAGE_PRESENT, 0x300, dsisr); + ret = hash_page(ea, _PAGE_PRESENT | _PAGE_READ, 0x300, dsisr); spin_lock(&spu->register_lock); if (!ret) { diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c index d98f845..c3a3bf1 100644 --- a/arch/powerpc/platforms/cell/spufs/fault.c +++ b/arch/powerpc/platforms/cell/spufs/fault.c @@ -141,8 +141,8 @@ int spufs_handle_class1(struct spu_context *ctx) /* we must not hold the lock when entering copro_handle_mm_fault */ spu_release(ctx); - access = (_PAGE_PRESENT | _PAGE_USER); - access |= (dsisr & MFC_DSISR_ACCESS_PUT) ? _PAGE_RW : 0UL; + access = (_PAGE_PRESENT | _PAGE_READ | _PAGE_USER); + access |= (dsisr & MFC_DSISR_ACCESS_PUT) ? _PAGE_WRITE : 0UL; local_irq_save(flags); ret = hash_page(ea, access, 0x300, dsisr); local_irq_restore(flags); diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index 9a23654..0feeace 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -149,9 +149,9 @@ static void cxl_handle_page_fault(struct cxl_context *ctx, * update_mmu_cache() will not have loaded the hash since current->trap * is not a 0x400 or 0x300, so just call hash_page_mm() here. */ - access = _PAGE_PRESENT; + access = _PAGE_PRESENT | _PAGE_READ; if (dsisr & CXL_PSL_DSISR_An_S) - access |= _PAGE_RW; + access |= _PAGE_WRITE; if ((!ctx->kernel) || (REGION_ID(dar) == USER_REGION_ID)) access |= _PAGE_USER; -- cgit v0.10.2 From 73a1441a9b2ae05c15eb7cc90d4de379b44249bf Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:31 +1000 Subject: powerpc/mm/subpage: Clear RWX bit to indicate no access Subpage protection used to depend on the _PAGE_USER bit to implement no access mode. This patch switches that to use _PAGE_RWX. We clear Read, Write and Execute access from the pte instead of clearing _PAGE_USER now. This was done so that we can switch to _PAGE_PRIVILEGED in a later patch. subpage_protection() returns pte bits that need to be cleared. Instead of updating the interface to handle no-access in a separate way, it appears simpler to clear RWX acecss to indicate no access. We still don't insert hash ptes for no access implied by !_PAGE_RWX. Hence we should not get PROT_FAULT with change. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index ed190e4..36e0037 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -921,7 +921,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) * Userspace sets the subpage permissions using the subpage_prot system call. * * Result is 0: full permissions, _PAGE_RW: read-only, - * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access. + * _PAGE_RWX: no access. */ static int subpage_protection(struct mm_struct *mm, unsigned long ea) { @@ -947,8 +947,13 @@ static int subpage_protection(struct mm_struct *mm, unsigned long ea) /* extract 2-bit bitfield for this 4k subpage */ spp >>= 30 - 2 * ((ea >> 12) & 0xf); - /* turn 0,1,2,3 into combination of _PAGE_USER and _PAGE_RW */ - spp = ((spp & 2) ? _PAGE_USER : 0) | ((spp & 1) ? _PAGE_RW : 0); + /* + * 0 -> full premission + * 1 -> Read only + * 2 -> no access. + * We return the flag that need to be cleared. + */ + spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? _PAGE_WRITE : 0); return spp; } -- cgit v0.10.2 From 7e1e63c5e9d694c1198d410a6c7b2d0fa06da355 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 29 Apr 2016 23:25:32 +1000 Subject: powerpc/mm: Convert pte_user() to static inline In a subsequent patch we want to add a second definition of pte_user(). Before we do that, make the signature clear, ie. it takes a pte_t and returns bool. We move it up inside the existing #ifndef __ASSEMBLY__ block, but otherwise it's a straight conversion. Convert the call in settlbcam(), which passes an unsigned long, to pass a pte_t. Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 9f5dea5..5f45753 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -76,6 +76,16 @@ */ #ifndef __ASSEMBLY__ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); + +/* + * Don't just check for any non zero bits in __PAGE_USER, since for book3e + * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in + * _PAGE_USER. Need to explicitly match _PAGE_BAP_UR bit in that case too. + */ +static inline bool pte_user(pte_t pte) +{ + return (pte_val(pte) & _PAGE_USER) == _PAGE_USER; +} #endif /* __ASSEMBLY__ */ /* Location of the PFN in the PTE. Most 32-bit platforms use the same @@ -184,13 +194,6 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); /* Make modules code happy. We don't set RO yet */ #define PAGE_KERNEL_EXEC PAGE_KERNEL_X -/* - * Don't just check for any non zero bits in __PAGE_USER, since for book3e - * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in - * _PAGE_USER. Need to explicitly match _PAGE_BAP_UR bit in that case too. - */ -#define pte_user(val) ((val & _PAGE_USER) == _PAGE_USER) - /* Advertise special mapping type for AGP */ #define PAGE_AGP (PAGE_KERNEL_NC) #define HAVE_PAGE_AGP diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index a1b2713..139dec4 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -135,7 +135,7 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys, TLBCAM[index].MAS7 = (u64)phys >> 32; /* Below is unlikely -- only for large user pages or similar */ - if (pte_user(flags)) { + if (pte_user(__pte(flags))) { TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0); } -- cgit v0.10.2 From e7bfc462d32fc417d3fea8ad07b62b59f000e925 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:33 +1000 Subject: powerpc/mm: Use pte_user() instead of open coding We have a common declaration in pte-common.h Add a book3s specific one and switch to pte_user() in callchain.c. In a subsequent patch we will switch _PAGE_USER to _PAGE_PRIVILEGED in the book3s version only. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 77d3ce0..60e8426 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -185,6 +185,11 @@ extern struct page *pgd_page(pgd_t pgd); #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE }) #define __swp_entry_to_pte(x) __pte((x).val | _PAGE_PTE) +static inline bool pte_user(pte_t pte) +{ + return !!(pte_val(pte) & _PAGE_USER); +} + #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE)) #else diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index e04a675..0071de7 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -137,7 +137,7 @@ static int read_user_stack_slow(void __user *ptr, void *buf, int nb) offset = addr & ((1UL << shift) - 1); pte = READ_ONCE(*ptep); - if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER)) + if (!pte_present(pte) || !pte_user(pte)) goto err_out; pfn = pte_pfn(pte); if (!page_is_ram(pfn)) -- cgit v0.10.2 From ac29c64089b74d107edb90879e63a2f7a03cd66b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:34 +1000 Subject: powerpc/mm: Replace _PAGE_USER with _PAGE_PRIVILEGED _PAGE_PRIVILEGED means the page can be accessed only by the kernel. This is done to keep pte bits similar to PowerISA 3.0 Radix PTE format. User pages are now marked by clearing _PAGE_PRIVILEGED bit. Previously we allowed the kernel to have a privileged page in the lower address range (USER_REGION). With this patch such access is denied. We also prevent a kernel access to a non-privileged page in higher address range (ie, REGION_ID != 0). Both the above access scenarios should never happen. Cc: Arnd Bergmann Cc: Jeremy Kerr Cc: Frederic Barrat Acked-by: Ian Munsie Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index fb76f9c..5e7e74d 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -20,7 +20,7 @@ #define _PAGE_READ 0x00004 /* read access allowed */ #define _PAGE_RW (_PAGE_READ | _PAGE_WRITE) #define _PAGE_RWX (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC) -#define _PAGE_USER 0x00008 /* page may be accessed by userspace */ +#define _PAGE_PRIVILEGED 0x00008 /* kernel access only */ #define _PAGE_GUARDED 0x00010 /* G: guarded (side-effect) page */ /* M (memory coherence) is always set in the HPTE, so we don't need it here */ #define _PAGE_COHERENT 0x0 @@ -114,10 +114,13 @@ #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN #endif /* CONFIG_PPC_MM_SLICES */ -/* No separate kernel read-only */ -#define _PAGE_KERNEL_RW (_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */ +/* + * No separate kernel read-only, user access blocked by key + */ +#define _PAGE_KERNEL_RW (_PAGE_PRIVILEGED | _PAGE_RW | _PAGE_DIRTY) #define _PAGE_KERNEL_RO _PAGE_KERNEL_RW -#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC) +#define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | \ + _PAGE_RW | _PAGE_EXEC) /* Strong Access Ordering */ #define _PAGE_SAO (_PAGE_WRITETHRU | _PAGE_NO_CACHE | _PAGE_COHERENT) @@ -147,7 +150,7 @@ */ #define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ _PAGE_WRITETHRU | _PAGE_4K_PFN | \ - _PAGE_USER | _PAGE_ACCESSED | _PAGE_READ |\ + _PAGE_PRIVILEGED | _PAGE_ACCESSED | _PAGE_READ |\ _PAGE_WRITE | _PAGE_DIRTY | _PAGE_EXEC | \ _PAGE_SOFT_DIRTY) /* @@ -169,16 +172,13 @@ * * Note due to the way vm flags are laid out, the bits are XWR */ -#define PAGE_NONE __pgprot(_PAGE_BASE) -#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) -#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | \ - _PAGE_EXEC) -#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_READ) -#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_READ| \ - _PAGE_EXEC) -#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_READ) -#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_READ| \ - _PAGE_EXEC) +#define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_PRIVILEGED) +#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_RW) +#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_RW | _PAGE_EXEC) +#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_READ) +#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC) +#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_READ) +#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC) #define __P000 PAGE_NONE #define __P001 PAGE_READONLY @@ -419,8 +419,8 @@ static inline pte_t pte_clear_soft_dirty(pte_t pte) */ static inline int pte_protnone(pte_t pte) { - return (pte_val(pte) & - (_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT; + return (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PRIVILEGED)) == + (_PAGE_PRESENT | _PAGE_PRIVILEGED); } #endif /* CONFIG_NUMA_BALANCING */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 60e8426..b609729 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -187,7 +187,7 @@ extern struct page *pgd_page(pgd_t pgd); static inline bool pte_user(pte_t pte) { - return !!(pte_val(pte) & _PAGE_USER); + return !(pte_val(pte) & _PAGE_PRIVILEGED); } #ifdef CONFIG_MEM_SOFT_DIRTY @@ -211,6 +211,22 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) } #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ +static inline bool check_pte_access(unsigned long access, unsigned long ptev) +{ + /* + * This check for _PAGE_RWX and _PAGE_PRESENT bits + */ + if (access & ~ptev) + return false; + /* + * This check for access to privilege space + */ + if ((access & _PAGE_PRIVILEGED) != (ptev & _PAGE_PRIVILEGED)) + return false; + + return true; +} + void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); void pgtable_cache_init(void); diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c index 491b7d1..529e492 100644 --- a/arch/powerpc/mm/hash64_4k.c +++ b/arch/powerpc/mm/hash64_4k.c @@ -37,7 +37,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, if (unlikely(old_pte & _PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ - if (unlikely(access & ~old_pte)) + if (unlikely(!check_pte_access(access, old_pte))) return 1; /* * Try to lock the PTE, add ACCESSED and DIRTY if it was diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c index 2d34721..e778286 100644 --- a/arch/powerpc/mm/hash64_64k.c +++ b/arch/powerpc/mm/hash64_64k.c @@ -69,7 +69,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, if (unlikely(old_pte & _PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ - if (unlikely(access & ~old_pte)) + if (unlikely(!check_pte_access(access, old_pte))) return 1; /* * Try to lock the PTE, add ACCESSED and DIRTY if it was @@ -237,7 +237,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access, if (unlikely(old_pte & _PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ - if (unlikely(access & ~old_pte)) + if (unlikely(!check_pte_access(access, old_pte))) return 1; /* * Check if PTE has the cache-inhibit bit set diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 36e0037..dc0f6a0 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -174,7 +174,7 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags) * User area is mapped with PP=0x2 for read/write * or PP=0x3 for read-only (including writeable but clean pages). */ - if (pteflags & _PAGE_USER) { + if (!(pteflags & _PAGE_PRIVILEGED)) { if (pteflags & _PAGE_RWX) rflags |= 0x2; if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY))) @@ -1090,7 +1090,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, /* Pre-check access permissions (will be re-checked atomically * in __hash_page_XX but this pre-check is a fast path */ - if (access & ~pte_val(*ptep)) { + if (!check_pte_access(access, pte_val(*ptep))) { DBG_LOW(" no access !\n"); rc = 1; goto bail; @@ -1228,12 +1228,16 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, if (dsisr & DSISR_ISSTORE) access |= _PAGE_WRITE; /* - * We need to set the _PAGE_USER bit if MSR_PR is set or if we are - * accessing a userspace segment (even from the kernel). We assume - * kernel addresses always have the high bit set. + * We set _PAGE_PRIVILEGED only when + * kernel mode access kernel space. + * + * _PAGE_PRIVILEGED is NOT set + * 1) when kernel mode access user space + * 2) user space access kernel space. */ + access |= _PAGE_PRIVILEGED; if ((msr & MSR_PR) || (REGION_ID(ea) == USER_REGION_ID)) - access |= _PAGE_USER; + access &= ~_PAGE_PRIVILEGED; if (trap == 0x400) access |= _PAGE_EXEC; diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index b4b6668..6cb6bdd 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -40,7 +40,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, if (unlikely(old_pmd & _PAGE_BUSY)) return 0; /* If PMD permissions don't match, take page fault */ - if (unlikely(access & ~old_pmd)) + if (unlikely(!check_pte_access(access, old_pmd))) return 1; /* * Try to lock the PTE, add ACCESSED and DIRTY if it was diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index cdca743..bf90784 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -50,8 +50,9 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, if (unlikely(old_pte & _PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ - if (unlikely(access & ~old_pte)) + if (unlikely(!check_pte_access(access, old_pte))) return 1; + /* Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access */ new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 0bf269b..6d91096 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -1003,7 +1003,7 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, end = pte_end; pte = READ_ONCE(*ptep); - mask = _PAGE_PRESENT | _PAGE_USER | _PAGE_READ; + mask = _PAGE_PRESENT | _PAGE_READ; if (write) mask |= _PAGE_WRITE; diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index ef7b922..125fb4b 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -43,9 +43,20 @@ static inline int is_exec_fault(void) */ static inline int pte_looks_normal(pte_t pte) { + +#if defined(CONFIG_PPC_BOOK3S_64) + if ((pte_val(pte) & + (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE)) == + _PAGE_PRESENT) { + if (pte_user(pte)) + return 1; + } + return 0; +#else return (pte_val(pte) & - (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) == - (_PAGE_PRESENT | _PAGE_USER); + (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) == + (_PAGE_PRESENT | _PAGE_USER); +#endif } static struct page *maybe_pte_to_page(pte_t pte) diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 16bc751..603db71 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -280,8 +280,17 @@ void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, if (flags & _PAGE_WRITE) flags |= _PAGE_DIRTY; - /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ - flags &= ~(_PAGE_USER | _PAGE_EXEC); + /* we don't want to let _PAGE_EXEC leak out */ + flags &= ~_PAGE_EXEC; + /* + * Force kernel mapping. + */ +#if defined(CONFIG_PPC_BOOK3S_64) + flags |= _PAGE_PRIVILEGED; +#else + flags &= ~_PAGE_USER; +#endif + #ifdef _PAGE_BAP_SR /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format @@ -664,7 +673,7 @@ void pmdp_huge_split_prepare(struct vm_area_struct *vma, * the translation is still valid, because we will withdraw * pgtable_t after this. */ - pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_USER, 0); + pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED); } diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c index c3a3bf1..e29e4d5 100644 --- a/arch/powerpc/platforms/cell/spufs/fault.c +++ b/arch/powerpc/platforms/cell/spufs/fault.c @@ -141,7 +141,7 @@ int spufs_handle_class1(struct spu_context *ctx) /* we must not hold the lock when entering copro_handle_mm_fault */ spu_release(ctx); - access = (_PAGE_PRESENT | _PAGE_READ | _PAGE_USER); + access = (_PAGE_PRESENT | _PAGE_READ); access |= (dsisr & MFC_DSISR_ACCESS_PUT) ? _PAGE_WRITE : 0UL; local_irq_save(flags); ret = hash_page(ea, access, 0x300, dsisr); diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index 0feeace..377e650 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -152,8 +152,10 @@ static void cxl_handle_page_fault(struct cxl_context *ctx, access = _PAGE_PRESENT | _PAGE_READ; if (dsisr & CXL_PSL_DSISR_An_S) access |= _PAGE_WRITE; + + access |= _PAGE_PRIVILEGED; if ((!ctx->kernel) || (REGION_ID(dar) == USER_REGION_ID)) - access |= _PAGE_USER; + access &= ~_PAGE_PRIVILEGED; if (dsisr & DSISR_NOHPTE) inv_flags |= HPTE_NOHPTE_UPDATE; -- cgit v0.10.2 From 96270b1fc25d527b015c73533119f6c85df2e0ff Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:35 +1000 Subject: powerpc/mm: Remove RPN_SHIFT and RPN_SIZE PTE_RPN_SHIFT is actually page size dependent. Even though PowerISA 3.0 expects only the lower 12 bits to be zero, we will always find the pages to be PAGE_SHIFT aligned. In case of hash config, this also allows us to use the additional 3 bits to track pte specific information. We need to make sure we use these bits only for hash specific pte flags. For both 4K and 64K config, pte now can hold 57 bits address. Inorder to keep things simpler, drop PTE_RPN_SHIFT and PTE_RPN_SIZE and specify the 57 bit detail explicitly. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 5f08a08..772850e 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -51,10 +51,6 @@ #define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | \ _PAGE_F_SECOND | _PAGE_F_GIX) -/* shift to put page number into pte */ -#define PTE_RPN_SHIFT (12) -#define PTE_RPN_SIZE (45) /* gives 57-bit real addresses */ - #define _PAGE_4K_PFN 0 #ifndef __ASSEMBLY__ /* diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index f465050..ecc7ce0 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -40,15 +40,6 @@ /* PTE flags to conserve for HPTE identification */ #define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_F_SECOND | \ _PAGE_F_GIX | _PAGE_HASHPTE | _PAGE_COMBO) - -/* Shift to put page number into pte. - * - * That gives us a max RPN of 41 bits, which means a max of 57 bits - * of addressable physical space, or 53 bits for the special 4k PFNs. - */ -#define PTE_RPN_SHIFT (16) -#define PTE_RPN_SIZE (41) - /* * we support 16 fragments per PTE page of 64K size. */ @@ -68,6 +59,7 @@ #define PGD_MASKED_BITS 0xc0000000000000ffUL #ifndef __ASSEMBLY__ +#include /* * With 64K pages on hash table, we have a special PTE format that @@ -124,10 +116,18 @@ extern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index); #define pte_pagesize_index(mm, addr, pte) \ (((pte) & _PAGE_COMBO)? MMU_PAGE_4K: MMU_PAGE_64K) -#define remap_4k_pfn(vma, addr, pfn, prot) \ - (WARN_ON(((pfn) >= (1UL << PTE_RPN_SIZE))) ? -EINVAL : \ - remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, \ - __pgprot(pgprot_val((prot)) | _PAGE_4K_PFN))) +extern int remap_pfn_range(struct vm_area_struct *, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t); +static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t prot) +{ + if (pfn > (PTE_RPN_MASK >> PAGE_SHIFT)) { + WARN(1, "remap_4k_pfn called with wrong pfn value\n"); + return -EINVAL; + } + return remap_pfn_range(vma, addr, pfn, PAGE_SIZE, + __pgprot(pgprot_val(prot) | _PAGE_4K_PFN)); +} #define PTE_TABLE_SIZE PTE_FRAG_SIZE #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 5e7e74d..e1abe39 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -49,7 +49,11 @@ * page, since THP huge page also need to track real subpage details */ #define _PAGE_THP_HUGE _PAGE_4K_PFN - +/* + * We support 57 bit real address in pte. Clear everything above 57, and + * every thing below PAGE_SHIFT; + */ +#define PTE_RPN_MASK (((1UL << 57) - 1) & (PAGE_MASK)) /* * set of bits not changed in pmd_modify. */ @@ -134,11 +138,6 @@ #define _PTE_NONE_MASK _PAGE_HPTEFLAGS /* - * The mask convered by the RPN must be a ULL on 32-bit platforms with - * 64-bit PTEs - */ -#define PTE_RPN_MASK (((1UL << PTE_RPN_SIZE) - 1) << PTE_RPN_SHIFT) -/* * _PAGE_CHG_MASK masks of bits that are to be preserved across * pgprot changes */ @@ -437,13 +436,13 @@ static inline int pte_present(pte_t pte) */ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) { - return __pte((((pte_basic_t)(pfn) << PTE_RPN_SHIFT) & PTE_RPN_MASK) | + return __pte((((pte_basic_t)(pfn) << PAGE_SHIFT) & PTE_RPN_MASK) | pgprot_val(pgprot)); } static inline unsigned long pte_pfn(pte_t pte) { - return (pte_val(pte) & PTE_RPN_MASK) >> PTE_RPN_SHIFT; + return (pte_val(pte) & PTE_RPN_MASK) >> PAGE_SHIFT; } /* Generic modifiers for PTE bits */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index b609729..0fac737 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -172,10 +172,10 @@ extern struct page *pgd_page(pgd_t pgd); #define SWP_TYPE_BITS 5 #define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ & ((1UL << SWP_TYPE_BITS) - 1)) -#define __swp_offset(x) (((x).val & PTE_RPN_MASK) >> PTE_RPN_SHIFT) +#define __swp_offset(x) (((x).val & PTE_RPN_MASK) >> PAGE_SHIFT) #define __swp_entry(type, offset) ((swp_entry_t) { \ ((type) << _PAGE_BIT_SWAP_TYPE) \ - | (((offset) << PTE_RPN_SHIFT) & PTE_RPN_MASK)}) + | (((offset) << PAGE_SHIFT) & PTE_RPN_MASK)}) /* * swp_entry_t must be independent of pte bits. We build a swp_entry_t from * swap type and offset we get from swap and convert that to pte to find a diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 603db71..9e0553e 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -757,7 +757,7 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) { unsigned long pmdv; - pmdv = (pfn << PTE_RPN_SHIFT) & PTE_RPN_MASK; + pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; return pmd_set_protbits(__pmd(pmdv), pgprot); } -- cgit v0.10.2 From e58e87adc8bf92e9c6abb1dc1f4af2500aa07ff3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:36 +1000 Subject: powerpc/mm: Update _PAGE_KERNEL_RO PS3 had used a PPP bit hack to implement a read only mapping in the kernel area. Since we are bolting the ioremap area, it used the pte flags _PAGE_PRESENT | _PAGE_USER to get a PPP value of 0x3 there by resulting in a read only mapping. This means the area can be accessed by user space, but kernel will never return such an address to user space. But we can do better by implementing a read only kernel mapping using PPP bits 0b110. This also allows us to do read only kernel mapping for radix in later patches. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index e1abe39..eb22479 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -119,10 +119,10 @@ #endif /* CONFIG_PPC_MM_SLICES */ /* - * No separate kernel read-only, user access blocked by key + * user access blocked by key */ #define _PAGE_KERNEL_RW (_PAGE_PRIVILEGED | _PAGE_RW | _PAGE_DIRTY) -#define _PAGE_KERNEL_RO _PAGE_KERNEL_RW +#define _PAGE_KERNEL_RO (_PAGE_PRIVILEGED | _PAGE_READ) #define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | \ _PAGE_RW | _PAGE_EXEC) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index dc0f6a0..38ed869 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -167,14 +167,19 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags) if ((pteflags & _PAGE_EXEC) == 0) rflags |= HPTE_R_N; /* - * PP bits: + * PPP bits: * Linux uses slb key 0 for kernel and 1 for user. - * kernel areas are mapped with PP=00 - * and there is no kernel RO (_PAGE_KERNEL_RO). - * User area is mapped with PP=0x2 for read/write - * or PP=0x3 for read-only (including writeable but clean pages). + * kernel RW areas are mapped with PPP=0b000 + * User area is mapped with PPP=0b010 for read/write + * or PPP=0b011 for read-only (including writeable but clean pages). */ - if (!(pteflags & _PAGE_PRIVILEGED)) { + if (pteflags & _PAGE_PRIVILEGED) { + /* + * Kernel read only mapped with ppp bits 0b110 + */ + if (!(pteflags & _PAGE_WRITE)) + rflags |= (HPTE_R_PP0 | 0x2); + } else { if (pteflags & _PAGE_RWX) rflags |= 0x2; if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY))) diff --git a/arch/powerpc/platforms/ps3/spu.c b/arch/powerpc/platforms/ps3/spu.c index a0bca05..5e8a40f 100644 --- a/arch/powerpc/platforms/ps3/spu.c +++ b/arch/powerpc/platforms/ps3/spu.c @@ -205,7 +205,7 @@ static void spu_unmap(struct spu *spu) static int __init setup_areas(struct spu *spu) { struct table {char* name; unsigned long addr; unsigned long size;}; - static const unsigned long shadow_flags = _PAGE_NO_CACHE | 3; + unsigned long shadow_flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL_RO)); spu_pdata(spu)->shadow = __ioremap(spu_pdata(spu)->shadow_addr, sizeof(struct spe_shadow), -- cgit v0.10.2 From 72176dd0ad36c6d8e13515d085f7a229a55a2985 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:37 +1000 Subject: powerpc/mm: Use a helper for finding pte bits mapping I/O area Use a helper instead of open coding with constants. A later patch will drop the WIMG bits and use PowerISA 3.0 defines. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index 41c011c..8275858 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -162,7 +162,7 @@ void btext_map(void) offset = ((unsigned long) dispDeviceBase) - base; size = dispDeviceRowBytes * dispDeviceRect[3] + offset + dispDeviceRect[0]; - vbase = __ioremap(base, size, _PAGE_NO_CACHE); + vbase = __ioremap(base, size, pgprot_val(pgprot_noncached_wc(__pgprot(0)))); if (vbase == 0) return; logicalDisplayBase = vbase + offset; diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c index 0f19970..ae13161 100644 --- a/arch/powerpc/kernel/isa-bridge.c +++ b/arch/powerpc/kernel/isa-bridge.c @@ -109,14 +109,14 @@ static void pci_process_ISA_OF_ranges(struct device_node *isa_node, size = 0x10000; __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - size, _PAGE_NO_CACHE|_PAGE_GUARDED); + size, pgprot_val(pgprot_noncached(__pgprot(0)))); return; inval_range: printk(KERN_ERR "no ISA IO ranges or unexpected isa range, " "mapping 64k\n"); __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - 0x10000, _PAGE_NO_CACHE|_PAGE_GUARDED); + 0x10000, pgprot_val(pgprot_noncached(__pgprot(0)))); } diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 60bb187..41503d7 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -159,7 +159,7 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose) /* Establish the mapping */ if (__ioremap_at(phys_page, area->addr, size_page, - _PAGE_NO_CACHE | _PAGE_GUARDED) == NULL) + pgprot_val(pgprot_noncached(__pgprot(0)))) == NULL) return -ENOMEM; /* Fixup hose IO resource */ diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 9e0553e..ea60a24 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -253,7 +253,7 @@ void __iomem * __ioremap(phys_addr_t addr, unsigned long size, void __iomem * ioremap(phys_addr_t addr, unsigned long size) { - unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED; + unsigned long flags = pgprot_val(pgprot_noncached(__pgprot(0))); void *caller = __builtin_return_address(0); if (ppc_md.ioremap) @@ -263,7 +263,7 @@ void __iomem * ioremap(phys_addr_t addr, unsigned long size) void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) { - unsigned long flags = _PAGE_NO_CACHE; + unsigned long flags = pgprot_val(pgprot_noncached_wc(__pgprot(0))); void *caller = __builtin_return_address(0); if (ppc_md.ioremap) diff --git a/arch/powerpc/platforms/ps3/spu.c b/arch/powerpc/platforms/ps3/spu.c index 5e8a40f..492b257 100644 --- a/arch/powerpc/platforms/ps3/spu.c +++ b/arch/powerpc/platforms/ps3/spu.c @@ -216,7 +216,7 @@ static int __init setup_areas(struct spu *spu) } spu->local_store = (__force void *)ioremap_prot(spu->local_store_phys, - LS_SIZE, _PAGE_NO_CACHE); + LS_SIZE, pgprot_val(pgprot_noncached_wc(__pgprot(0)))); if (!spu->local_store) { pr_debug("%s:%d: ioremap local_store failed\n", diff --git a/drivers/pcmcia/electra_cf.c b/drivers/pcmcia/electra_cf.c index 61cf61a..4d7bc3f 100644 --- a/drivers/pcmcia/electra_cf.c +++ b/drivers/pcmcia/electra_cf.c @@ -228,7 +228,7 @@ static int electra_cf_probe(struct platform_device *ofdev) if (!cf->mem_base || !cf->io_virt || !cf->gpio_base || (__ioremap_at(io.start, cf->io_virt, cf->io_size, - _PAGE_NO_CACHE | _PAGE_GUARDED) == NULL)) { + pgprot_val(pgprot_noncached(__pgprot(0)))) == NULL)) { dev_err(device, "can't ioremap ranges\n"); status = -ENOMEM; goto fail1; -- cgit v0.10.2 From 30bda41aba4efb2370c97e2cbe7385de93ccc372 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:38 +1000 Subject: powerpc/mm: Drop WIMG in favour of new constants PowerISA 3.0 introduces two pte bits with the below meaning for radix: 00 -> Normal Memory 01 -> Strong Access Order (SAO) 10 -> Non idempotent I/O (Cache inhibited and guarded) 11 -> Tolerant I/O (Cache inhibited) We drop the existing WIMG bits in the Linux page table in favour of the above constants. We loose _PAGE_WRITETHRU with this conversion. We only use writethru via pgprot_cached_wthru() which is used by fbdev/controlfb.c which is Apple control display and also PPC32. With respect to _PAGE_COHERENCE, we have been marking hpte always coherent for some time now. htab_convert_pte_flags() always added HPTE_R_M. NOTE: KVM changes need closer review. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index eb22479..eee951f 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -21,11 +21,9 @@ #define _PAGE_RW (_PAGE_READ | _PAGE_WRITE) #define _PAGE_RWX (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC) #define _PAGE_PRIVILEGED 0x00008 /* kernel access only */ -#define _PAGE_GUARDED 0x00010 /* G: guarded (side-effect) page */ -/* M (memory coherence) is always set in the HPTE, so we don't need it here */ -#define _PAGE_COHERENT 0x0 -#define _PAGE_NO_CACHE 0x00020 /* I: cache inhibit */ -#define _PAGE_WRITETHRU 0x00040 /* W: cache write-through */ +#define _PAGE_SAO 0x00010 /* Strong access order */ +#define _PAGE_NON_IDEMPOTENT 0x00020 /* non idempotent memory */ +#define _PAGE_TOLERANT 0x00030 /* tolerant memory, cache inhibited */ #define _PAGE_DIRTY 0x00080 /* C: page changed */ #define _PAGE_ACCESSED 0x00100 /* R: page referenced */ #define _PAGE_SPECIAL 0x00400 /* software: special page */ @@ -43,7 +41,12 @@ #define _PAGE_HASHPTE (1ul << 61) /* PTE has associated HPTE */ #define _PAGE_PTE (1ul << 62) /* distinguishes PTEs from pointers */ #define _PAGE_PRESENT (1ul << 63) /* pte contains a translation */ - +/* + * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE + * Instead of fixing all of them, add an alternate define which + * maps CI pte mapping. + */ +#define _PAGE_NO_CACHE _PAGE_TOLERANT /* * We need to differentiate between explicit huge page and THP huge * page, since THP huge page also need to track real subpage details @@ -126,9 +129,6 @@ #define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | \ _PAGE_RW | _PAGE_EXEC) -/* Strong Access Ordering */ -#define _PAGE_SAO (_PAGE_WRITETHRU | _PAGE_NO_CACHE | _PAGE_COHERENT) - /* No page size encoding in the linux PTE */ #define _PAGE_PSIZE 0 @@ -147,10 +147,9 @@ /* * Mask of bits returned by pte_pgprot() */ -#define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ - _PAGE_WRITETHRU | _PAGE_4K_PFN | \ - _PAGE_PRIVILEGED | _PAGE_ACCESSED | _PAGE_READ |\ - _PAGE_WRITE | _PAGE_DIRTY | _PAGE_EXEC | \ +#define PAGE_PROT_BITS (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT | \ + _PAGE_4K_PFN | _PAGE_PRIVILEGED | _PAGE_ACCESSED | \ + _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_EXEC | \ _PAGE_SOFT_DIRTY) /* * We define 2 sets of base prot bits, one for basic pages (ie, @@ -159,7 +158,7 @@ * the processor might need it for DMA coherency. */ #define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) -#define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) +#define _PAGE_BASE (_PAGE_BASE_NC) /* Permission masks used to generate the __P and __S table, * @@ -200,9 +199,9 @@ /* Permission masks used for kernel mappings */ #define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) #define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ - _PAGE_NO_CACHE) + _PAGE_TOLERANT) #define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ - _PAGE_NO_CACHE | _PAGE_GUARDED) + _PAGE_NON_IDEMPOTENT) #define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX) #define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) #define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) @@ -509,45 +508,26 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, *ptep = pte; } -/* - * Macro to mark a page protection value as "uncacheable". - */ - -#define _PAGE_CACHE_CTL (_PAGE_COHERENT | _PAGE_GUARDED | _PAGE_NO_CACHE | \ - _PAGE_WRITETHRU) +#define _PAGE_CACHE_CTL (_PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT) #define pgprot_noncached pgprot_noncached static inline pgprot_t pgprot_noncached(pgprot_t prot) { return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | - _PAGE_NO_CACHE | _PAGE_GUARDED); + _PAGE_NON_IDEMPOTENT); } #define pgprot_noncached_wc pgprot_noncached_wc static inline pgprot_t pgprot_noncached_wc(pgprot_t prot) { return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | - _PAGE_NO_CACHE); + _PAGE_TOLERANT); } #define pgprot_cached pgprot_cached static inline pgprot_t pgprot_cached(pgprot_t prot) { - return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | - _PAGE_COHERENT); -} - -#define pgprot_cached_wthru pgprot_cached_wthru -static inline pgprot_t pgprot_cached_wthru(pgprot_t prot) -{ - return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | - _PAGE_COHERENT | _PAGE_WRITETHRU); -} - -#define pgprot_cached_noncoherent pgprot_cached_noncoherent -static inline pgprot_t pgprot_cached_noncoherent(pgprot_t prot) -{ - return __pgprot(pgprot_val(prot) & ~_PAGE_CACHE_CTL); + return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL)); } #define pgprot_writecombine pgprot_writecombine @@ -555,6 +535,18 @@ static inline pgprot_t pgprot_writecombine(pgprot_t prot) { return pgprot_noncached_wc(prot); } +/* + * check a pte mapping have cache inhibited property + */ +static inline bool pte_ci(pte_t pte) +{ + unsigned long pte_v = pte_val(pte); + + if (((pte_v & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) || + ((pte_v & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)) + return true; + return false; +} #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 42da513..3c380c2 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -276,19 +276,24 @@ static inline unsigned long hpte_make_readonly(unsigned long ptel) return ptel; } -static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type) +static inline bool hpte_cache_flags_ok(unsigned long hptel, bool is_ci) { - unsigned int wimg = ptel & HPTE_R_WIMG; + unsigned int wimg = hptel & HPTE_R_WIMG; /* Handle SAO */ if (wimg == (HPTE_R_W | HPTE_R_I | HPTE_R_M) && cpu_has_feature(CPU_FTR_ARCH_206)) wimg = HPTE_R_M; - if (!io_type) + if (!is_ci) return wimg == HPTE_R_M; - - return (wimg & (HPTE_R_W | HPTE_R_I)) == io_type; + /* + * if host is mapped cache inhibited, make sure hptel also have + * cache inhibited. + */ + if (wimg & HPTE_R_W) /* FIXME!! is this ok for all guest. ? */ + return false; + return !!(wimg & HPTE_R_I); } /* @@ -325,18 +330,6 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing) return new_pte; } - -/* Return HPTE cache control bits corresponding to Linux pte bits */ -static inline unsigned long hpte_cache_bits(unsigned long pte_val) -{ -#if _PAGE_NO_CACHE == HPTE_R_I && _PAGE_WRITETHRU == HPTE_R_W - return pte_val & (HPTE_R_W | HPTE_R_I); -#else - return ((pte_val & _PAGE_NO_CACHE) ? HPTE_R_I : 0) + - ((pte_val & _PAGE_WRITETHRU) ? HPTE_R_W : 0); -#endif -} - static inline bool hpte_read_permission(unsigned long pp, unsigned long key) { if (key) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index c7b78d8..05f09ae 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -447,7 +447,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, struct revmap_entry *rev; struct page *page, *pages[1]; long index, ret, npages; - unsigned long is_io; + bool is_ci; unsigned int writing, write_ok; struct vm_area_struct *vma; unsigned long rcbits; @@ -503,7 +503,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, smp_rmb(); ret = -EFAULT; - is_io = 0; + is_ci = false; pfn = 0; page = NULL; pte_size = PAGE_SIZE; @@ -521,7 +521,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, pfn = vma->vm_pgoff + ((hva - vma->vm_start) >> PAGE_SHIFT); pte_size = psize; - is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot)); + is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot)))); write_ok = vma->vm_flags & VM_WRITE; } up_read(¤t->mm->mmap_sem); @@ -558,10 +558,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, goto out_put; /* Check WIMG vs. the actual page we're accessing */ - if (!hpte_cache_flags_ok(r, is_io)) { - if (is_io) + if (!hpte_cache_flags_ok(r, is_ci)) { + if (is_ci) goto out_put; - /* * Allow guest to map emulated device memory as * uncacheable, but actually make it cacheable. diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 4cb8db0..99b4e9d 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -175,7 +175,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, unsigned long g_ptel; struct kvm_memory_slot *memslot; unsigned hpage_shift; - unsigned long is_io; + bool is_ci; unsigned long *rmap; pte_t *ptep; unsigned int writing; @@ -199,7 +199,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, gfn = gpa >> PAGE_SHIFT; memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); pa = 0; - is_io = ~0ul; + is_ci = false; rmap = NULL; if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) { /* Emulated MMIO - mark this with key=31 */ @@ -250,7 +250,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, if (writing && !pte_write(pte)) /* make the actual HPTE be read-only */ ptel = hpte_make_readonly(ptel); - is_io = hpte_cache_bits(pte_val(pte)); + is_ci = pte_ci(pte); pa = pte_pfn(pte) << PAGE_SHIFT; pa |= hva & (host_pte_size - 1); pa |= gpa & ~PAGE_MASK; @@ -267,9 +267,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, else pteh |= HPTE_V_ABSENT; - /* Check WIMG */ - if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) { - if (is_io) + /*If we had host pte mapping then Check WIMG */ + if (ptep && !hpte_cache_flags_ok(ptel, is_ci)) { + if (is_ci) return H_PARAMETER; /* * Allow guest to map emulated device memory as diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c index e778286..6753310 100644 --- a/arch/powerpc/mm/hash64_64k.c +++ b/arch/powerpc/mm/hash64_64k.c @@ -244,7 +244,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access, * If so, bail out and refault as a 4k page */ if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) && - unlikely(old_pte & _PAGE_NO_CACHE)) + unlikely(pte_ci(pte))) return 0; /* * Try to lock the PTE, add ACCESSED and DIRTY if it was diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 38ed869..eb928d8 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -192,12 +192,13 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags) /* * Add in WIG bits */ - if (pteflags & _PAGE_WRITETHRU) - rflags |= HPTE_R_W; - if (pteflags & _PAGE_NO_CACHE) + + if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) rflags |= HPTE_R_I; - if (pteflags & _PAGE_GUARDED) - rflags |= HPTE_R_G; + if ((pteflags & _PAGE_CACHE_CTL ) == _PAGE_NON_IDEMPOTENT) + rflags |= (HPTE_R_I | HPTE_R_G); + if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO) + rflags |= (HPTE_R_I | HPTE_R_W); return rflags; } @@ -1142,8 +1143,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, /* If this PTE is non-cacheable and we have restrictions on * using non cacheable large pages, then we switch to 4k */ - if (mmu_ci_restrictions && psize == MMU_PAGE_64K && - (pte_val(*ptep) & _PAGE_NO_CACHE)) { + if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) { if (user_region) { demote_segment_4k(mm, ea); psize = MMU_PAGE_4K; @@ -1297,13 +1297,13 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, WARN_ON(hugepage_shift); #ifdef CONFIG_PPC_64K_PAGES - /* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on + /* If either _PAGE_4K_PFN or cache inhibited is set (and we are on * a 64K kernel), then we don't preload, hash_page() will take * care of it once we actually try to access the page. * That way we don't have to duplicate all of the logic for segment * page size demotion here */ - if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE)) + if ((pte_val(*ptep) & _PAGE_4K_PFN) || pte_ci(*ptep)) goto out_exit; #endif /* CONFIG_PPC_64K_PAGES */ diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 125fb4b..db277b6 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -38,16 +38,16 @@ static inline int is_exec_fault(void) /* We only try to do i/d cache coherency on stuff that looks like * reasonably "normal" PTEs. We currently require a PTE to be present - * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that + * and we avoid _PAGE_SPECIAL and cache inhibited pte. We also only do that * on userspace PTEs */ static inline int pte_looks_normal(pte_t pte) { #if defined(CONFIG_PPC_BOOK3S_64) - if ((pte_val(pte) & - (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE)) == - _PAGE_PRESENT) { + if ((pte_val(pte) & (_PAGE_PRESENT | _PAGE_SPECIAL)) == _PAGE_PRESENT) { + if (pte_ci(pte)) + return 0; if (pte_user(pte)) return 1; } diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index ea60a24..b3040b4 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -167,10 +167,6 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, if ((flags & _PAGE_PRESENT) == 0) flags |= pgprot_val(PAGE_KERNEL); - /* Non-cacheable page cannot be coherent */ - if (flags & _PAGE_NO_CACHE) - flags &= ~_PAGE_COHERENT; - /* We don't support the 4K PFN hack with ioremap */ if (flags & _PAGE_4K_PFN) return NULL; diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 2415a0d..0d46089 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -152,10 +152,6 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, /* Exact = 0 */ flags = 0; - /* Make pHyp happy */ - if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU)) - hpte_r &= ~HPTE_R_M; - if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N)) flags |= H_COALESCE_CAND; -- cgit v0.10.2 From ff844b741e1db87a72aabb5f61f6e0ca82bfe100 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:39 +1000 Subject: powerpc/mm: Use generic version of pmdp_clear_flush_young() The radix variant is going to require a flush_pmd_tlb_range(). With flush_pmd_tlb_range() added, pmdp_clear_flush_young() is the same as the generic version. So drop the powerpc specific variant. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 0fac737..4745d31 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -298,9 +298,6 @@ extern int pmdp_set_access_flags(struct vm_area_struct *vma, #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); -#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH -extern int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index b3040b4..177e489 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -588,22 +588,15 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, return pmd; } -int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); -} - /* * We currently remove entries from the hashtable regardless of whether - * the entry was young or dirty. The generic routines only flush if the - * entry was young or dirty which is not good enough. + * the entry was young or dirty. * * We should be more intelligent about this but for the moment we override * these functions and force a tlb flush unconditionally */ -int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); } -- cgit v0.10.2 From 33d336d986edc544cd153e31a28a515d5648721b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:40 +1000 Subject: powerpc/mm: Use generic version of ptep_clear_flush_young() The radix variant is going to require a flush_tlb_range(). With flush_tlb_range() added, ptep_clear_flush_young() is the same as the generic version. So drop the powerpc specific variant. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index eee951f..4fffc96 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -275,6 +275,13 @@ static inline unsigned long pte_update(struct mm_struct *mm, return old; } +/* + * We currently remove entries from the hashtable regardless of whether + * the entry was young or dirty. + * + * We should be more intelligent about this but for the moment we override + * these functions and force a tlb flush unconditionally + */ static inline int __ptep_test_and_clear_young(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -313,22 +320,6 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 1); } -/* - * We currently remove entries from the hashtable regardless of whether - * the entry was young or dirty. The generic routines only flush if the - * entry was young or dirty which is not good enough. - * - * We should be more intelligent about this but for the moment we override - * these functions and force a tlb flush unconditionally - */ -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -#define ptep_clear_flush_young(__vma, __address, __ptep) \ -({ \ - int __young = __ptep_test_and_clear_young((__vma)->vm_mm, __address, \ - __ptep); \ - __young; \ -}) - #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -- cgit v0.10.2 From 11a6f6abd74ab80865023200ca33515f6db43d63 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:41 +1000 Subject: powerpc/mm: Move radix/hash common data structures to book3s64 headers Start moving code that is generic between radix and hash to book3s64 specific headers from the book3s64 hash specific one. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index 16f513e..b82e063 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -1,5 +1,5 @@ -#ifndef _ASM_POWERPC_MMU_HASH32_H_ -#define _ASM_POWERPC_MMU_HASH32_H_ +#ifndef _ASM_POWERPC_BOOK3S_32_MMU_HASH_H_ +#define _ASM_POWERPC_BOOK3S_32_MMU_HASH_H_ /* * 32-bit hash table MMU support */ @@ -90,4 +90,4 @@ typedef struct { #define mmu_virtual_psize MMU_PAGE_4K #define mmu_linear_psize MMU_PAGE_256M -#endif /* _ASM_POWERPC_MMU_HASH32_H_ */ +#endif /* _ASM_POWERPC_BOOK3S_32_MMU_HASH_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 0cea480..ce73736b 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -1,5 +1,5 @@ -#ifndef _ASM_POWERPC_MMU_HASH64_H_ -#define _ASM_POWERPC_MMU_HASH64_H_ +#ifndef _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_ +#define _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_ /* * PowerPC64 memory management structures * @@ -127,24 +127,6 @@ extern struct hash_pte *htab_address; extern unsigned long htab_size_bytes; extern unsigned long htab_hash_mask; -/* - * Page size definition - * - * shift : is the "PAGE_SHIFT" value for that page size - * sllp : is a bit mask with the value of SLB L || LP to be or'ed - * directly to a slbmte "vsid" value - * penc : is the HPTE encoding mask for the "LP" field: - * - */ -struct mmu_psize_def -{ - unsigned int shift; /* number of bits */ - int penc[MMU_PAGE_COUNT]; /* HPTE encoding */ - unsigned int tlbiel; /* tlbiel supported for that page size */ - unsigned long avpnm; /* bits to mask out in AVPN in the HPTE */ - unsigned long sllp; /* SLB L||LP (exact mask to use in slbmte) */ -}; -extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; static inline int shift_to_mmu_psize(unsigned int shift) { @@ -210,11 +192,6 @@ static inline int segment_shift(int ssize) /* * The current system page and segment sizes */ -extern int mmu_linear_psize; -extern int mmu_virtual_psize; -extern int mmu_vmalloc_psize; -extern int mmu_vmemmap_psize; -extern int mmu_io_psize; extern int mmu_kernel_ssize; extern int mmu_highuser_ssize; extern u16 mmu_slb_size; @@ -512,38 +489,6 @@ static inline void subpage_prot_free(struct mm_struct *mm) {} static inline void subpage_prot_init_new_context(struct mm_struct *mm) { } #endif /* CONFIG_PPC_SUBPAGE_PROT */ -typedef unsigned long mm_context_id_t; -struct spinlock; - -typedef struct { - mm_context_id_t id; - u16 user_psize; /* page size index */ - -#ifdef CONFIG_PPC_MM_SLICES - u64 low_slices_psize; /* SLB page size encodings */ - unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; -#else - u16 sllp; /* SLB page size encoding */ -#endif - unsigned long vdso_base; -#ifdef CONFIG_PPC_SUBPAGE_PROT - struct subpage_prot_table spt; -#endif /* CONFIG_PPC_SUBPAGE_PROT */ -#ifdef CONFIG_PPC_ICSWX - struct spinlock *cop_lockp; /* guard acop and cop_pid */ - unsigned long acop; /* mask of enabled coprocessor types */ - unsigned int cop_pid; /* pid value used with coprocessors */ -#endif /* CONFIG_PPC_ICSWX */ -#ifdef CONFIG_PPC_64K_PAGES - /* for 4K PTE fragment support */ - void *pte_frag; -#endif -#ifdef CONFIG_SPAPR_TCE_IOMMU - struct list_head iommu_group_mem_list; -#endif -} mm_context_t; - - #if 0 /* * The code below is equivalent to this function for arguments @@ -613,4 +558,4 @@ unsigned htab_shift_for_mem_size(unsigned long mem_size); #endif /* __ASSEMBLY__ */ -#endif /* _ASM_POWERPC_MMU_HASH64_H_ */ +#endif /* _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h new file mode 100644 index 0000000..aadb0bb --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -0,0 +1,72 @@ +#ifndef _ASM_POWERPC_BOOK3S_64_MMU_H_ +#define _ASM_POWERPC_BOOK3S_64_MMU_H_ + +#ifndef __ASSEMBLY__ +/* + * Page size definition + * + * shift : is the "PAGE_SHIFT" value for that page size + * sllp : is a bit mask with the value of SLB L || LP to be or'ed + * directly to a slbmte "vsid" value + * penc : is the HPTE encoding mask for the "LP" field: + * + */ +struct mmu_psize_def { + unsigned int shift; /* number of bits */ + int penc[MMU_PAGE_COUNT]; /* HPTE encoding */ + unsigned int tlbiel; /* tlbiel supported for that page size */ + unsigned long avpnm; /* bits to mask out in AVPN in the HPTE */ + unsigned long sllp; /* SLB L||LP (exact mask to use in slbmte) */ +}; +extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; +#endif /* __ASSEMBLY__ */ + +#ifdef CONFIG_PPC_STD_MMU_64 +/* 64-bit classic hash table MMU */ +#include +#endif + +#ifndef __ASSEMBLY__ + +typedef unsigned long mm_context_id_t; +struct spinlock; + +typedef struct { + mm_context_id_t id; + u16 user_psize; /* page size index */ + +#ifdef CONFIG_PPC_MM_SLICES + u64 low_slices_psize; /* SLB page size encodings */ + unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; +#else + u16 sllp; /* SLB page size encoding */ +#endif + unsigned long vdso_base; +#ifdef CONFIG_PPC_SUBPAGE_PROT + struct subpage_prot_table spt; +#endif /* CONFIG_PPC_SUBPAGE_PROT */ +#ifdef CONFIG_PPC_ICSWX + struct spinlock *cop_lockp; /* guard acop and cop_pid */ + unsigned long acop; /* mask of enabled coprocessor types */ + unsigned int cop_pid; /* pid value used with coprocessors */ +#endif /* CONFIG_PPC_ICSWX */ +#ifdef CONFIG_PPC_64K_PAGES + /* for 4K PTE fragment support */ + void *pte_frag; +#endif +#ifdef CONFIG_SPAPR_TCE_IOMMU + struct list_head iommu_group_mem_list; +#endif +} mm_context_t; + +/* + * The current system page and segment sizes + */ +extern int mmu_linear_psize; +extern int mmu_virtual_psize; +extern int mmu_vmalloc_psize; +extern int mmu_vmemmap_psize; +extern int mmu_io_psize; + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */ diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 8ca1c98..5f55024 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -181,10 +181,13 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr) #define MMU_PAGE_COUNT 15 -#if defined(CONFIG_PPC_STD_MMU_64) -/* 64-bit classic hash table MMU */ -#include -#elif defined(CONFIG_PPC_STD_MMU_32) +#ifdef CONFIG_PPC_BOOK3S_64 +#include +#else /* CONFIG_PPC_BOOK3S_64 */ + +#endif + +#if defined(CONFIG_PPC_STD_MMU_32) /* 32-bit classic hash table MMU */ #include #elif defined(CONFIG_40x) -- cgit v0.10.2 From e99833448c5fa198a120db6d7087dd17149dc273 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:42 +1000 Subject: powerpc/mm/radix: Add partition table format & callback Add structs and #defines related to the radix MMU partition table format. We also add a ppc_md callback for updating a partition table entry. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index aadb0bb..b86786f 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -21,12 +21,39 @@ struct mmu_psize_def { extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; #endif /* __ASSEMBLY__ */ -#ifdef CONFIG_PPC_STD_MMU_64 /* 64-bit classic hash table MMU */ #include -#endif #ifndef __ASSEMBLY__ +/* + * ISA 3.0 partiton and process table entry format + */ +struct prtb_entry { + __be64 prtb0; + __be64 prtb1; +}; +extern struct prtb_entry *process_tb; + +struct patb_entry { + __be64 patb0; + __be64 patb1; +}; +extern struct patb_entry *partition_tb; + +#define PATB_HR (1UL << 63) +#define PATB_GR (1UL << 63) +#define RPDB_MASK 0x0ffffffffffff00fUL +#define RPDB_SHIFT (1UL << 8) +/* + * Limit process table to PAGE_SIZE table. This + * also limit the max pid we can support. + * MAX_USER_CONTEXT * 16 bytes of space. + */ +#define PRTB_SIZE_SHIFT (CONTEXT_BITS + 4) +/* + * Power9 currently only support 64K partition table size. + */ +#define PATB_SIZE_SHIFT 16 typedef unsigned long mm_context_id_t; struct spinlock; diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index fd22442..6bdcd0d 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -256,6 +256,7 @@ struct machdep_calls { #ifdef CONFIG_ARCH_RANDOM int (*get_random_seed)(unsigned long *v); #endif + int (*update_partition_table)(u64); }; extern void e500_idle(void); diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index cf09c6e..47082bc 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -587,6 +587,7 @@ #define SPRN_PIR 0x3FF /* Processor Identification Register */ #endif #define SPRN_TIR 0x1BE /* Thread Identification Register */ +#define SPRN_PTCR 0x1D0 /* Partition table control Register */ #define SPRN_PSPB 0x09F /* Problem State Priority Boost reg */ #define SPRN_PTEHI 0x3D5 /* 981 7450 PTE HI word (S/W TLB load) */ #define SPRN_PTELO 0x3D6 /* 982 7450 PTE LO word (S/W TLB load) */ -- cgit v0.10.2 From 50de596de8be6de75401a2190b90a822e8a1bab2 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:43 +1000 Subject: powerpc/mm/hash: Add support for Power9 Hash PowerISA 3.0 adds a parition table indexed by LPID. Parition table allows us to specify the MMU model that will be used for guest and host translation. This patch adds support with SLB based hash model (UPRT = 0). What is required with this model is to support the new hash page table entry format and also setup partition table such that we use hash table for address translation. We don't have segment table support yet. In order to make sure we don't load KVM module on Power9 (since we don't have kvm support yet) this patch also disables KVM on Power9. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index ce73736b..843b5d8 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -78,6 +78,10 @@ #define HPTE_V_SECONDARY ASM_CONST(0x0000000000000002) #define HPTE_V_VALID ASM_CONST(0x0000000000000001) +/* + * ISA 3.0 have a different HPTE format. + */ +#define HPTE_R_3_0_SSIZE_SHIFT 58 #define HPTE_R_PP0 ASM_CONST(0x8000000000000000) #define HPTE_R_TS ASM_CONST(0x4000000000000000) #define HPTE_R_KEY_HI ASM_CONST(0x3000000000000000) @@ -224,7 +228,8 @@ static inline unsigned long hpte_encode_avpn(unsigned long vpn, int psize, */ v = (vpn >> (23 - VPN_SHIFT)) & ~(mmu_psize_defs[psize].avpnm); v <<= HPTE_V_AVPN_SHIFT; - v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT; + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT; return v; } @@ -248,8 +253,12 @@ static inline unsigned long hpte_encode_v(unsigned long vpn, int base_psize, * aligned for the requested page size */ static inline unsigned long hpte_encode_r(unsigned long pa, int base_psize, - int actual_psize) + int actual_psize, int ssize) { + + if (cpu_has_feature(CPU_FTR_ARCH_300)) + pa |= ((unsigned long) ssize) << HPTE_R_3_0_SSIZE_SHIFT; + /* A 4K page needs no special encoding */ if (actual_psize == MMU_PAGE_4K) return pa & HPTE_R_RPN; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 84fb4fc..4cd37b4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3271,6 +3271,12 @@ static int kvmppc_core_check_processor_compat_hv(void) if (!cpu_has_feature(CPU_FTR_HVMODE) || !cpu_has_feature(CPU_FTR_ARCH_206)) return -EIO; + /* + * Disable KVM for Power9, untill the required bits merged. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return -EIO; + return 0; } diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 95bceca..ffbaf40 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1683,7 +1683,11 @@ static void kvmppc_core_destroy_vm_pr(struct kvm *kvm) static int kvmppc_core_check_processor_compat_pr(void) { - /* we are always compatible */ + /* + * Disable KVM for Power9 untill the required bits merged. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return -EIO; return 0; } diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 8eaac81..d873f65 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -221,7 +221,7 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, return -1; hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; + hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags; if (!(vflags & HPTE_V_BOLTED)) { DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n", @@ -719,6 +719,12 @@ static void native_flush_hash_range(unsigned long number, int local) local_irq_restore(flags); } +static int native_update_partition_table(u64 patb1) +{ + partition_tb->patb1 = cpu_to_be64(patb1); + return 0; +} + void __init hpte_init_native(void) { ppc_md.hpte_invalidate = native_hpte_invalidate; @@ -729,4 +735,7 @@ void __init hpte_init_native(void) ppc_md.hpte_clear_all = native_hpte_clear; ppc_md.flush_hash_range = native_flush_hash_range; ppc_md.hugepage_invalidate = native_hugepage_invalidate; + + if (cpu_has_feature(CPU_FTR_ARCH_300)) + ppc_md.update_partition_table = native_update_partition_table; } diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index eb928d8..f76a033 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -676,6 +676,41 @@ int remove_section_mapping(unsigned long start, unsigned long end) } #endif /* CONFIG_MEMORY_HOTPLUG */ +static void __init hash_init_partition_table(phys_addr_t hash_table, + unsigned long pteg_count) +{ + unsigned long ps_field; + unsigned long htab_size; + unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; + + /* + * slb llp encoding for the page size used in VPM real mode. + * We can ignore that for lpid 0 + */ + ps_field = 0; + htab_size = __ilog2(pteg_count) - 11; + + BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large."); + partition_tb = __va(memblock_alloc_base(patb_size, patb_size, + MEMBLOCK_ALLOC_ANYWHERE)); + + /* Initialize the Partition Table with no entries */ + memset((void *)partition_tb, 0, patb_size); + partition_tb->patb0 = cpu_to_be64(ps_field | hash_table | htab_size); + /* + * FIXME!! This should be done via update_partition table + * For now UPRT is 0 for us. + */ + partition_tb->patb1 = 0; + DBG("Partition table %p\n", partition_tb); + /* + * update partition table control register, + * 64 K size. + */ + mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + +} + static void __init htab_initialize(void) { unsigned long table; @@ -744,8 +779,11 @@ static void __init htab_initialize(void) /* Initialize the HPT with no entries */ memset((void *)table, 0, htab_size_bytes); - /* Set SDR1 */ - mtspr(SPRN_SDR1, _SDR1); + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + /* Set SDR1 */ + mtspr(SPRN_SDR1, _SDR1); + else + hash_init_partition_table(table, pteg_count); } prot = pgprot_val(PAGE_KERNEL); diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 177e489..8f3b294 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -69,6 +69,13 @@ #endif #endif +#ifdef CONFIG_PPC_BOOK3S_64 +/* + * partition table and process table for ISA 3.0 + */ +struct prtb_entry *process_tb; +struct patb_entry *partition_tb; +#endif unsigned long ioremap_bot = IOREMAP_BASE; #ifdef CONFIG_PPC_MMU_NOHASH diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c index 2f95d33..c9a3e67 100644 --- a/arch/powerpc/platforms/ps3/htab.c +++ b/arch/powerpc/platforms/ps3/htab.c @@ -63,7 +63,7 @@ static long ps3_hpte_insert(unsigned long hpte_group, unsigned long vpn, vflags &= ~HPTE_V_SECONDARY; hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize) | rflags; + hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize, ssize) | rflags; spin_lock_irqsave(&ps3_htab_lock, flags); diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 0d46089..e4ceba2 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -139,7 +139,7 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, hpte_group, vpn, pa, rflags, vflags, psize); hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; + hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags; if (!(vflags & HPTE_V_BOLTED)) pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r); -- cgit v0.10.2 From eee24b5aafe87f3591990528eae2ad22e2c1d50c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:44 +1000 Subject: powerpc/mm: Move hash and no hash code to separate files This patch reduces the number of #ifdefs in C code and will also help in adding radix changes later. Only code movement in this patch. Signed-off-by: Aneesh Kumar K.V [mpe: Propagate copyrights and update GPL text] Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index adfee3f..ef77899 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -13,7 +13,8 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ tlb_nohash_low.o obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o -obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o slb_low.o slb.o $(hash64-y) +obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o +obj-$(CONFIG_PPC_STD_MMU_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(CONFIG_WORD_SIZE).o \ mmu_context_hash$(CONFIG_WORD_SIZE).o diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index ba65566..8d1daf7 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -189,75 +189,6 @@ static int __meminit vmemmap_populated(unsigned long start, int page_size) return 0; } -/* On hash-based CPUs, the vmemmap is bolted in the hash table. - * - * On Book3E CPUs, the vmemmap is currently mapped in the top half of - * the vmalloc space using normal page tables, though the size of - * pages encoded in the PTEs can be different - */ - -#ifdef CONFIG_PPC_BOOK3E -static int __meminit vmemmap_create_mapping(unsigned long start, - unsigned long page_size, - unsigned long phys) -{ - /* Create a PTE encoding without page size */ - unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | - _PAGE_KERNEL_RW; - - /* PTEs only contain page size encodings up to 32M */ - BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); - - /* Encode the size in the PTE */ - flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; - - /* For each PTE for that area, map things. Note that we don't - * increment phys because all PTEs are of the large size and - * thus must have the low bits clear - */ - for (i = 0; i < page_size; i += PAGE_SIZE) - BUG_ON(map_kernel_page(start + i, phys, flags)); - - return 0; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -static void vmemmap_remove_mapping(unsigned long start, - unsigned long page_size) -{ -} -#endif -#else /* CONFIG_PPC_BOOK3E */ -static int __meminit vmemmap_create_mapping(unsigned long start, - unsigned long page_size, - unsigned long phys) -{ - int rc = htab_bolt_mapping(start, start + page_size, phys, - pgprot_val(PAGE_KERNEL), - mmu_vmemmap_psize, mmu_kernel_ssize); - if (rc < 0) { - int rc2 = htab_remove_mapping(start, start + page_size, - mmu_vmemmap_psize, - mmu_kernel_ssize); - BUG_ON(rc2 && (rc2 != -ENOENT)); - } - return rc; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -static void vmemmap_remove_mapping(unsigned long start, - unsigned long page_size) -{ - int rc = htab_remove_mapping(start, start + page_size, - mmu_vmemmap_psize, - mmu_kernel_ssize); - BUG_ON((rc < 0) && (rc != -ENOENT)); - WARN_ON(rc == -ENOENT); -} -#endif - -#endif /* CONFIG_PPC_BOOK3E */ - struct vmemmap_backing *vmemmap_list; static struct vmemmap_backing *next; static int num_left; @@ -309,6 +240,9 @@ static __meminit void vmemmap_list_populate(unsigned long phys, vmemmap_list = vmem_back; } +extern int __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys); int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) { unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; @@ -347,6 +281,8 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) } #ifdef CONFIG_MEMORY_HOTPLUG +extern void vmemmap_remove_mapping(unsigned long start, + unsigned long page_size); static unsigned long vmemmap_list_free(unsigned long start) { struct vmemmap_backing *vmem_back, *vmem_back_prev; diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/pgtable-book3e.c new file mode 100644 index 0000000..b323735 --- /dev/null +++ b/arch/powerpc/mm/pgtable-book3e.c @@ -0,0 +1,121 @@ +/* + * Copyright 2005, Paul Mackerras, IBM Corporation. + * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include "mmu_decl.h" + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * On Book3E CPUs, the vmemmap is currently mapped in the top half of + * the vmalloc space using normal page tables, though the size of + * pages encoded in the PTEs can be different + */ +int __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + /* Create a PTE encoding without page size */ + unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | + _PAGE_KERNEL_RW; + + /* PTEs only contain page size encodings up to 32M */ + BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); + + /* Encode the size in the PTE */ + flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; + + /* For each PTE for that area, map things. Note that we don't + * increment phys because all PTEs are of the large size and + * thus must have the low bits clear + */ + for (i = 0; i < page_size; i += PAGE_SIZE) + BUG_ON(map_kernel_page(start + i, phys, flags)); + + return 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ +} +#endif +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + +static __ref void *early_alloc_pgtable(unsigned long size) +{ + void *pt; + + pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS))); + memset(pt, 0, size); + + return pt; +} + +/* + * map_kernel_page currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table + * and adds an entry to the HPT, possibly bolting it + */ +int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); + } else { + pgdp = pgd_offset_k(ea); +#ifndef __PAGETABLE_PUD_FOLDED + if (pgd_none(*pgdp)) { + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); + BUG_ON(pudp == NULL); + pgd_populate(&init_mm, pgdp, pudp); + } +#endif /* !__PAGETABLE_PUD_FOLDED */ + pudp = pud_offset(pgdp, ea); + if (pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + BUG_ON(pmdp == NULL); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PAGE_SIZE); + BUG_ON(ptep == NULL); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); + } + + smp_wmb(); + return 0; +} diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c new file mode 100644 index 0000000..f405a67 --- /dev/null +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -0,0 +1,94 @@ +/* + * Copyright 2005, Paul Mackerras, IBM Corporation. + * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include "mmu_decl.h" + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * On hash-based CPUs, the vmemmap is bolted in the hash table. + * + */ +int __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + int rc = htab_bolt_mapping(start, start + page_size, phys, + pgprot_val(PAGE_KERNEL), + mmu_vmemmap_psize, mmu_kernel_ssize); + if (rc < 0) { + int rc2 = htab_remove_mapping(start, start + page_size, + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON(rc2 && (rc2 != -ENOENT)); + } + return rc; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ + int rc = htab_remove_mapping(start, start + page_size, + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON((rc < 0) && (rc != -ENOENT)); + WARN_ON(rc == -ENOENT); +} +#endif +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + +/* + * map_kernel_page currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table + * and adds an entry to the HPT, possibly bolting it + */ +int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); + } else { + /* + * If the mm subsystem is not fully up, we cannot create a + * linux page table entry for this mapping. Simply bolt an + * entry in the hardware page table. + * + */ + if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags, + mmu_io_psize, mmu_kernel_ssize)) { + printk(KERN_ERR "Failed to do bolted mapping IO " + "memory at %016lx !\n", pa); + return -ENOMEM; + } + } + + smp_wmb(); + return 0; +} diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 8f3b294..1bfb112 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -78,89 +78,6 @@ struct patb_entry *partition_tb; #endif unsigned long ioremap_bot = IOREMAP_BASE; -#ifdef CONFIG_PPC_MMU_NOHASH -static __ref void *early_alloc_pgtable(unsigned long size) -{ - void *pt; - - pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS))); - memset(pt, 0, size); - - return pt; -} -#endif /* CONFIG_PPC_MMU_NOHASH */ - -/* - * map_kernel_page currently only called by __ioremap - * map_kernel_page adds an entry to the ioremap page table - * and adds an entry to the HPT, possibly bolting it - */ -int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) -{ - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - if (slab_is_available()) { - pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); - if (!pudp) - return -ENOMEM; - pmdp = pmd_alloc(&init_mm, pudp, ea); - if (!pmdp) - return -ENOMEM; - ptep = pte_alloc_kernel(pmdp, ea); - if (!ptep) - return -ENOMEM; - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); - } else { -#ifdef CONFIG_PPC_MMU_NOHASH - pgdp = pgd_offset_k(ea); -#ifdef PUD_TABLE_SIZE - if (pgd_none(*pgdp)) { - pudp = early_alloc_pgtable(PUD_TABLE_SIZE); - BUG_ON(pudp == NULL); - pgd_populate(&init_mm, pgdp, pudp); - } -#endif /* PUD_TABLE_SIZE */ - pudp = pud_offset(pgdp, ea); - if (pud_none(*pudp)) { - pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); - BUG_ON(pmdp == NULL); - pud_populate(&init_mm, pudp, pmdp); - } - pmdp = pmd_offset(pudp, ea); - if (!pmd_present(*pmdp)) { - ptep = early_alloc_pgtable(PAGE_SIZE); - BUG_ON(ptep == NULL); - pmd_populate_kernel(&init_mm, pmdp, ptep); - } - ptep = pte_offset_kernel(pmdp, ea); - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); -#else /* CONFIG_PPC_MMU_NOHASH */ - /* - * If the mm subsystem is not fully up, we cannot create a - * linux page table entry for this mapping. Simply bolt an - * entry in the hardware page table. - * - */ - if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags, - mmu_io_psize, mmu_kernel_ssize)) { - printk(KERN_ERR "Failed to do bolted mapping IO " - "memory at %016lx !\n", pa); - return -ENOMEM; - } -#endif /* !CONFIG_PPC_MMU_NOHASH */ - } - - smp_wmb(); - return 0; -} - - /** * __ioremap_at - Low level function to establish the page tables * for an IO mapping -- cgit v0.10.2 From 945537df7a107e0dad7b0d3d14df46cfba46f4c5 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:45 +1000 Subject: powerpc/mm/book3s: Rename hash specific PTE bits to carry H_ prefix This helps to make following hash only pte bits easier. We have kept _PAGE_CHG_MASK, _HPAGE_CHG_MASK and _PAGE_PROT_BITS as it is in this patch eventhough they use hash specific bits. Using them in radix as it is should be ok, because with radix we expect those bit positions to be zero. Only renames in this patch, no change in functionality. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 772850e..2f818cb 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -48,10 +48,14 @@ #define PGD_MASKED_BITS 0 /* PTE flags to conserve for HPTE identification */ -#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | \ - _PAGE_F_SECOND | _PAGE_F_GIX) - -#define _PAGE_4K_PFN 0 +#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | \ + H_PAGE_F_SECOND | H_PAGE_F_GIX) +/* + * Not supported by 4k linux page size + */ +#define H_PAGE_4K_PFN 0x0 +#define H_PAGE_THP_HUGE 0x0 +#define H_PAGE_COMBO 0x0 #ifndef __ASSEMBLY__ /* * On all 4K setups, remap_4k_pfn() equates to remap_pfn_range() diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index ecc7ce0..607cf32 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -29,17 +29,23 @@ #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) -#define _PAGE_COMBO 0x00001000 /* this is a combo 4k page */ -#define _PAGE_4K_PFN 0x00002000 /* PFN is for a single 4k page */ +#define H_PAGE_COMBO 0x00001000 /* this is a combo 4k page */ +#define H_PAGE_4K_PFN 0x00002000 /* PFN is for a single 4k page */ /* - * Used to track subpage group valid if _PAGE_COMBO is set - * This overloads _PAGE_F_GIX and _PAGE_F_SECOND + * We need to differentiate between explicit huge page and THP huge + * page, since THP huge page also need to track real subpage details */ -#define _PAGE_COMBO_VALID (_PAGE_F_GIX | _PAGE_F_SECOND) +#define H_PAGE_THP_HUGE H_PAGE_4K_PFN + +/* + * Used to track subpage group valid if H_PAGE_COMBO is set + * This overloads H_PAGE_F_GIX and H_PAGE_F_SECOND + */ +#define H_PAGE_COMBO_VALID (H_PAGE_F_GIX | H_PAGE_F_SECOND) /* PTE flags to conserve for HPTE identification */ -#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_F_SECOND | \ - _PAGE_F_GIX | _PAGE_HASHPTE | _PAGE_COMBO) +#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_F_SECOND | \ + H_PAGE_F_GIX | H_PAGE_HASHPTE | H_PAGE_COMBO) /* * we support 16 fragments per PTE page of 64K size. */ @@ -75,9 +81,9 @@ static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep) rpte.pte = pte; rpte.hidx = 0; - if (pte_val(pte) & _PAGE_COMBO) { + if (pte_val(pte) & H_PAGE_COMBO) { /* - * Make sure we order the hidx load against the _PAGE_COMBO + * Make sure we order the hidx load against the H_PAGE_COMBO * check. The store side ordering is done in __hash_page_4K */ smp_rmb(); @@ -89,9 +95,9 @@ static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep) static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index) { - if ((pte_val(rpte.pte) & _PAGE_COMBO)) + if ((pte_val(rpte.pte) & H_PAGE_COMBO)) return (rpte.hidx >> (index<<2)) & 0xf; - return (pte_val(rpte.pte) >> _PAGE_F_GIX_SHIFT) & 0xf; + return (pte_val(rpte.pte) >> H_PAGE_F_GIX_SHIFT) & 0xf; } #define __rpte_to_pte(r) ((r).pte) @@ -114,7 +120,7 @@ extern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index); #define pte_iterate_hashed_end() } while(0); } } while(0) #define pte_pagesize_index(mm, addr, pte) \ - (((pte) & _PAGE_COMBO)? MMU_PAGE_4K: MMU_PAGE_64K) + (((pte) & H_PAGE_COMBO)? MMU_PAGE_4K: MMU_PAGE_64K) extern int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); @@ -126,7 +132,7 @@ static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, return -EINVAL; } return remap_pfn_range(vma, addr, pfn, PAGE_SIZE, - __pgprot(pgprot_val(prot) | _PAGE_4K_PFN)); + __pgprot(pgprot_val(prot) | H_PAGE_4K_PFN)); } #define PTE_TABLE_SIZE PTE_FRAG_SIZE @@ -255,8 +261,8 @@ static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array, */ static inline int pmd_trans_huge(pmd_t pmd) { - return !!((pmd_val(pmd) & (_PAGE_PTE | _PAGE_THP_HUGE)) == - (_PAGE_PTE | _PAGE_THP_HUGE)); + return !!((pmd_val(pmd) & (_PAGE_PTE | H_PAGE_THP_HUGE)) == + (_PAGE_PTE | H_PAGE_THP_HUGE)); } static inline int pmd_large(pmd_t pmd) @@ -280,7 +286,7 @@ static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, { unsigned long old; - if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0) + if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) return 0; old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0); return ((old & _PAGE_ACCESSED) != 0); diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 4fffc96..5566702 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -26,19 +26,22 @@ #define _PAGE_TOLERANT 0x00030 /* tolerant memory, cache inhibited */ #define _PAGE_DIRTY 0x00080 /* C: page changed */ #define _PAGE_ACCESSED 0x00100 /* R: page referenced */ -#define _PAGE_SPECIAL 0x00400 /* software: special page */ -#define _PAGE_BUSY 0x00800 /* software: PTE & hash are busy */ - +/* + * Software bits + */ #ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SOFT_DIRTY 0x200 /* software: software dirty tracking */ +#define _PAGE_SOFT_DIRTY 0x00200 /* software: software dirty tracking */ #else -#define _PAGE_SOFT_DIRTY 0x000 +#define _PAGE_SOFT_DIRTY 0x00000 #endif +#define _PAGE_SPECIAL 0x00400 /* software: special page */ +#define H_PAGE_BUSY 0x00800 /* software: PTE & hash are busy */ + -#define _PAGE_F_GIX_SHIFT 57 -#define _PAGE_F_GIX (7ul << 57) /* HPTE index within HPTEG */ -#define _PAGE_F_SECOND (1ul << 60) /* HPTE is in 2ndary HPTEG */ -#define _PAGE_HASHPTE (1ul << 61) /* PTE has associated HPTE */ +#define H_PAGE_F_GIX_SHIFT 57 +#define H_PAGE_F_GIX (7ul << 57) /* HPTE index within HPTEG */ +#define H_PAGE_F_SECOND (1ul << 60) /* HPTE is in 2ndary HPTEG */ +#define H_PAGE_HASHPTE (1ul << 61) /* PTE has associated HPTE */ #define _PAGE_PTE (1ul << 62) /* distinguishes PTEs from pointers */ #define _PAGE_PRESENT (1ul << 63) /* pte contains a translation */ /* @@ -48,11 +51,6 @@ */ #define _PAGE_NO_CACHE _PAGE_TOLERANT /* - * We need to differentiate between explicit huge page and THP huge - * page, since THP huge page also need to track real subpage details - */ -#define _PAGE_THP_HUGE _PAGE_4K_PFN -/* * We support 57 bit real address in pte. Clear everything above 57, and * every thing below PAGE_SHIFT; */ @@ -61,7 +59,7 @@ * set of bits not changed in pmd_modify. */ #define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ - _PAGE_ACCESSED | _PAGE_THP_HUGE | _PAGE_PTE | \ + _PAGE_ACCESSED | H_PAGE_THP_HUGE | _PAGE_PTE | \ _PAGE_SOFT_DIRTY) @@ -148,7 +146,7 @@ * Mask of bits returned by pte_pgprot() */ #define PAGE_PROT_BITS (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT | \ - _PAGE_4K_PFN | _PAGE_PRIVILEGED | _PAGE_ACCESSED | \ + H_PAGE_4K_PFN | _PAGE_PRIVILEGED | _PAGE_ACCESSED | \ _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_EXEC | \ _PAGE_SOFT_DIRTY) /* @@ -262,14 +260,14 @@ static inline unsigned long pte_update(struct mm_struct *mm, bne- 1b" : "=&r" (old_be), "=&r" (tmp_be), "=m" (*ptep) : "r" (ptep), "r" (cpu_to_be64(clr)), "m" (*ptep), - "r" (cpu_to_be64(_PAGE_BUSY)), "r" (cpu_to_be64(set)) + "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) : "cc" ); /* huge pages use the old page table lock */ if (!huge) assert_pte_locked(mm, addr); old = be64_to_cpu(old_be); - if (old & _PAGE_HASHPTE) + if (old & H_PAGE_HASHPTE) hpte_need_flush(mm, addr, ptep, old, huge); return old; @@ -287,7 +285,7 @@ static inline int __ptep_test_and_clear_young(struct mm_struct *mm, { unsigned long old; - if ((pte_val(*ptep) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0) + if ((pte_val(*ptep) & (_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) return 0; old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); return (old & _PAGE_ACCESSED) != 0; @@ -355,7 +353,7 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) stdcx. %0,0,%4\n\ bne- 1b" :"=&r" (old), "=&r" (tmp), "=m" (*ptep) - :"r" (val), "r" (ptep), "m" (*ptep), "r" (cpu_to_be64(_PAGE_BUSY)) + :"r" (val), "r" (ptep), "m" (*ptep), "r" (cpu_to_be64(H_PAGE_BUSY)) :"cc"); } diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 4745d31..65eb819 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -45,7 +45,7 @@ #define __real_pte(e,p) ((real_pte_t){(e)}) #define __rpte_to_pte(r) ((r).pte) -#define __rpte_to_hidx(r,index) (pte_val(__rpte_to_pte(r)) >>_PAGE_F_GIX_SHIFT) +#define __rpte_to_hidx(r,index) (pte_val(__rpte_to_pte(r)) >> H_PAGE_F_GIX_SHIFT) #define pte_iterate_hashed_subpages(rpte, psize, va, index, shift) \ do { \ @@ -287,7 +287,7 @@ static inline int pmd_protnone(pmd_t pmd) static inline pmd_t pmd_mkhuge(pmd_t pmd) { - return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_THP_HUGE)); + return __pmd(pmd_val(pmd) | (_PAGE_PTE | H_PAGE_THP_HUGE)); } #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 3c380c2..1f4497f 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -310,9 +310,9 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing) */ old_pte = READ_ONCE(*ptep); /* - * wait until _PAGE_BUSY is clear then set it atomically + * wait until H_PAGE_BUSY is clear then set it atomically */ - if (unlikely(pte_val(old_pte) & _PAGE_BUSY)) { + if (unlikely(pte_val(old_pte) & H_PAGE_BUSY)) { cpu_relax(); continue; } diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 5f45753..2eeaf80 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -206,3 +206,7 @@ static inline bool pte_user(pte_t pte) #define _PAGE_READ 0 #define _PAGE_WRITE _PAGE_RW #endif + +#ifndef H_PAGE_4K_PFN +#define H_PAGE_4K_PFN 0 +#endif diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c index 529e492..6333b27 100644 --- a/arch/powerpc/mm/hash64_4k.c +++ b/arch/powerpc/mm/hash64_4k.c @@ -34,7 +34,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, old_pte = pte_val(pte); /* If PTE busy, retry the access */ - if (unlikely(old_pte & _PAGE_BUSY)) + if (unlikely(old_pte & H_PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ if (unlikely(!check_pte_access(access, old_pte))) @@ -42,9 +42,9 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, /* * Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access. Since this is 4K insert of 64K page size - * also add _PAGE_COMBO + * also add H_PAGE_COMBO */ - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); @@ -60,22 +60,22 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); vpn = hpt_vpn(ea, vsid, ssize); - if (unlikely(old_pte & _PAGE_HASHPTE)) { + if (unlikely(old_pte & H_PAGE_HASHPTE)) { /* * There MIGHT be an HPTE for this pte */ hash = hpt_hash(vpn, shift, ssize); - if (old_pte & _PAGE_F_SECOND) + if (old_pte & H_PAGE_F_SECOND) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT; + slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT; if (ppc_md.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_4K, MMU_PAGE_4K, ssize, flags) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } - if (likely(!(old_pte & _PAGE_HASHPTE))) { + if (likely(!(old_pte & H_PAGE_HASHPTE))) { pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; hash = hpt_hash(vpn, shift, ssize); @@ -115,9 +115,10 @@ repeat: MMU_PAGE_4K, MMU_PAGE_4K, old_pte); return -1; } - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; - new_pte |= (slot << _PAGE_F_GIX_SHIFT) & (_PAGE_F_SECOND | _PAGE_F_GIX); + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & + (H_PAGE_F_SECOND | H_PAGE_F_GIX); } - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c index 6753310..16644e1 100644 --- a/arch/powerpc/mm/hash64_64k.c +++ b/arch/powerpc/mm/hash64_64k.c @@ -23,7 +23,7 @@ bool __rpte_sub_valid(real_pte_t rpte, unsigned long index) unsigned long g_idx; unsigned long ptev = pte_val(rpte.pte); - g_idx = (ptev & _PAGE_COMBO_VALID) >> _PAGE_F_GIX_SHIFT; + g_idx = (ptev & H_PAGE_COMBO_VALID) >> H_PAGE_F_GIX_SHIFT; index = index >> 2; if (g_idx & (0x1 << index)) return true; @@ -37,12 +37,12 @@ static unsigned long mark_subptegroup_valid(unsigned long ptev, unsigned long in { unsigned long g_idx; - if (!(ptev & _PAGE_COMBO)) + if (!(ptev & H_PAGE_COMBO)) return ptev; index = index >> 2; g_idx = 0x1 << index; - return ptev | (g_idx << _PAGE_F_GIX_SHIFT); + return ptev | (g_idx << H_PAGE_F_GIX_SHIFT); } int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, @@ -66,7 +66,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, old_pte = pte_val(pte); /* If PTE busy, retry the access */ - if (unlikely(old_pte & _PAGE_BUSY)) + if (unlikely(old_pte & H_PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ if (unlikely(!check_pte_access(access, old_pte))) @@ -74,9 +74,9 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, /* * Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access. Since this is 4K insert of 64K page size - * also add _PAGE_COMBO + * also add H_PAGE_COMBO */ - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED | _PAGE_COMBO; + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED | H_PAGE_COMBO; if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); @@ -103,21 +103,21 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, /* *None of the sub 4k page is hashed */ - if (!(old_pte & _PAGE_HASHPTE)) + if (!(old_pte & H_PAGE_HASHPTE)) goto htab_insert_hpte; /* * Check if the pte was already inserted into the hash table * as a 64k HW page, and invalidate the 64k HPTE if so. */ - if (!(old_pte & _PAGE_COMBO)) { + if (!(old_pte & H_PAGE_COMBO)) { flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags); /* * clear the old slot details from the old and new pte. * On hash insert failure we use old pte value and we don't * want slot information there if we have a insert failure. */ - old_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND); - new_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND); + old_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND); + new_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND); goto htab_insert_hpte; } /* @@ -143,15 +143,15 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, if (ret == -1) goto htab_insert_hpte; - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } htab_insert_hpte: /* - * handle _PAGE_4K_PFN case + * handle H_PAGE_4K_PFN case */ - if (old_pte & _PAGE_4K_PFN) { + if (old_pte & H_PAGE_4K_PFN) { /* * All the sub 4k page have the same * physical address. @@ -199,20 +199,20 @@ repeat: } /* * Insert slot number & secondary bit in PTE second half, - * clear _PAGE_BUSY and set appropriate HPTE slot bit - * Since we have _PAGE_BUSY set on ptep, we can be sure + * clear H_PAGE_BUSY and set appropriate HPTE slot bit + * Since we have H_PAGE_BUSY set on ptep, we can be sure * nobody is undating hidx. */ hidxp = (unsigned long *)(ptep + PTRS_PER_PTE); rpte.hidx &= ~(0xfUL << (subpg_index << 2)); *hidxp = rpte.hidx | (slot << (subpg_index << 2)); new_pte = mark_subptegroup_valid(new_pte, subpg_index); - new_pte |= _PAGE_HASHPTE; + new_pte |= H_PAGE_HASHPTE; /* * check __real_pte for details on matching smp_rmb() */ smp_wmb(); - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } @@ -234,7 +234,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access, old_pte = pte_val(pte); /* If PTE busy, retry the access */ - if (unlikely(old_pte & _PAGE_BUSY)) + if (unlikely(old_pte & H_PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ if (unlikely(!check_pte_access(access, old_pte))) @@ -250,7 +250,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access, * Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access. */ - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); @@ -262,22 +262,22 @@ int __hash_page_64K(unsigned long ea, unsigned long access, rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); vpn = hpt_vpn(ea, vsid, ssize); - if (unlikely(old_pte & _PAGE_HASHPTE)) { + if (unlikely(old_pte & H_PAGE_HASHPTE)) { /* * There MIGHT be an HPTE for this pte */ hash = hpt_hash(vpn, shift, ssize); - if (old_pte & _PAGE_F_SECOND) + if (old_pte & H_PAGE_F_SECOND) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT; + slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT; if (ppc_md.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_64K, MMU_PAGE_64K, ssize, flags) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } - if (likely(!(old_pte & _PAGE_HASHPTE))) { + if (likely(!(old_pte & H_PAGE_HASHPTE))) { pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; hash = hpt_hash(vpn, shift, ssize); @@ -317,9 +317,10 @@ repeat: MMU_PAGE_64K, MMU_PAGE_64K, old_pte); return -1; } - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; - new_pte |= (slot << _PAGE_F_GIX_SHIFT) & (_PAGE_F_SECOND | _PAGE_F_GIX); + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & + (H_PAGE_F_SECOND | H_PAGE_F_GIX); } - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index f76a033..2a193f2 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1172,8 +1172,8 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, #endif /* Do actual hashing */ #ifdef CONFIG_PPC_64K_PAGES - /* If _PAGE_4K_PFN is set, make sure this is a 4k segment */ - if ((pte_val(*ptep) & _PAGE_4K_PFN) && psize == MMU_PAGE_64K) { + /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */ + if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) { demote_segment_4k(mm, ea); psize = MMU_PAGE_4K; } @@ -1335,13 +1335,13 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, WARN_ON(hugepage_shift); #ifdef CONFIG_PPC_64K_PAGES - /* If either _PAGE_4K_PFN or cache inhibited is set (and we are on + /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on * a 64K kernel), then we don't preload, hash_page() will take * care of it once we actually try to access the page. * That way we don't have to duplicate all of the logic for segment * page size demotion here */ - if ((pte_val(*ptep) & _PAGE_4K_PFN) || pte_ci(*ptep)) + if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep)) goto out_exit; #endif /* CONFIG_PPC_64K_PAGES */ diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index 6cb6bdd..ba3fc22 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -37,7 +37,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, old_pmd = pmd_val(pmd); /* If PMD busy, retry the access */ - if (unlikely(old_pmd & _PAGE_BUSY)) + if (unlikely(old_pmd & H_PAGE_BUSY)) return 0; /* If PMD permissions don't match, take page fault */ if (unlikely(!check_pte_access(access, old_pmd))) @@ -46,7 +46,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, * Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access */ - new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED; + new_pmd = old_pmd | H_PAGE_BUSY | _PAGE_ACCESSED; if (access & _PAGE_WRITE) new_pmd |= _PAGE_DIRTY; } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd))); @@ -78,7 +78,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, * base page size. This is because demote_segment won't flush * hash page table entries. */ - if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO)) { + if ((old_pmd & H_PAGE_HASHPTE) && !(old_pmd & H_PAGE_COMBO)) { flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K, ssize, flags); /* @@ -125,7 +125,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, hash = hpt_hash(vpn, shift, ssize); /* insert new entry */ pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT; - new_pmd |= _PAGE_HASHPTE; + new_pmd |= H_PAGE_HASHPTE; repeat: hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; @@ -169,17 +169,17 @@ repeat: mark_hpte_slot_valid(hpte_slot_array, index, slot); } /* - * Mark the pte with _PAGE_COMBO, if we are trying to hash it with + * Mark the pte with H_PAGE_COMBO, if we are trying to hash it with * base page size 4k. */ if (psize == MMU_PAGE_4K) - new_pmd |= _PAGE_COMBO; + new_pmd |= H_PAGE_COMBO; /* * The hpte valid is stored in the pgtable whose address is in the * second half of the PMD. Order this against clearing of the busy bit in * huge pmd. */ smp_wmb(); - *pmdp = __pmd(new_pmd & ~_PAGE_BUSY); + *pmdp = __pmd(new_pmd & ~H_PAGE_BUSY); return 0; } diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index bf90784..3058560 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -47,7 +47,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, do { old_pte = pte_val(*ptep); /* If PTE busy, retry the access */ - if (unlikely(old_pte & _PAGE_BUSY)) + if (unlikely(old_pte & H_PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ if (unlikely(!check_pte_access(access, old_pte))) @@ -55,7 +55,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, /* Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access */ - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); @@ -69,28 +69,28 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); /* Check if pte already has an hpte (case 2) */ - if (unlikely(old_pte & _PAGE_HASHPTE)) { + if (unlikely(old_pte & H_PAGE_HASHPTE)) { /* There MIGHT be an HPTE for this pte */ unsigned long hash, slot; hash = hpt_hash(vpn, shift, ssize); - if (old_pte & _PAGE_F_SECOND) + if (old_pte & H_PAGE_F_SECOND) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT; + slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT; if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize, mmu_psize, ssize, flags) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } - if (likely(!(old_pte & _PAGE_HASHPTE))) { + if (likely(!(old_pte & H_PAGE_HASHPTE))) { unsigned long hash = hpt_hash(vpn, shift, ssize); pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; /* clear HPTE slot informations in new PTE */ - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0, mmu_psize, ssize); @@ -106,14 +106,14 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, return -1; } - new_pte |= (slot << _PAGE_F_GIX_SHIFT) & - (_PAGE_F_SECOND | _PAGE_F_GIX); + new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & + (H_PAGE_F_SECOND | H_PAGE_F_GIX); } /* * No need to use ldarx/stdcx here */ - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 1bfb112..1fe356c 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -92,7 +92,7 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, flags |= pgprot_val(PAGE_KERNEL); /* We don't support the 4K PFN hack with ioremap */ - if (flags & _PAGE_4K_PFN) + if (flags & H_PAGE_4K_PFN) return NULL; WARN_ON(pa & ~PAGE_MASK); @@ -462,13 +462,13 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, bne- 1b" : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp) : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp), - "r" (cpu_to_be64(_PAGE_BUSY)), "r" (cpu_to_be64(set)) + "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) : "cc" ); old = be64_to_cpu(old_be); trace_hugepage_update(addr, old, clr, set); - if (old & _PAGE_HASHPTE) + if (old & H_PAGE_HASHPTE) hpte_do_hugepage_flush(mm, addr, pmdp, old); return old; } @@ -640,7 +640,7 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, psize = get_slice_psize(mm, addr); BUG_ON(psize == MMU_PAGE_16M); #endif - if (old_pmd & _PAGE_COMBO) + if (old_pmd & H_PAGE_COMBO) psize = MMU_PAGE_4K; else psize = MMU_PAGE_64K; diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index f7b8039..38497cf 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -218,7 +218,7 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, pte = pte_val(*ptep); if (is_thp) trace_hugepage_invalidate(start, pte); - if (!(pte & _PAGE_HASHPTE)) + if (!(pte & H_PAGE_HASHPTE)) continue; if (unlikely(is_thp)) hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte); @@ -248,7 +248,7 @@ void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) start_pte = pte_offset_map(pmd, addr); for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) { unsigned long pteval = pte_val(*pte); - if (pteval & _PAGE_HASHPTE) + if (pteval & H_PAGE_HASHPTE) hpte_need_flush(mm, addr, pte, pteval, 0); addr += PAGE_SIZE; } -- cgit v0.10.2 From d2cf005038104cf383db1469a7f0bbb5b0368f33 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:46 +1000 Subject: powerpc/mm: Handle _PTE_NONE_MASK I am splitting this as a separate patch to get better review. If ok we should merge this with previous patch. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 5566702..451617e 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -36,8 +36,7 @@ #endif #define _PAGE_SPECIAL 0x00400 /* software: special page */ #define H_PAGE_BUSY 0x00800 /* software: PTE & hash are busy */ - - +#define H_PTE_NONE_MASK _PAGE_HPTEFLAGS #define H_PAGE_F_GIX_SHIFT 57 #define H_PAGE_F_GIX (7ul << 57) /* HPTE index within HPTEG */ #define H_PAGE_F_SECOND (1ul << 60) /* HPTE is in 2ndary HPTEG */ @@ -134,7 +133,6 @@ #define _PTEIDX_SECONDARY 0x8 #define _PTEIDX_GROUP_IX 0x7 -#define _PTE_NONE_MASK _PAGE_HPTEFLAGS /* * _PAGE_CHG_MASK masks of bits that are to be preserved across * pgprot changes @@ -379,7 +377,7 @@ static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_WRITE); static inline int pte_dirty(pte_t pte) { return !!(pte_val(pte) & _PAGE_DIRTY); } static inline int pte_young(pte_t pte) { return !!(pte_val(pte) & _PAGE_ACCESSED); } static inline int pte_special(pte_t pte) { return !!(pte_val(pte) & _PAGE_SPECIAL); } -static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; } +static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~H_PTE_NONE_MASK) == 0; } static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY -- cgit v0.10.2 From 2e8735198af0392d9032a356ee6c82686afca779 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:47 +1000 Subject: powerpc/mm: Move common pte bits and accessors to book3s/64/pgtable.h Now that we have moved book3s hash64 Linux pte bits to match Power ISA 3.0 radix pte bit positions, we move the matching pte bits to a common header. Only code movement in this patch. No functionality change. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 451617e..19f9e6e 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -13,54 +13,12 @@ * We could create separate kernel read-only if we used the 3 PP bits * combinations that newer processors provide but we currently don't. */ -#define _PAGE_BIT_SWAP_TYPE 0 - -#define _PAGE_EXEC 0x00001 /* execute permission */ -#define _PAGE_WRITE 0x00002 /* write access allowed */ -#define _PAGE_READ 0x00004 /* read access allowed */ -#define _PAGE_RW (_PAGE_READ | _PAGE_WRITE) -#define _PAGE_RWX (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC) -#define _PAGE_PRIVILEGED 0x00008 /* kernel access only */ -#define _PAGE_SAO 0x00010 /* Strong access order */ -#define _PAGE_NON_IDEMPOTENT 0x00020 /* non idempotent memory */ -#define _PAGE_TOLERANT 0x00030 /* tolerant memory, cache inhibited */ -#define _PAGE_DIRTY 0x00080 /* C: page changed */ -#define _PAGE_ACCESSED 0x00100 /* R: page referenced */ -/* - * Software bits - */ -#ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SOFT_DIRTY 0x00200 /* software: software dirty tracking */ -#else -#define _PAGE_SOFT_DIRTY 0x00000 -#endif -#define _PAGE_SPECIAL 0x00400 /* software: special page */ #define H_PAGE_BUSY 0x00800 /* software: PTE & hash are busy */ #define H_PTE_NONE_MASK _PAGE_HPTEFLAGS #define H_PAGE_F_GIX_SHIFT 57 #define H_PAGE_F_GIX (7ul << 57) /* HPTE index within HPTEG */ #define H_PAGE_F_SECOND (1ul << 60) /* HPTE is in 2ndary HPTEG */ #define H_PAGE_HASHPTE (1ul << 61) /* PTE has associated HPTE */ -#define _PAGE_PTE (1ul << 62) /* distinguishes PTEs from pointers */ -#define _PAGE_PRESENT (1ul << 63) /* pte contains a translation */ -/* - * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE - * Instead of fixing all of them, add an alternate define which - * maps CI pte mapping. - */ -#define _PAGE_NO_CACHE _PAGE_TOLERANT -/* - * We support 57 bit real address in pte. Clear everything above 57, and - * every thing below PAGE_SHIFT; - */ -#define PTE_RPN_MASK (((1UL << 57) - 1) & (PAGE_MASK)) -/* - * set of bits not changed in pmd_modify. - */ -#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ - _PAGE_ACCESSED | H_PAGE_THP_HUGE | _PAGE_PTE | \ - _PAGE_SOFT_DIRTY) - #ifdef CONFIG_PPC_64K_PAGES #include @@ -118,105 +76,11 @@ #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN #endif /* CONFIG_PPC_MM_SLICES */ -/* - * user access blocked by key - */ -#define _PAGE_KERNEL_RW (_PAGE_PRIVILEGED | _PAGE_RW | _PAGE_DIRTY) -#define _PAGE_KERNEL_RO (_PAGE_PRIVILEGED | _PAGE_READ) -#define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | \ - _PAGE_RW | _PAGE_EXEC) - -/* No page size encoding in the linux PTE */ -#define _PAGE_PSIZE 0 /* PTEIDX nibble */ #define _PTEIDX_SECONDARY 0x8 #define _PTEIDX_GROUP_IX 0x7 -/* - * _PAGE_CHG_MASK masks of bits that are to be preserved across - * pgprot changes - */ -#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ - _PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE | \ - _PAGE_SOFT_DIRTY) -/* - * Mask of bits returned by pte_pgprot() - */ -#define PAGE_PROT_BITS (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT | \ - H_PAGE_4K_PFN | _PAGE_PRIVILEGED | _PAGE_ACCESSED | \ - _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_EXEC | \ - _PAGE_SOFT_DIRTY) -/* - * We define 2 sets of base prot bits, one for basic pages (ie, - * cacheable kernel and user pages) and one for non cacheable - * pages. We always set _PAGE_COHERENT when SMP is enabled or - * the processor might need it for DMA coherency. - */ -#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) -#define _PAGE_BASE (_PAGE_BASE_NC) - -/* Permission masks used to generate the __P and __S table, - * - * Note:__pgprot is defined in arch/powerpc/include/asm/page.h - * - * Write permissions imply read permissions for now (we could make write-only - * pages on BookE but we don't bother for now). Execute permission control is - * possible on platforms that define _PAGE_EXEC - * - * Note due to the way vm flags are laid out, the bits are XWR - */ -#define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_PRIVILEGED) -#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_RW) -#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_RW | _PAGE_EXEC) -#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_READ) -#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC) -#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_READ) -#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC) - -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY_X -#define __P101 PAGE_READONLY_X -#define __P110 PAGE_COPY_X -#define __P111 PAGE_COPY_X - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY_X -#define __S101 PAGE_READONLY_X -#define __S110 PAGE_SHARED_X -#define __S111 PAGE_SHARED_X - -/* Permission masks used for kernel mappings */ -#define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) -#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ - _PAGE_TOLERANT) -#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ - _PAGE_NON_IDEMPOTENT) -#define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX) -#define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) -#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) - -/* Protection used for kernel text. We want the debuggers to be able to - * set breakpoints anywhere, so don't write protect the kernel text - * on platforms where such control is possible. - */ -#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) ||\ - defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) -#define PAGE_KERNEL_TEXT PAGE_KERNEL_X -#else -#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX -#endif - -/* Make modules code happy. We don't set RO yet */ -#define PAGE_KERNEL_EXEC PAGE_KERNEL_X -#define PAGE_AGP (PAGE_KERNEL_NC) - #define PMD_BAD_BITS (PTE_TABLE_SIZE-1) #define PUD_BAD_BITS (PMD_TABLE_SIZE-1) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 65eb819..47e2e77 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1,9 +1,148 @@ #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ #define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ + +/* + * Common bits between hash and Radix page table + */ +#define _PAGE_BIT_SWAP_TYPE 0 + +#define _PAGE_EXEC 0x00001 /* execute permission */ +#define _PAGE_WRITE 0x00002 /* write access allowed */ +#define _PAGE_READ 0x00004 /* read access allowed */ +#define _PAGE_RW (_PAGE_READ | _PAGE_WRITE) +#define _PAGE_RWX (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC) +#define _PAGE_PRIVILEGED 0x00008 /* kernel access only */ +#define _PAGE_SAO 0x00010 /* Strong access order */ +#define _PAGE_NON_IDEMPOTENT 0x00020 /* non idempotent memory */ +#define _PAGE_TOLERANT 0x00030 /* tolerant memory, cache inhibited */ +#define _PAGE_DIRTY 0x00080 /* C: page changed */ +#define _PAGE_ACCESSED 0x00100 /* R: page referenced */ +/* + * Software bits + */ +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SOFT_DIRTY 0x00200 /* software: software dirty tracking */ +#else +#define _PAGE_SOFT_DIRTY 0x00000 +#endif +#define _PAGE_SPECIAL 0x00400 /* software: special page */ + + +#define _PAGE_PTE (1ul << 62) /* distinguishes PTEs from pointers */ +#define _PAGE_PRESENT (1ul << 63) /* pte contains a translation */ +/* + * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE + * Instead of fixing all of them, add an alternate define which + * maps CI pte mapping. + */ +#define _PAGE_NO_CACHE _PAGE_TOLERANT +/* + * We support 57 bit real address in pte. Clear everything above 57, and + * every thing below PAGE_SHIFT; + */ +#define PTE_RPN_MASK (((1UL << 57) - 1) & (PAGE_MASK)) +/* + * set of bits not changed in pmd_modify. Even though we have hash specific bits + * in here, on radix we expect them to be zero. + */ +#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ + _PAGE_ACCESSED | H_PAGE_THP_HUGE | _PAGE_PTE | \ + _PAGE_SOFT_DIRTY) +/* + * user access blocked by key + */ +#define _PAGE_KERNEL_RW (_PAGE_PRIVILEGED | _PAGE_RW | _PAGE_DIRTY) +#define _PAGE_KERNEL_RO (_PAGE_PRIVILEGED | _PAGE_READ) +#define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | \ + _PAGE_RW | _PAGE_EXEC) /* - * This file contains the functions and defines necessary to modify and use - * the ppc64 hashed page table. + * No page size encoding in the linux PTE */ +#define _PAGE_PSIZE 0 +/* + * _PAGE_CHG_MASK masks of bits that are to be preserved across + * pgprot changes + */ +#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ + _PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE | \ + _PAGE_SOFT_DIRTY) +/* + * Mask of bits returned by pte_pgprot() + */ +#define PAGE_PROT_BITS (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT | \ + H_PAGE_4K_PFN | _PAGE_PRIVILEGED | _PAGE_ACCESSED | \ + _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_EXEC | \ + _PAGE_SOFT_DIRTY) +/* + * We define 2 sets of base prot bits, one for basic pages (ie, + * cacheable kernel and user pages) and one for non cacheable + * pages. We always set _PAGE_COHERENT when SMP is enabled or + * the processor might need it for DMA coherency. + */ +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) +#define _PAGE_BASE (_PAGE_BASE_NC) + +/* Permission masks used to generate the __P and __S table, + * + * Note:__pgprot is defined in arch/powerpc/include/asm/page.h + * + * Write permissions imply read permissions for now (we could make write-only + * pages on BookE but we don't bother for now). Execute permission control is + * possible on platforms that define _PAGE_EXEC + * + * Note due to the way vm flags are laid out, the bits are XWR + */ +#define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_PRIVILEGED) +#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_RW) +#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_RW | _PAGE_EXEC) +#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_READ) +#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC) +#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_READ) +#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC) + +#define __P000 PAGE_NONE +#define __P001 PAGE_READONLY +#define __P010 PAGE_COPY +#define __P011 PAGE_COPY +#define __P100 PAGE_READONLY_X +#define __P101 PAGE_READONLY_X +#define __P110 PAGE_COPY_X +#define __P111 PAGE_COPY_X + +#define __S000 PAGE_NONE +#define __S001 PAGE_READONLY +#define __S010 PAGE_SHARED +#define __S011 PAGE_SHARED +#define __S100 PAGE_READONLY_X +#define __S101 PAGE_READONLY_X +#define __S110 PAGE_SHARED_X +#define __S111 PAGE_SHARED_X + +/* Permission masks used for kernel mappings */ +#define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) +#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ + _PAGE_TOLERANT) +#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ + _PAGE_NON_IDEMPOTENT) +#define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX) +#define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) +#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) + +/* + * Protection used for kernel text. We want the debuggers to be able to + * set breakpoints anywhere, so don't write protect the kernel text + * on platforms where such control is possible. + */ +#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) || \ + defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) +#define PAGE_KERNEL_TEXT PAGE_KERNEL_X +#else +#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX +#endif + +/* Make modules code happy. We don't set RO yet */ +#define PAGE_KERNEL_EXEC PAGE_KERNEL_X +#define PAGE_AGP (PAGE_KERNEL_NC) #include #include -- cgit v0.10.2 From 13f829a5a1032ff1b96c727429bf461f7a8ad3f2 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:48 +1000 Subject: powerpc/mm: Move pte accessors that operate on common pte bits to pgtable.h These pte functions will remain the same between radix and hash. Move them to pgtable.h. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 19f9e6e..04c004d 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -135,66 +135,6 @@ static inline unsigned long pte_update(struct mm_struct *mm, return old; } -/* - * We currently remove entries from the hashtable regardless of whether - * the entry was young or dirty. - * - * We should be more intelligent about this but for the moment we override - * these functions and force a tlb flush unconditionally - */ -static inline int __ptep_test_and_clear_young(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - unsigned long old; - - if ((pte_val(*ptep) & (_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) - return 0; - old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); - return (old & _PAGE_ACCESSED) != 0; -} -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -#define ptep_test_and_clear_young(__vma, __addr, __ptep) \ -({ \ - int __r; \ - __r = __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \ - __r; \ -}) - -#define __HAVE_ARCH_PTEP_SET_WRPROTECT -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - - if ((pte_val(*ptep) & _PAGE_WRITE) == 0) - return; - - pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0); -} - -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - if ((pte_val(*ptep) & _PAGE_WRITE) == 0) - return; - - pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 1); -} - -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0); - return __pte(old); -} - -static inline void pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t * ptep) -{ - pte_update(mm, addr, ptep, ~0UL, 0, 0); -} - - /* Set the dirty and/or accessed bits atomically in a linux PTE, this * function doesn't need to flush the hash entry */ @@ -237,112 +177,7 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) /* Generic accessors to PTE bits */ -static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_WRITE);} -static inline int pte_dirty(pte_t pte) { return !!(pte_val(pte) & _PAGE_DIRTY); } -static inline int pte_young(pte_t pte) { return !!(pte_val(pte) & _PAGE_ACCESSED); } -static inline int pte_special(pte_t pte) { return !!(pte_val(pte) & _PAGE_SPECIAL); } static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~H_PTE_NONE_MASK) == 0; } -static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } - -#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY -static inline bool pte_soft_dirty(pte_t pte) -{ - return !!(pte_val(pte) & _PAGE_SOFT_DIRTY); -} -static inline pte_t pte_mksoft_dirty(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY); -} - -static inline pte_t pte_clear_soft_dirty(pte_t pte) -{ - return __pte(pte_val(pte) & ~_PAGE_SOFT_DIRTY); -} -#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ - -#ifdef CONFIG_NUMA_BALANCING -/* - * These work without NUMA balancing but the kernel does not care. See the - * comment in include/asm-generic/pgtable.h . On powerpc, this will only - * work for user pages and always return true for kernel pages. - */ -static inline int pte_protnone(pte_t pte) -{ - return (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PRIVILEGED)) == - (_PAGE_PRESENT | _PAGE_PRIVILEGED); -} -#endif /* CONFIG_NUMA_BALANCING */ - -static inline int pte_present(pte_t pte) -{ - return !!(pte_val(pte) & _PAGE_PRESENT); -} - -/* Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - * - * Even if PTEs can be unsigned long long, a PFN is always an unsigned - * long for now. - */ -static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) -{ - return __pte((((pte_basic_t)(pfn) << PAGE_SHIFT) & PTE_RPN_MASK) | - pgprot_val(pgprot)); -} - -static inline unsigned long pte_pfn(pte_t pte) -{ - return (pte_val(pte) & PTE_RPN_MASK) >> PAGE_SHIFT; -} - -/* Generic modifiers for PTE bits */ -static inline pte_t pte_wrprotect(pte_t pte) -{ - return __pte(pte_val(pte) & ~_PAGE_WRITE); -} - -static inline pte_t pte_mkclean(pte_t pte) -{ - return __pte(pte_val(pte) & ~_PAGE_DIRTY); -} - -static inline pte_t pte_mkold(pte_t pte) -{ - return __pte(pte_val(pte) & ~_PAGE_ACCESSED); -} - -static inline pte_t pte_mkwrite(pte_t pte) -{ - /* - * write implies read, hence set both - */ - return __pte(pte_val(pte) | _PAGE_RW); -} - -static inline pte_t pte_mkdirty(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY); -} - -static inline pte_t pte_mkyoung(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_ACCESSED); -} - -static inline pte_t pte_mkspecial(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_SPECIAL); -} - -static inline pte_t pte_mkhuge(pte_t pte) -{ - return pte; -} - -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) -{ - return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); -} /* This low level function performs the actual PTE insertion * Setting the PTE depends on the MMU type and other factors. It's @@ -359,46 +194,6 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, *ptep = pte; } -#define _PAGE_CACHE_CTL (_PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT) - -#define pgprot_noncached pgprot_noncached -static inline pgprot_t pgprot_noncached(pgprot_t prot) -{ - return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | - _PAGE_NON_IDEMPOTENT); -} - -#define pgprot_noncached_wc pgprot_noncached_wc -static inline pgprot_t pgprot_noncached_wc(pgprot_t prot) -{ - return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | - _PAGE_TOLERANT); -} - -#define pgprot_cached pgprot_cached -static inline pgprot_t pgprot_cached(pgprot_t prot) -{ - return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL)); -} - -#define pgprot_writecombine pgprot_writecombine -static inline pgprot_t pgprot_writecombine(pgprot_t prot) -{ - return pgprot_noncached_wc(prot); -} -/* - * check a pte mapping have cache inhibited property - */ -static inline bool pte_ci(pte_t pte) -{ - unsigned long pte_v = pte_val(pte); - - if (((pte_v & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) || - ((pte_v & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)) - return true; - return false; -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, unsigned long old_pmd); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 47e2e77..7ec1766 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -201,6 +201,215 @@ #endif /* __real_pte */ +/* + * For hash even if we have _PAGE_ACCESSED = 0, we do a pte_update. + * We currently remove entries from the hashtable regardless of whether + * the entry was young or dirty. + * + * We should be more intelligent about this but for the moment we override + * these functions and force a tlb flush unconditionally + * For radix: H_PAGE_HASHPTE should be zero. Hence we can use the same + * function for both hash and radix. + */ +static inline int __ptep_test_and_clear_young(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + unsigned long old; + + if ((pte_val(*ptep) & (_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) + return 0; + old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); + return (old & _PAGE_ACCESSED) != 0; +} + +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +#define ptep_test_and_clear_young(__vma, __addr, __ptep) \ +({ \ + int __r; \ + __r = __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \ + __r; \ +}) + +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + + if ((pte_val(*ptep) & _PAGE_WRITE) == 0) + return; + + pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0); +} + +static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + if ((pte_val(*ptep) & _PAGE_WRITE) == 0) + return; + + pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 1); +} + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0); + return __pte(old); +} + +static inline void pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t * ptep) +{ + pte_update(mm, addr, ptep, ~0UL, 0, 0); +} +static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_WRITE);} +static inline int pte_dirty(pte_t pte) { return !!(pte_val(pte) & _PAGE_DIRTY); } +static inline int pte_young(pte_t pte) { return !!(pte_val(pte) & _PAGE_ACCESSED); } +static inline int pte_special(pte_t pte) { return !!(pte_val(pte) & _PAGE_SPECIAL); } +static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } + +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY +static inline bool pte_soft_dirty(pte_t pte) +{ + return !!(pte_val(pte) & _PAGE_SOFT_DIRTY); +} +static inline pte_t pte_mksoft_dirty(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY); +} + +static inline pte_t pte_clear_soft_dirty(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_SOFT_DIRTY); +} +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ + +#ifdef CONFIG_NUMA_BALANCING +/* + * These work without NUMA balancing but the kernel does not care. See the + * comment in include/asm-generic/pgtable.h . On powerpc, this will only + * work for user pages and always return true for kernel pages. + */ +static inline int pte_protnone(pte_t pte) +{ + return (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PRIVILEGED)) == + (_PAGE_PRESENT | _PAGE_PRIVILEGED); +} +#endif /* CONFIG_NUMA_BALANCING */ + +static inline int pte_present(pte_t pte) +{ + return !!(pte_val(pte) & _PAGE_PRESENT); +} +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. + * + * Even if PTEs can be unsigned long long, a PFN is always an unsigned + * long for now. + */ +static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) +{ + return __pte((((pte_basic_t)(pfn) << PAGE_SHIFT) & PTE_RPN_MASK) | + pgprot_val(pgprot)); +} + +static inline unsigned long pte_pfn(pte_t pte) +{ + return (pte_val(pte) & PTE_RPN_MASK) >> PAGE_SHIFT; +} + +/* Generic modifiers for PTE bits */ +static inline pte_t pte_wrprotect(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_WRITE); +} + +static inline pte_t pte_mkclean(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_DIRTY); +} + +static inline pte_t pte_mkold(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_ACCESSED); +} + +static inline pte_t pte_mkwrite(pte_t pte) +{ + /* + * write implies read, hence set both + */ + return __pte(pte_val(pte) | _PAGE_RW); +} + +static inline pte_t pte_mkdirty(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY); +} + +static inline pte_t pte_mkyoung(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_ACCESSED); +} + +static inline pte_t pte_mkspecial(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SPECIAL); +} + +static inline pte_t pte_mkhuge(pte_t pte) +{ + return pte; +} + +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) +{ + /* FIXME!! check whether this need to be a conditional */ + return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); +} + +#define _PAGE_CACHE_CTL (_PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT) + +#define pgprot_noncached pgprot_noncached +static inline pgprot_t pgprot_noncached(pgprot_t prot) +{ + return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | + _PAGE_NON_IDEMPOTENT); +} + +#define pgprot_noncached_wc pgprot_noncached_wc +static inline pgprot_t pgprot_noncached_wc(pgprot_t prot) +{ + return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | + _PAGE_TOLERANT); +} + +#define pgprot_cached pgprot_cached +static inline pgprot_t pgprot_cached(pgprot_t prot) +{ + return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL)); +} + +#define pgprot_writecombine pgprot_writecombine +static inline pgprot_t pgprot_writecombine(pgprot_t prot) +{ + return pgprot_noncached_wc(prot); +} +/* + * check a pte mapping have cache inhibited property + */ +static inline bool pte_ci(pte_t pte) +{ + unsigned long pte_v = pte_val(pte); + + if (((pte_v & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) || + ((pte_v & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)) + return true; + return false; +} + static inline void pmd_set(pmd_t *pmdp, unsigned long val) { *pmdp = __pmd(val); -- cgit v0.10.2 From dd1842a2a448bb66d74aa02a550df6be8c25f20b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:49 +1000 Subject: powerpc/mm: Make page table size a variable Radix and hash MMU models support different page table sizes. Make the #defines a variable so that existing code can work with variable sizes. Slice related code is only used by hash, so use hash constants there. We will replicate some of the boundary conditions with resepct to TASK_SIZE using radix values too. Right now we do boundary condition check using hash constants. Swapper pgdir size is initialized in asm code. We select the max pgd size to keep it simple. For now we select hash pgdir. When adding radix we will switch that to radix pgdir which is 64K. BUILD_BUG_ON check which is removed is already done in hugepage_init() using MAYBE_BUILD_BUG_ON(). Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 2f818cb..dcb9d6e 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -5,48 +5,20 @@ * for each page table entry. The PMD and PGD level use a 32b record for * each entry by assuming that each entry is page aligned. */ -#define PTE_INDEX_SIZE 9 -#define PMD_INDEX_SIZE 7 -#define PUD_INDEX_SIZE 9 -#define PGD_INDEX_SIZE 9 +#define H_PTE_INDEX_SIZE 9 +#define H_PMD_INDEX_SIZE 7 +#define H_PUD_INDEX_SIZE 9 +#define H_PGD_INDEX_SIZE 9 #ifndef __ASSEMBLY__ -#define PTE_TABLE_SIZE (sizeof(pte_t) << PTE_INDEX_SIZE) -#define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE) -#define PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE) -#define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) -#endif /* __ASSEMBLY__ */ - -#define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) -#define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) -#define PTRS_PER_PUD (1 << PUD_INDEX_SIZE) -#define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) - -/* PMD_SHIFT determines what a second-level page table entry can map */ -#define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE) -#define PMD_SIZE (1UL << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) +#define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE) +#define H_PMD_TABLE_SIZE (sizeof(pmd_t) << H_PMD_INDEX_SIZE) +#define H_PUD_TABLE_SIZE (sizeof(pud_t) << H_PUD_INDEX_SIZE) +#define H_PGD_TABLE_SIZE (sizeof(pgd_t) << H_PGD_INDEX_SIZE) /* With 4k base page size, hugepage PTEs go at the PMD level */ #define MIN_HUGEPTE_SHIFT PMD_SHIFT -/* PUD_SHIFT determines what a third-level page table entry can map */ -#define PUD_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) -#define PUD_SIZE (1UL << PUD_SHIFT) -#define PUD_MASK (~(PUD_SIZE-1)) - -/* PGDIR_SHIFT determines what a fourth-level page table entry can map */ -#define PGDIR_SHIFT (PUD_SHIFT + PUD_INDEX_SIZE) -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -/* Bits to mask out from a PMD to get to the PTE page */ -#define PMD_MASKED_BITS 0 -/* Bits to mask out from a PUD to get to the PMD page */ -#define PUD_MASKED_BITS 0 -/* Bits to mask out from a PGD to get to the PUD page */ -#define PGD_MASKED_BITS 0 - /* PTE flags to conserve for HPTE identification */ #define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | \ H_PAGE_F_SECOND | H_PAGE_F_GIX) @@ -56,7 +28,6 @@ #define H_PAGE_4K_PFN 0x0 #define H_PAGE_THP_HUGE 0x0 #define H_PAGE_COMBO 0x0 -#ifndef __ASSEMBLY__ /* * On all 4K setups, remap_4k_pfn() equates to remap_pfn_range() */ diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 607cf32..7d2a9b6 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -1,34 +1,14 @@ #ifndef _ASM_POWERPC_BOOK3S_64_HASH_64K_H #define _ASM_POWERPC_BOOK3S_64_HASH_64K_H -#define PTE_INDEX_SIZE 8 -#define PMD_INDEX_SIZE 5 -#define PUD_INDEX_SIZE 5 -#define PGD_INDEX_SIZE 12 - -#define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) -#define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) -#define PTRS_PER_PUD (1 << PUD_INDEX_SIZE) -#define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) +#define H_PTE_INDEX_SIZE 8 +#define H_PMD_INDEX_SIZE 5 +#define H_PUD_INDEX_SIZE 5 +#define H_PGD_INDEX_SIZE 12 /* With 4k base page size, hugepage PTEs go at the PMD level */ #define MIN_HUGEPTE_SHIFT PAGE_SHIFT -/* PMD_SHIFT determines what a second-level page table entry can map */ -#define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE) -#define PMD_SIZE (1UL << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) - -/* PUD_SHIFT determines what a third-level page table entry can map */ -#define PUD_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) -#define PUD_SIZE (1UL << PUD_SHIFT) -#define PUD_MASK (~(PUD_SIZE-1)) - -/* PGDIR_SHIFT determines what a fourth-level page table entry can map */ -#define PGDIR_SHIFT (PUD_SHIFT + PUD_INDEX_SIZE) -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - #define H_PAGE_COMBO 0x00001000 /* this is a combo 4k page */ #define H_PAGE_4K_PFN 0x00002000 /* PFN is for a single 4k page */ /* @@ -57,13 +37,6 @@ #define PTE_FRAG_SIZE_SHIFT 12 #define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT) -/* Bits to mask out from a PMD to get to the PTE page */ -#define PMD_MASKED_BITS 0xc0000000000000ffUL -/* Bits to mask out from a PUD to get to the PMD page */ -#define PUD_MASKED_BITS 0xc0000000000000ffUL -/* Bits to mask out from a PGD to get to the PUD page */ -#define PGD_MASKED_BITS 0xc0000000000000ffUL - #ifndef __ASSEMBLY__ #include @@ -135,14 +108,15 @@ static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, __pgprot(pgprot_val(prot) | H_PAGE_4K_PFN)); } -#define PTE_TABLE_SIZE PTE_FRAG_SIZE +#define H_PTE_TABLE_SIZE PTE_FRAG_SIZE #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PMD_TABLE_SIZE ((sizeof(pmd_t) << PMD_INDEX_SIZE) + (sizeof(unsigned long) << PMD_INDEX_SIZE)) +#define H_PMD_TABLE_SIZE ((sizeof(pmd_t) << PMD_INDEX_SIZE) + \ + (sizeof(unsigned long) << PMD_INDEX_SIZE)) #else -#define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE) +#define H_PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE) #endif -#define PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE) -#define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) +#define H_PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE) +#define H_PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) #ifdef CONFIG_HUGETLB_PAGE /* diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 04c004d..03044af 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -29,14 +29,18 @@ /* * Size of EA range mapped by our pagetables. */ -#define PGTABLE_EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \ - PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT) -#define PGTABLE_RANGE (ASM_CONST(1) << PGTABLE_EADDR_SIZE) +#define H_PGTABLE_EADDR_SIZE (H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE + \ + H_PUD_INDEX_SIZE + H_PGD_INDEX_SIZE + PAGE_SHIFT) +#define H_PGTABLE_RANGE (ASM_CONST(1) << H_PGTABLE_EADDR_SIZE) #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PMD_CACHE_INDEX (PMD_INDEX_SIZE + 1) +/* + * only with hash we need to use the second half of pmd page table + * to store pointer to deposited pgtable_t + */ +#define H_PMD_CACHE_INDEX (H_PMD_INDEX_SIZE + 1) #else -#define PMD_CACHE_INDEX PMD_INDEX_SIZE +#define H_PMD_CACHE_INDEX H_PMD_INDEX_SIZE #endif /* * Define the address range of the kernel non-linear virtual area diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 843b5d8..7da61b8 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -462,7 +462,7 @@ extern void slb_set_size(u16 size); add rt,rt,rx /* 4 bits per slice and we have one slice per 1TB */ -#define SLICE_ARRAY_SIZE (PGTABLE_RANGE >> 41) +#define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41) #ifndef __ASSEMBLY__ @@ -533,7 +533,7 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea, /* * Bad address. We return VSID 0 for that */ - if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) + if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE) return 0; if (ssize == MMU_SEGSIZE_256M) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 7ec1766..f30f404 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -144,6 +144,66 @@ #define PAGE_KERNEL_EXEC PAGE_KERNEL_X #define PAGE_AGP (PAGE_KERNEL_NC) +#ifndef __ASSEMBLY__ +/* + * page table defines + */ +extern unsigned long __pte_index_size; +extern unsigned long __pmd_index_size; +extern unsigned long __pud_index_size; +extern unsigned long __pgd_index_size; +extern unsigned long __pmd_cache_index; +#define PTE_INDEX_SIZE __pte_index_size +#define PMD_INDEX_SIZE __pmd_index_size +#define PUD_INDEX_SIZE __pud_index_size +#define PGD_INDEX_SIZE __pgd_index_size +#define PMD_CACHE_INDEX __pmd_cache_index +/* + * Because of use of pte fragments and THP, size of page table + * are not always derived out of index size above. + */ +extern unsigned long __pte_table_size; +extern unsigned long __pmd_table_size; +extern unsigned long __pud_table_size; +extern unsigned long __pgd_table_size; +#define PTE_TABLE_SIZE __pte_table_size +#define PMD_TABLE_SIZE __pmd_table_size +#define PUD_TABLE_SIZE __pud_table_size +#define PGD_TABLE_SIZE __pgd_table_size +/* + * Pgtable size used by swapper, init in asm code + * We will switch this later to radix PGD + */ +#define MAX_PGD_TABLE_SIZE (sizeof(pgd_t) << H_PGD_INDEX_SIZE) + +#define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) +#define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) +#define PTRS_PER_PUD (1 << PUD_INDEX_SIZE) +#define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) + +/* PMD_SHIFT determines what a second-level page table entry can map */ +#define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE) +#define PMD_SIZE (1UL << PMD_SHIFT) +#define PMD_MASK (~(PMD_SIZE-1)) + +/* PUD_SHIFT determines what a third-level page table entry can map */ +#define PUD_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) +#define PUD_SIZE (1UL << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE-1)) + +/* PGDIR_SHIFT determines what a fourth-level page table entry can map */ +#define PGDIR_SHIFT (PUD_SHIFT + PUD_INDEX_SIZE) +#define PGDIR_SIZE (1UL << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE-1)) + +/* Bits to mask out from a PMD to get to the PTE page */ +#define PMD_MASKED_BITS 0xc0000000000000ffUL +/* Bits to mask out from a PUD to get to the PMD page */ +#define PUD_MASKED_BITS 0xc0000000000000ffUL +/* Bits to mask out from a PGD to get to the PUD page */ +#define PGD_MASKED_BITS 0xc0000000000000ffUL +#endif /* __ASSEMBLY__ */ + #include #include diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index d908a46..7748885 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -93,7 +93,7 @@ extern u64 ppc64_pft_size; #define SLICE_LOW_TOP (0x100000000ul) #define SLICE_NUM_LOW (SLICE_LOW_TOP >> SLICE_LOW_SHIFT) -#define SLICE_NUM_HIGH (PGTABLE_RANGE >> SLICE_HIGH_SHIFT) +#define SLICE_NUM_HIGH (H_PGTABLE_RANGE >> SLICE_HIGH_SHIFT) #define GET_LOW_SLICE_INDEX(addr) ((addr) >> SLICE_LOW_SHIFT) #define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index c9370d4..9ea0955 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -438,7 +438,11 @@ int main(void) DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry)); #endif +#ifdef MAX_PGD_TABLE_SIZE + DEFINE(PGD_TABLE_SIZE, MAX_PGD_TABLE_SIZE); +#else DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE); +#endif DEFINE(PTE_SIZE, sizeof(pte_t)); #ifdef CONFIG_KVM diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 2a193f2..aafc613 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -870,6 +870,18 @@ static void __init htab_initialize(void) void __init early_init_mmu(void) { + /* + * initialize page table size + */ + __pte_index_size = H_PTE_INDEX_SIZE; + __pmd_index_size = H_PMD_INDEX_SIZE; + __pud_index_size = H_PUD_INDEX_SIZE; + __pgd_index_size = H_PGD_INDEX_SIZE; + __pmd_cache_index = H_PMD_CACHE_INDEX; + __pte_table_size = H_PTE_TABLE_SIZE; + __pmd_table_size = H_PMD_TABLE_SIZE; + __pud_table_size = H_PUD_TABLE_SIZE; + __pgd_table_size = H_PGD_TABLE_SIZE; /* Initialize the MMU Hash table and create the linear mapping * of memory. Has to be done before SLB initialization as this is * currently where the page size encoding is obtained. diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 8d1daf7..09ca65e 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -66,11 +66,11 @@ #include "mmu_decl.h" #ifdef CONFIG_PPC_STD_MMU_64 -#if PGTABLE_RANGE > USER_VSID_RANGE +#if H_PGTABLE_RANGE > USER_VSID_RANGE #warning Limited user VSID range means pagetable space is wasted #endif -#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) +#if (TASK_SIZE_USER64 < H_PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) #warning TASK_SIZE is smaller than it needs to be. #endif #endif /* CONFIG_PPC_STD_MMU_64 */ diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/pgtable-book3e.c index b323735..a229893 100644 --- a/arch/powerpc/mm/pgtable-book3e.c +++ b/arch/powerpc/mm/pgtable-book3e.c @@ -77,6 +77,7 @@ int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) pmd_t *pmdp; pte_t *ptep; + BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE); if (slab_is_available()) { pgdp = pgd_offset_k(ea); pudp = pud_alloc(&init_mm, pgdp, ea); diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index f405a67..6f5fa68 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -61,6 +61,7 @@ int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) pmd_t *pmdp; pte_t *ptep; + BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); if (slab_is_available()) { pgdp = pgd_offset_k(ea); pudp = pud_alloc(&init_mm, pgdp, ea); diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 1fe356c..dafcd9f 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -58,11 +58,6 @@ #define CREATE_TRACE_POINTS #include -/* Some sanity checking */ -#if TASK_SIZE_USER64 > PGTABLE_RANGE -#error TASK_SIZE_USER64 exceeds pagetable range -#endif - #ifdef CONFIG_PPC_STD_MMU_64 #if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT)) #error TASK_SIZE_USER64 exceeds user VSID range @@ -75,6 +70,28 @@ */ struct prtb_entry *process_tb; struct patb_entry *partition_tb; +/* + * page table size + */ +unsigned long __pte_index_size; +EXPORT_SYMBOL(__pte_index_size); +unsigned long __pmd_index_size; +EXPORT_SYMBOL(__pmd_index_size); +unsigned long __pud_index_size; +EXPORT_SYMBOL(__pud_index_size); +unsigned long __pgd_index_size; +EXPORT_SYMBOL(__pgd_index_size); +unsigned long __pmd_cache_index; +EXPORT_SYMBOL(__pmd_cache_index); +unsigned long __pte_table_size; +EXPORT_SYMBOL(__pte_table_size); +unsigned long __pmd_table_size; +EXPORT_SYMBOL(__pmd_table_size); +unsigned long __pud_table_size; +EXPORT_SYMBOL(__pud_table_size); +unsigned long __pgd_table_size; +EXPORT_SYMBOL(__pgd_table_size); + #endif unsigned long ioremap_bot = IOREMAP_BASE; @@ -739,12 +756,6 @@ pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, int has_transparent_hugepage(void) { - BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) >= MAX_ORDER, - "hugepages can't be allocated by the buddy allocator"); - - BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) < 2, - "We need more than 2 pages to do deferred thp split"); - if (!mmu_has_feature(MMU_FTR_16M_PAGE)) return 0; /* diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index d337400..15b8f71 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -35,7 +35,7 @@ _GLOBAL(slb_allocate_realmode) * check for bad kernel/user address * (ea & ~REGION_MASK) >= PGTABLE_RANGE */ - rldicr. r9,r3,4,(63 - PGTABLE_EADDR_SIZE - 4) + rldicr. r9,r3,4,(63 - H_PGTABLE_EADDR_SIZE - 4) bne- 8f srdi r9,r3,60 /* get region */ diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 42954f0..ee21b86 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -37,8 +37,8 @@ #include /* some sanity checks */ -#if (PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE -#error PGTABLE_RANGE exceeds slice_mask high_slices size +#if (H_PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE +#error H_PGTABLE_RANGE exceeds slice_mask high_slices size #endif static DEFINE_SPINLOCK(slice_convert_lock); -- cgit v0.10.2 From aba480e1371358b419b2b680cc632f3210280b03 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:50 +1000 Subject: powerpc/mm: Move page table index and and vaddr to pgtable.h Now that the page table size is a variable, we can move these to generic pgtable.h. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 03044af..b601e05 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -90,18 +90,8 @@ #ifndef __ASSEMBLY__ #define pmd_bad(pmd) (pmd_val(pmd) & PMD_BAD_BITS) -#define pmd_page_vaddr(pmd) __va(pmd_val(pmd) & ~PMD_MASKED_BITS) #define pud_bad(pud) (pud_val(pud) & PUD_BAD_BITS) -#define pud_page_vaddr(pud) __va(pud_val(pud) & ~PUD_MASKED_BITS) - -/* Pointers in the page table tree are physical addresses */ -#define __pgtable_ptr_val(ptr) __pa(ptr) - -#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1)) -#define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1)) -#define pmd_index(address) (((address) >> (PMD_SHIFT)) & (PTRS_PER_PMD - 1)) -#define pte_index(address) (((address) >> (PAGE_SHIFT)) & (PTRS_PER_PTE - 1)) extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long pte, int huge); @@ -174,12 +164,6 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b) return (((pte_raw(pte_a) ^ pte_raw(pte_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0); } -static inline unsigned long pgd_page_vaddr(pgd_t pgd) -{ - return (unsigned long)__va(pgd_val(pgd) & ~PGD_MASKED_BITS); -} - - /* Generic accessors to PTE bits */ static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~H_PTE_NONE_MASK) == 0; } diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index f30f404..3e64a7b 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -534,6 +534,18 @@ static inline pgd_t pte_pgd(pte_t pte) extern struct page *pgd_page(pgd_t pgd); +/* Pointers in the page table tree are physical addresses */ +#define __pgtable_ptr_val(ptr) __pa(ptr) + +#define pmd_page_vaddr(pmd) __va(pmd_val(pmd) & ~PMD_MASKED_BITS) +#define pud_page_vaddr(pud) __va(pud_val(pud) & ~PUD_MASKED_BITS) +#define pgd_page_vaddr(pgd) __va(pgd_val(pgd) & ~PGD_MASKED_BITS) + +#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1)) +#define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1)) +#define pmd_index(address) (((address) >> (PMD_SHIFT)) & (PTRS_PER_PMD - 1)) +#define pte_index(address) (((address) >> (PAGE_SHIFT)) & (PTRS_PER_PTE - 1)) + /* * Find an entry in a page-table-directory. We combine the address region * (the high order N bits) and the pgd portion of the address. -- cgit v0.10.2 From 34fbadd8e98bbd85f503298152654f677db47ae3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:51 +1000 Subject: powerpc/mm: Move pte related functions together Only code movement. No functionality change. Signed-off-by: Aneesh Kumar K.V Acked-by: Balbir Singh Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 3e64a7b..2643487 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -430,6 +430,77 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); } +static inline bool pte_user(pte_t pte) +{ + return !(pte_val(pte) & _PAGE_PRIVILEGED); +} + +/* Encode and de-code a swap entry */ +#define MAX_SWAPFILES_CHECK() do { \ + BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \ + /* \ + * Don't have overlapping bits with _PAGE_HPTEFLAGS \ + * We filter HPTEFLAGS on set_pte. \ + */ \ + BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \ + BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \ + } while (0) +/* + * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT; + */ +#define SWP_TYPE_BITS 5 +#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ + & ((1UL << SWP_TYPE_BITS) - 1)) +#define __swp_offset(x) (((x).val & PTE_RPN_MASK) >> PAGE_SHIFT) +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) << _PAGE_BIT_SWAP_TYPE) \ + | (((offset) << PAGE_SHIFT) & PTE_RPN_MASK)}) +/* + * swp_entry_t must be independent of pte bits. We build a swp_entry_t from + * swap type and offset we get from swap and convert that to pte to find a + * matching pte in linux page table. + * Clear bits not found in swap entries here. + */ +#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE }) +#define __swp_entry_to_pte(x) __pte((x).val | _PAGE_PTE) + +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE)) +#else +#define _PAGE_SWP_SOFT_DIRTY 0UL +#endif /* CONFIG_MEM_SOFT_DIRTY */ + +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY); +} +static inline bool pte_swp_soft_dirty(pte_t pte) +{ + return !!(pte_val(pte) & _PAGE_SWP_SOFT_DIRTY); +} +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY); +} +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ + +static inline bool check_pte_access(unsigned long access, unsigned long ptev) +{ + /* + * This check for _PAGE_RWX and _PAGE_PRESENT bits + */ + if (access & ~ptev) + return false; + /* + * This check for access to privilege space + */ + if ((access & _PAGE_PRIVILEGED) != (ptev & _PAGE_PRIVILEGED)) + return false; + + return true; +} + #define _PAGE_CACHE_CTL (_PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT) #define pgprot_noncached pgprot_noncached @@ -576,77 +647,6 @@ extern struct page *pgd_page(pgd_t pgd); #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) -/* Encode and de-code a swap entry */ -#define MAX_SWAPFILES_CHECK() do { \ - BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \ - /* \ - * Don't have overlapping bits with _PAGE_HPTEFLAGS \ - * We filter HPTEFLAGS on set_pte. \ - */ \ - BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \ - BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \ - } while (0) -/* - * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT; - */ -#define SWP_TYPE_BITS 5 -#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ - & ((1UL << SWP_TYPE_BITS) - 1)) -#define __swp_offset(x) (((x).val & PTE_RPN_MASK) >> PAGE_SHIFT) -#define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << _PAGE_BIT_SWAP_TYPE) \ - | (((offset) << PAGE_SHIFT) & PTE_RPN_MASK)}) -/* - * swp_entry_t must be independent of pte bits. We build a swp_entry_t from - * swap type and offset we get from swap and convert that to pte to find a - * matching pte in linux page table. - * Clear bits not found in swap entries here. - */ -#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE }) -#define __swp_entry_to_pte(x) __pte((x).val | _PAGE_PTE) - -static inline bool pte_user(pte_t pte) -{ - return !(pte_val(pte) & _PAGE_PRIVILEGED); -} - -#ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE)) -#else -#define _PAGE_SWP_SOFT_DIRTY 0UL -#endif /* CONFIG_MEM_SOFT_DIRTY */ - -#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY -static inline pte_t pte_swp_mksoft_dirty(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY); -} -static inline bool pte_swp_soft_dirty(pte_t pte) -{ - return !!(pte_val(pte) & _PAGE_SWP_SOFT_DIRTY); -} -static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) -{ - return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY); -} -#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ - -static inline bool check_pte_access(unsigned long access, unsigned long ptev) -{ - /* - * This check for _PAGE_RWX and _PAGE_PRESENT bits - */ - if (access & ~ptev) - return false; - /* - * This check for access to privilege space - */ - if ((access & _PAGE_PRIVILEGED) != (ptev & _PAGE_PRIVILEGED)) - return false; - - return true; -} - void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); void pgtable_cache_init(void); -- cgit v0.10.2 From b0b5e9b1304765d079c71373df236733765f42ac Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:52 +1000 Subject: powerpc/mm/radix: Add radix pte #defines This adds Power ISA 3.0 specific pte defines. We share most of the details with hash Linux page table format. This patch indicates only things where we differ. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 2643487..c47c3f7 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -205,6 +205,7 @@ extern unsigned long __pgd_table_size; #endif /* __ASSEMBLY__ */ #include +#include #include /* diff --git a/arch/powerpc/include/asm/book3s/64/radix-4k.h b/arch/powerpc/include/asm/book3s/64/radix-4k.h new file mode 100644 index 0000000..7c3b1fe --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/radix-4k.h @@ -0,0 +1,12 @@ +#ifndef _ASM_POWERPC_PGTABLE_RADIX_4K_H +#define _ASM_POWERPC_PGTABLE_RADIX_4K_H + +/* + * For 4K page size supported index is 13/9/9/9 + */ +#define RADIX_PTE_INDEX_SIZE 9 /* 2MB huge page */ +#define RADIX_PMD_INDEX_SIZE 9 /* 1G huge page */ +#define RADIX_PUD_INDEX_SIZE 9 +#define RADIX_PGD_INDEX_SIZE 13 + +#endif /* _ASM_POWERPC_PGTABLE_RADIX_4K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/radix-64k.h b/arch/powerpc/include/asm/book3s/64/radix-64k.h new file mode 100644 index 0000000..82dc355 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/radix-64k.h @@ -0,0 +1,12 @@ +#ifndef _ASM_POWERPC_PGTABLE_RADIX_64K_H +#define _ASM_POWERPC_PGTABLE_RADIX_64K_H + +/* + * For 64K page size supported index is 13/9/9/5 + */ +#define RADIX_PTE_INDEX_SIZE 5 /* 2MB huge page */ +#define RADIX_PMD_INDEX_SIZE 9 /* 1G huge page */ +#define RADIX_PUD_INDEX_SIZE 9 +#define RADIX_PGD_INDEX_SIZE 13 + +#endif /* _ASM_POWERPC_PGTABLE_RADIX_64K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h new file mode 100644 index 0000000..4b581a0 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -0,0 +1,125 @@ +#ifndef _ASM_POWERPC_PGTABLE_RADIX_H +#define _ASM_POWERPC_PGTABLE_RADIX_H + +#ifndef __ASSEMBLY__ +#include +#endif + +#ifdef CONFIG_PPC_64K_PAGES +#include +#else +#include +#endif + +/* An empty PTE can still have a R or C writeback */ +#define RADIX_PTE_NONE_MASK (_PAGE_DIRTY | _PAGE_ACCESSED) + +/* Bits to set in a RPMD/RPUD/RPGD */ +#define RADIX_PMD_VAL_BITS (0x8000000000000000UL | RADIX_PTE_INDEX_SIZE) +#define RADIX_PUD_VAL_BITS (0x8000000000000000UL | RADIX_PMD_INDEX_SIZE) +#define RADIX_PGD_VAL_BITS (0x8000000000000000UL | RADIX_PUD_INDEX_SIZE) + +/* Don't have anything in the reserved bits and leaf bits */ +#define RADIX_PMD_BAD_BITS 0x60000000000000e0UL +#define RADIX_PUD_BAD_BITS 0x60000000000000e0UL +#define RADIX_PGD_BAD_BITS 0x60000000000000e0UL + +/* + * Size of EA range mapped by our pagetables. + */ +#define RADIX_PGTABLE_EADDR_SIZE (RADIX_PTE_INDEX_SIZE + RADIX_PMD_INDEX_SIZE + \ + RADIX_PUD_INDEX_SIZE + RADIX_PGD_INDEX_SIZE + PAGE_SHIFT) +#define RADIX_PGTABLE_RANGE (ASM_CONST(1) << RADIX_PGTABLE_EADDR_SIZE) + +#ifndef __ASSEMBLY__ +#define RADIX_PTE_TABLE_SIZE (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE) +#define RADIX_PMD_TABLE_SIZE (sizeof(pmd_t) << RADIX_PMD_INDEX_SIZE) +#define RADIX_PUD_TABLE_SIZE (sizeof(pud_t) << RADIX_PUD_INDEX_SIZE) +#define RADIX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE) + +static inline unsigned long radix__pte_update(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, unsigned long clr, + unsigned long set, + int huge) +{ + pte_t pte; + unsigned long old_pte, new_pte; + + do { + pte = READ_ONCE(*ptep); + old_pte = pte_val(pte); + new_pte = (old_pte | set) & ~clr; + + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + /* We already do a sync in cmpxchg, is ptesync needed ?*/ + asm volatile("ptesync" : : : "memory"); + /* huge pages use the old page table lock */ + if (!huge) + assert_pte_locked(mm, addr); + + return old_pte; +} + +/* + * Set the dirty and/or accessed bits atomically in a linux PTE, this + * function doesn't need to invalidate tlb. + */ +static inline void radix__ptep_set_access_flags(pte_t *ptep, pte_t entry) +{ + pte_t pte; + unsigned long old_pte, new_pte; + unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED | + _PAGE_RW | _PAGE_EXEC); + do { + pte = READ_ONCE(*ptep); + old_pte = pte_val(pte); + new_pte = old_pte | set; + + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + /* We already do a sync in cmpxchg, is ptesync needed ?*/ + asm volatile("ptesync" : : : "memory"); +} + +static inline int radix__pte_same(pte_t pte_a, pte_t pte_b) +{ + return ((pte_raw(pte_a) ^ pte_raw(pte_b)) == 0); +} + +static inline int radix__pte_none(pte_t pte) +{ + return (pte_val(pte) & ~RADIX_PTE_NONE_MASK) == 0; +} + +static inline void radix__set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, int percpu) +{ + *ptep = pte; + asm volatile("ptesync" : : : "memory"); +} + +static inline int radix__pmd_bad(pmd_t pmd) +{ + return !!(pmd_val(pmd) & RADIX_PMD_BAD_BITS); +} + +static inline int radix__pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + return ((pmd_raw(pmd_a) ^ pmd_raw(pmd_b)) == 0); +} + +static inline int radix__pud_bad(pud_t pud) +{ + return !!(pud_val(pud) & RADIX_PUD_BAD_BITS); +} + + +static inline int radix__pgd_bad(pgd_t pgd) +{ + return !!(pgd_val(pgd) & RADIX_PGD_BAD_BITS); +} + +#endif /* __ASSEMBLY__ */ +#endif -- cgit v0.10.2 From 566ca99af026d43c7cf7ab72f78463dbcc7e6ac2 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:53 +1000 Subject: powerpc/mm/radix: Add dummy radix_enabled() In this patch we add the radix Kconfig and conditional check. radix_enabled() is written to always return 0 here. Once we have all needed radix changes added, we will update this to an mmu_feature check. We need to add this early so that we can get it all build in the early stage. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index b86786f..593d9e3 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -19,6 +19,8 @@ struct mmu_psize_def { unsigned long sllp; /* SLB L||LP (exact mask to use in slbmte) */ }; extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; + +#define radix_enabled() (0) #endif /* __ASSEMBLY__ */ /* 64-bit classic hash table MMU */ diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 5f55024..decde4c 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -204,6 +204,9 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr) # include #endif +#ifndef radix_enabled +#define radix_enabled() (0) +#endif #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_MMU_H_ */ diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 142dff5..8d2cfb8 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -331,6 +331,15 @@ config PPC_STD_MMU_64 def_bool y depends on PPC_STD_MMU && PPC64 +config PPC_RADIX_MMU + bool "Radix MMU Support" + depends on PPC_BOOK3S_64 + default y + help + Enable support for the Power ISA 3.0 Radix style MMU. Currently this + is only implemented by IBM Power9 CPUs, if you don't have one of them + you can probably disable this. + config PPC_MMU_NOHASH def_bool y depends on !PPC_STD_MMU -- cgit v0.10.2 From ac94ac79dc0fc473aebcbe2ba4c26a5341b866be Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:54 +1000 Subject: powerpc/mm: Add radix callbacks to pte accessors For those pte accessors, that operate on a different set of pte bits between hash/radix, we add a generic variant that does a conditional to hash linux or radix variant. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index b601e05..6271439 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -85,23 +85,26 @@ #define _PTEIDX_SECONDARY 0x8 #define _PTEIDX_GROUP_IX 0x7 -#define PMD_BAD_BITS (PTE_TABLE_SIZE-1) -#define PUD_BAD_BITS (PMD_TABLE_SIZE-1) +#define H_PMD_BAD_BITS (PTE_TABLE_SIZE-1) +#define H_PUD_BAD_BITS (PMD_TABLE_SIZE-1) #ifndef __ASSEMBLY__ -#define pmd_bad(pmd) (pmd_val(pmd) & PMD_BAD_BITS) - -#define pud_bad(pud) (pud_val(pud) & PUD_BAD_BITS) +#define hash__pmd_bad(pmd) (pmd_val(pmd) & H_PMD_BAD_BITS) +#define hash__pud_bad(pud) (pud_val(pud) & H_PUD_BAD_BITS) +static inline int hash__pgd_bad(pgd_t pgd) +{ + return (pgd_val(pgd) == 0); +} extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long pte, int huge); extern unsigned long htab_convert_pte_flags(unsigned long pteflags); /* Atomic PTE updates */ -static inline unsigned long pte_update(struct mm_struct *mm, - unsigned long addr, - pte_t *ptep, unsigned long clr, - unsigned long set, - int huge) +static inline unsigned long hash__pte_update(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, unsigned long clr, + unsigned long set, + int huge) { __be64 old_be, tmp_be; unsigned long old; @@ -132,7 +135,7 @@ static inline unsigned long pte_update(struct mm_struct *mm, /* Set the dirty and/or accessed bits atomically in a linux PTE, this * function doesn't need to flush the hash entry */ -static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) +static inline void hash__ptep_set_access_flags(pte_t *ptep, pte_t entry) { __be64 old, tmp, val, mask; @@ -153,27 +156,23 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) :"cc"); } -static inline int pgd_bad(pgd_t pgd) +static inline int hash__pte_same(pte_t pte_a, pte_t pte_b) { - return (pgd_val(pgd) == 0); + return (((pte_raw(pte_a) ^ pte_raw(pte_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0); } -#define __HAVE_ARCH_PTE_SAME -static inline int pte_same(pte_t pte_a, pte_t pte_b) +static inline int hash__pte_none(pte_t pte) { - return (((pte_raw(pte_a) ^ pte_raw(pte_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0); + return (pte_val(pte) & ~H_PTE_NONE_MASK) == 0; } -/* Generic accessors to PTE bits */ -static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~H_PTE_NONE_MASK) == 0; } - /* This low level function performs the actual PTE insertion * Setting the PTE depends on the MMU type and other factors. It's * an horrible mess that I'm not going to try to clean up now but * I'm keeping it in one place rather than spread around */ -static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte, int percpu) +static inline void hash__set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, int percpu) { /* * Anything else just stores the PTE normally. That covers all 64-bit diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index c47c3f7..bb0639a 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -262,6 +262,14 @@ extern unsigned long __pgd_table_size; #endif /* __real_pte */ +static inline unsigned long pte_update(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long clr, + unsigned long set, int huge) +{ + if (radix_enabled()) + return radix__pte_update(mm, addr, ptep, clr, set, huge); + return hash__pte_update(mm, addr, ptep, clr, set, huge); +} /* * For hash even if we have _PAGE_ACCESSED = 0, we do a pte_update. * We currently remove entries from the hashtable regardless of whether @@ -501,6 +509,39 @@ static inline bool check_pte_access(unsigned long access, unsigned long ptev) return true; } +/* + * Generic functions with hash/radix callbacks + */ + +static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) +{ + if (radix_enabled()) + return radix__ptep_set_access_flags(ptep, entry); + return hash__ptep_set_access_flags(ptep, entry); +} + +#define __HAVE_ARCH_PTE_SAME +static inline int pte_same(pte_t pte_a, pte_t pte_b) +{ + if (radix_enabled()) + return radix__pte_same(pte_a, pte_b); + return hash__pte_same(pte_a, pte_b); +} + +static inline int pte_none(pte_t pte) +{ + if (radix_enabled()) + return radix__pte_none(pte); + return hash__pte_none(pte); +} + +static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, int percpu) +{ + if (radix_enabled()) + return radix__set_pte_at(mm, addr, ptep, pte, percpu); + return hash__set_pte_at(mm, addr, ptep, pte, percpu); +} #define _PAGE_CACHE_CTL (_PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT) @@ -555,6 +596,13 @@ static inline void pmd_clear(pmd_t *pmdp) #define pmd_none(pmd) (!pmd_val(pmd)) #define pmd_present(pmd) (!pmd_none(pmd)) +static inline int pmd_bad(pmd_t pmd) +{ + if (radix_enabled()) + return radix__pmd_bad(pmd); + return hash__pmd_bad(pmd); +} + static inline void pud_set(pud_t *pudp, unsigned long val) { *pudp = __pud(val); @@ -580,6 +628,15 @@ static inline pud_t pte_pud(pte_t pte) return __pud(pte_val(pte)); } #define pud_write(pud) pte_write(pud_pte(pud)) + +static inline int pud_bad(pud_t pud) +{ + if (radix_enabled()) + return radix__pud_bad(pud); + return hash__pud_bad(pud); +} + + #define pgd_write(pgd) pte_write(pgd_pte(pgd)) static inline void pgd_set(pgd_t *pgdp, unsigned long val) { @@ -604,6 +661,13 @@ static inline pgd_t pte_pgd(pte_t pte) return __pgd(pte_val(pte)); } +static inline int pgd_bad(pgd_t pgd) +{ + if (radix_enabled()) + return radix__pgd_bad(pgd); + return hash__pgd_bad(pgd); +} + extern struct page *pgd_page(pgd_t pgd); /* Pointers in the page table tree are physical addresses */ -- cgit v0.10.2 From a9252aaefe7e72133e7a37e0eff4e950a4f33af1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:55 +1000 Subject: powerpc/mm: Move hugetlb and THP related pmd accessors to pgtable.h Here we create pgtable-64/4k.h and move pmd accessors that are common between hash and radix there. We can't do much sharing with 4K Linux page size because 4K Linux page size with hash config doesn't support THP. So for now it is empty. In later patches we will add functions that does conditional hash/radix accessors there. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 7d2a9b6..d4e1f1f 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -118,57 +118,6 @@ static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, #define H_PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE) #define H_PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) -#ifdef CONFIG_HUGETLB_PAGE -/* - * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have - * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD; - * - * Defined in such a way that we can optimize away code block at build time - * if CONFIG_HUGETLB_PAGE=n. - */ -static inline int pmd_huge(pmd_t pmd) -{ - /* - * leaf pte for huge page - */ - return !!(pmd_val(pmd) & _PAGE_PTE); -} - -static inline int pud_huge(pud_t pud) -{ - /* - * leaf pte for huge page - */ - return !!(pud_val(pud) & _PAGE_PTE); -} - -static inline int pgd_huge(pgd_t pgd) -{ - /* - * leaf pte for huge page - */ - return !!(pgd_val(pgd) & _PAGE_PTE); -} -#define pgd_huge pgd_huge - -#ifdef CONFIG_DEBUG_VM -extern int hugepd_ok(hugepd_t hpd); -#define is_hugepd(hpd) (hugepd_ok(hpd)) -#else -/* - * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't - * need to setup hugepage directory for them. Our pte and page directory format - * enable us to have this enabled. - */ -static inline int hugepd_ok(hugepd_t hpd) -{ - return 0; -} -#define is_hugepd(pdep) 0 -#endif /* CONFIG_DEBUG_VM */ - -#endif /* CONFIG_HUGETLB_PAGE */ - #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, @@ -239,44 +188,12 @@ static inline int pmd_trans_huge(pmd_t pmd) (_PAGE_PTE | H_PAGE_THP_HUGE)); } -static inline int pmd_large(pmd_t pmd) -{ - return !!(pmd_val(pmd) & _PAGE_PTE); -} - -static inline pmd_t pmd_mknotpresent(pmd_t pmd) -{ - return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT); -} - #define __HAVE_ARCH_PMD_SAME static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { return (((pmd_raw(pmd_a) ^ pmd_raw(pmd_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0); } -static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) -{ - unsigned long old; - - if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) - return 0; - old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0); - return ((old & _PAGE_ACCESSED) != 0); -} - -#define __HAVE_ARCH_PMDP_SET_WRPROTECT -static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - - if ((pmd_val(*pmdp) & _PAGE_WRITE) == 0) - return; - - pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0); -} - #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h new file mode 100644 index 0000000..423735f --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h @@ -0,0 +1,7 @@ +#ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H +#define _ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H +/* + * hash 4k can't share hugetlb and also doesn't support THP + */ + +#endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h new file mode 100644 index 0000000..ceadc2f --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h @@ -0,0 +1,94 @@ +#ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H +#define _ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H + +#ifndef __ASSEMBLY__ +#ifdef CONFIG_HUGETLB_PAGE +/* + * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have + * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD; + * + * Defined in such a way that we can optimize away code block at build time + * if CONFIG_HUGETLB_PAGE=n. + */ +static inline int pmd_huge(pmd_t pmd) +{ + /* + * leaf pte for huge page + */ + return !!(pmd_val(pmd) & _PAGE_PTE); +} + +static inline int pud_huge(pud_t pud) +{ + /* + * leaf pte for huge page + */ + return !!(pud_val(pud) & _PAGE_PTE); +} + +static inline int pgd_huge(pgd_t pgd) +{ + /* + * leaf pte for huge page + */ + return !!(pgd_val(pgd) & _PAGE_PTE); +} +#define pgd_huge pgd_huge + +#ifdef CONFIG_DEBUG_VM +extern int hugepd_ok(hugepd_t hpd); +#define is_hugepd(hpd) (hugepd_ok(hpd)) +#else +/* + * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't + * need to setup hugepage directory for them. Our pte and page directory format + * enable us to have this enabled. + */ +static inline int hugepd_ok(hugepd_t hpd) +{ + return 0; +} +#define is_hugepd(pdep) 0 +#endif /* CONFIG_DEBUG_VM */ + +#endif /* CONFIG_HUGETLB_PAGE */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_large(pmd_t pmd) +{ + return !!(pmd_val(pmd) & _PAGE_PTE); +} + +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT); +} +/* + * For radix we should always find H_PAGE_HASHPTE zero. Hence + * the below will work for radix too + */ +static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + unsigned long old; + + if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) + return 0; + old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0); + return ((old & _PAGE_ACCESSED) != 0); +} + +#define __HAVE_ARCH_PMDP_SET_WRPROTECT +static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + + if ((pmd_val(*pmdp) & _PAGE_WRITE) == 0) + return; + + pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0); +} + +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif /* __ASSEMBLY__ */ +#endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index bb0639a..a22b478 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -206,6 +206,13 @@ extern unsigned long __pgd_table_size; #include #include + +#ifdef CONFIG_PPC_64K_PAGES +#include +#else +#include +#endif + #include /* -- cgit v0.10.2 From 6cc1a0ee4ce29ad1cbdc622db6f9bc16d3056067 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:56 +1000 Subject: powerpc/mm/radix: Add radix callback for pmd accessors This only does 64K Linux page support for now. 64K hash Linux config THP needs to differentiate it from hugetlb huge page because with THP we need to track hash pte slot information with respect to each subpage. This is not needed with hugetlb hugepage, because we don't do MPSS with hugetlb. Radix doesn't have any such restrictions. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index d4e1f1f..c00e39f 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -97,8 +97,8 @@ extern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index); extern int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); -static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, pgprot_t prot) +static inline int hash__remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t prot) { if (pfn > (PTE_RPN_MASK >> PAGE_SHIFT)) { WARN(1, "remap_4k_pfn called with wrong pfn value\n"); @@ -182,14 +182,13 @@ static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array, * that for explicit huge pages. * */ -static inline int pmd_trans_huge(pmd_t pmd) +static inline int hash__pmd_trans_huge(pmd_t pmd) { return !!((pmd_val(pmd) & (_PAGE_PTE | H_PAGE_THP_HUGE)) == (_PAGE_PTE | H_PAGE_THP_HUGE)); } -#define __HAVE_ARCH_PMD_SAME -static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) +static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b) { return (((pmd_raw(pmd_a) ^ pmd_raw(pmd_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0); } diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h index ceadc2f..27b5e34 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h @@ -89,6 +89,29 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0); } +static inline int pmd_trans_huge(pmd_t pmd) +{ + if (radix_enabled()) + return radix__pmd_trans_huge(pmd); + return hash__pmd_trans_huge(pmd); +} + +#define __HAVE_ARCH_PMD_SAME +static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + if (radix_enabled()) + return radix__pmd_same(pmd_a, pmd_b); + return hash__pmd_same(pmd_a, pmd_b); +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t prot) +{ + if (radix_enabled()) + BUG(); + return hash__remap_4k_pfn(vma, addr, pfn, prot); + +} #endif /* __ASSEMBLY__ */ #endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 4b581a0..db7e678 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -121,5 +121,14 @@ static inline int radix__pgd_bad(pgd_t pgd) return !!(pgd_val(pgd) & RADIX_PGD_BAD_BITS); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +static inline int radix__pmd_trans_huge(pmd_t pmd) +{ + return !!(pmd_val(pmd) & _PAGE_PTE); +} + +#endif + #endif /* __ASSEMBLY__ */ #endif -- cgit v0.10.2 From 756d08d1ba169ed9eae4d48e9a0af014e86593fc Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:57 +1000 Subject: powerpc/mm: Abstract early MMU init in preparation for radix Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 593d9e3..37943f7 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -97,5 +97,25 @@ extern int mmu_vmalloc_psize; extern int mmu_vmemmap_psize; extern int mmu_io_psize; +/* MMU initialization */ +extern void hash__early_init_mmu(void); +static inline void early_init_mmu(void) +{ + return hash__early_init_mmu(); +} +extern void hash__early_init_mmu_secondary(void); +static inline void early_init_mmu_secondary(void) +{ + return hash__early_init_mmu_secondary(); +} + +extern void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size); +static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + return hash__setup_initial_memory_limit(first_memblock_base, + first_memblock_size); +} #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */ diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index decde4c..34070e9 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -122,13 +122,6 @@ static inline void mmu_clear_feature(unsigned long feature) extern unsigned int __start___mmu_ftr_fixup, __stop___mmu_ftr_fixup; -/* MMU initialization */ -extern void early_init_mmu(void); -extern void early_init_mmu_secondary(void); - -extern void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size); - #ifdef CONFIG_PPC64 /* This is our real memory area size on ppc64 server, on embedded, we * make it match the size our of bolted TLB area @@ -185,6 +178,13 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr) #include #else /* CONFIG_PPC_BOOK3S_64 */ +#ifndef __ASSEMBLY__ +/* MMU initialization */ +extern void early_init_mmu(void); +extern void early_init_mmu_secondary(void); +extern void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size); +#endif /* __ASSEMBLY__ */ #endif #if defined(CONFIG_PPC_STD_MMU_32) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index aafc613..e6b53cd 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -868,7 +868,7 @@ static void __init htab_initialize(void) #undef KB #undef MB -void __init early_init_mmu(void) +void __init hash__early_init_mmu(void) { /* * initialize page table size @@ -893,7 +893,7 @@ void __init early_init_mmu(void) } #ifdef CONFIG_SMP -void early_init_mmu_secondary(void) +void hash__early_init_mmu_secondary(void) { /* Initialize hash table for that CPU */ if (!firmware_has_feature(FW_FEATURE_LPAR)) @@ -1635,7 +1635,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) } #endif /* CONFIG_DEBUG_PAGEALLOC */ -void setup_initial_memory_limit(phys_addr_t first_memblock_base, +void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { /* We don't currently support the first MEMBLOCK not mapping 0 -- cgit v0.10.2 From 2bfd65e45e877fb5704730244da67c748d28a1b8 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:58 +1000 Subject: powerpc/mm/radix: Add radix callbacks for early init routines This adds routines for early setup for radix. We use device tree property "ibm,processor-radix-AP-encodings" to find supported page sizes. If we don't find the above we consider 64K and 4K as supported page sizes. We do map vmemap using 2M page size if we can. The linear mapping is done such that we use required page size for that range. For example memory of 3.5G is mapped such that we use 1G mapping till 3G range and use 2M mapping for the rest. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 37943f7..94d52ff 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -16,7 +16,10 @@ struct mmu_psize_def { int penc[MMU_PAGE_COUNT]; /* HPTE encoding */ unsigned int tlbiel; /* tlbiel supported for that page size */ unsigned long avpnm; /* bits to mask out in AVPN in the HPTE */ - unsigned long sllp; /* SLB L||LP (exact mask to use in slbmte) */ + union { + unsigned long sllp; /* SLB L||LP (exact mask to use in slbmte) */ + unsigned long ap; /* Ap encoding used by PowerISA 3.0 */ + }; }; extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; @@ -98,22 +101,34 @@ extern int mmu_vmemmap_psize; extern int mmu_io_psize; /* MMU initialization */ +extern void radix_init_native(void); extern void hash__early_init_mmu(void); +extern void radix__early_init_mmu(void); static inline void early_init_mmu(void) { + if (radix_enabled()) + return radix__early_init_mmu(); return hash__early_init_mmu(); } extern void hash__early_init_mmu_secondary(void); +extern void radix__early_init_mmu_secondary(void); static inline void early_init_mmu_secondary(void) { + if (radix_enabled()) + return radix__early_init_mmu_secondary(); return hash__early_init_mmu_secondary(); } extern void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size); +extern void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size); static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { + if (radix_enabled()) + return radix__setup_initial_memory_limit(first_memblock_base, + first_memblock_size); return hash__setup_initial_memory_limit(first_memblock_base, first_memblock_size); } diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index db7e678..a26259f 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -130,5 +130,7 @@ static inline int radix__pmd_trans_huge(pmd_t pmd) #endif +extern int radix__map_kernel_page(unsigned long ea, unsigned long pa, + pgprot_t flags, unsigned int psz); #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 47082bc..c1e82e9 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -347,6 +347,7 @@ #define LPCR_LPES_SH 2 #define LPCR_RMI 0x00000002 /* real mode is cache inhibit */ #define LPCR_HDICE 0x00000001 /* Hyp Decr enable (HV,PR,EE) */ +#define LPCR_UPRT 0x00400000 /* Use Process Table (ISA 3) */ #ifndef SPRN_LPID #define SPRN_LPID 0x13F /* Logical Partition Identifier */ #endif diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index ef77899..d734934 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o obj-$(CONFIG_PPC_STD_MMU_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) +obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(CONFIG_WORD_SIZE).o \ mmu_context_hash$(CONFIG_WORD_SIZE).o diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c new file mode 100644 index 0000000..4918850 --- /dev/null +++ b/arch/powerpc/mm/pgtable-radix.c @@ -0,0 +1,356 @@ +/* + * Page table handling routines for radix page table. + * + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static int native_update_partition_table(u64 patb1) +{ + partition_tb->patb1 = cpu_to_be64(patb1); + return 0; +} + +static __ref void *early_alloc_pgtable(unsigned long size) +{ + void *pt; + + pt = __va(memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE)); + memset(pt, 0, size); + + return pt; +} + +int radix__map_kernel_page(unsigned long ea, unsigned long pa, + pgprot_t flags, + unsigned int map_page_size) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + /* + * Make sure task size is correct as per the max adddr + */ + BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + if (map_page_size == PUD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + if (map_page_size == PMD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + } else { + pgdp = pgd_offset_k(ea); + if (pgd_none(*pgdp)) { + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); + BUG_ON(pudp == NULL); + pgd_populate(&init_mm, pgdp, pudp); + } + pudp = pud_offset(pgdp, ea); + if (map_page_size == PUD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + if (pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + BUG_ON(pmdp == NULL); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (map_page_size == PMD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PAGE_SIZE); + BUG_ON(ptep == NULL); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + } + +set_the_pte: + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, flags)); + smp_wmb(); + return 0; +} + +static void __init radix_init_pgtable(void) +{ + int loop_count; + u64 base, end, start_addr; + unsigned long rts_field; + struct memblock_region *reg; + unsigned long linear_page_size; + + /* We don't support slb for radix */ + mmu_slb_size = 0; + /* + * Create the linear mapping, using standard page size for now + */ + loop_count = 0; + for_each_memblock(memory, reg) { + + start_addr = reg->base; + +redo: + if (loop_count < 1 && mmu_psize_defs[MMU_PAGE_1G].shift) + linear_page_size = PUD_SIZE; + else if (loop_count < 2 && mmu_psize_defs[MMU_PAGE_2M].shift) + linear_page_size = PMD_SIZE; + else + linear_page_size = PAGE_SIZE; + + base = _ALIGN_UP(start_addr, linear_page_size); + end = _ALIGN_DOWN(reg->base + reg->size, linear_page_size); + + pr_info("Mapping range 0x%lx - 0x%lx with 0x%lx\n", + (unsigned long)base, (unsigned long)end, + linear_page_size); + + while (base < end) { + radix__map_kernel_page((unsigned long)__va(base), + base, PAGE_KERNEL_X, + linear_page_size); + base += linear_page_size; + } + /* + * map the rest using lower page size + */ + if (end < reg->base + reg->size) { + start_addr = end; + loop_count++; + goto redo; + } + } + /* + * Allocate Partition table and process table for the + * host. + */ + BUILD_BUG_ON_MSG((PRTB_SIZE_SHIFT > 23), "Process table size too large."); + process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT); + /* + * Fill in the process table. + * we support 52 bits, hence 52-28 = 24, 11000 + */ + rts_field = 3ull << PPC_BITLSHIFT(2); + process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); + /* + * Fill in the partition table. We are suppose to use effective address + * of process table here. But our linear mapping also enable us to use + * physical address here. + */ + ppc_md.update_partition_table(__pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR); + pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd); +} + +static void __init radix_init_partition_table(void) +{ + unsigned long rts_field; + /* + * we support 52 bits, hence 52-28 = 24, 11000 + */ + rts_field = 3ull << PPC_BITLSHIFT(2); + + BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large."); + partition_tb = early_alloc_pgtable(1UL << PATB_SIZE_SHIFT); + partition_tb->patb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | + RADIX_PGD_INDEX_SIZE | PATB_HR); + printk("Partition table %p\n", partition_tb); + + memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); + /* + * update partition table control register, + * 64 K size. + */ + mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); +} + +void __init radix_init_native(void) +{ + ppc_md.update_partition_table = native_update_partition_table; +} + +static int __init get_idx_from_shift(unsigned int shift) +{ + int idx = -1; + + switch (shift) { + case 0xc: + idx = MMU_PAGE_4K; + break; + case 0x10: + idx = MMU_PAGE_64K; + break; + case 0x15: + idx = MMU_PAGE_2M; + break; + case 0x1e: + idx = MMU_PAGE_1G; + break; + } + return idx; +} + +static int __init radix_dt_scan_page_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + int size = 0; + int shift, idx; + unsigned int ap; + const __be32 *prop; + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size); + if (!prop) + return 0; + + pr_info("Page sizes from device-tree:\n"); + for (; size >= 4; size -= 4, ++prop) { + + struct mmu_psize_def *def; + + /* top 3 bit is AP encoding */ + shift = be32_to_cpu(prop[0]) & ~(0xe << 28); + ap = be32_to_cpu(prop[0]) >> 29; + pr_info("Page size sift = %d AP=0x%x\n", shift, ap); + + idx = get_idx_from_shift(shift); + if (idx < 0) + continue; + + def = &mmu_psize_defs[idx]; + def->shift = shift; + def->ap = ap; + } + + /* needed ? */ + cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; + return 1; +} + +static void __init radix_init_page_sizes(void) +{ + int rc; + + /* + * Try to find the available page sizes in the device-tree + */ + rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL); + if (rc != 0) /* Found */ + goto found; + /* + * let's assume we have page 4k and 64k support + */ + mmu_psize_defs[MMU_PAGE_4K].shift = 12; + mmu_psize_defs[MMU_PAGE_4K].ap = 0x0; + + mmu_psize_defs[MMU_PAGE_64K].shift = 16; + mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; +found: +#ifdef CONFIG_SPARSEMEM_VMEMMAP + if (mmu_psize_defs[MMU_PAGE_2M].shift) { + /* + * map vmemmap using 2M if available + */ + mmu_vmemmap_psize = MMU_PAGE_2M; + } +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + return; +} + +void __init radix__early_init_mmu(void) +{ + unsigned long lpcr; + /* + * setup LPCR UPRT based on mmu_features + */ + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_UPRT); + +#ifdef CONFIG_PPC_64K_PAGES + /* PAGE_SIZE mappings */ + mmu_virtual_psize = MMU_PAGE_64K; +#else + mmu_virtual_psize = MMU_PAGE_4K; +#endif + +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* vmemmap mapping */ + mmu_vmemmap_psize = mmu_virtual_psize; +#endif + /* + * initialize page table size + */ + __pte_index_size = RADIX_PTE_INDEX_SIZE; + __pmd_index_size = RADIX_PMD_INDEX_SIZE; + __pud_index_size = RADIX_PUD_INDEX_SIZE; + __pgd_index_size = RADIX_PGD_INDEX_SIZE; + __pmd_cache_index = RADIX_PMD_INDEX_SIZE; + __pte_table_size = RADIX_PTE_TABLE_SIZE; + __pmd_table_size = RADIX_PMD_TABLE_SIZE; + __pud_table_size = RADIX_PUD_TABLE_SIZE; + __pgd_table_size = RADIX_PGD_TABLE_SIZE; + + radix_init_page_sizes(); + + if (!firmware_has_feature(FW_FEATURE_LPAR)) + radix_init_partition_table(); + + radix_init_pgtable(); +} + +void radix__early_init_mmu_secondary(void) +{ + unsigned long lpcr; + /* + * setup LPCR UPRT based on mmu_features + */ + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_UPRT); + /* + * update partition table control register, 64 K size. + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) + mtspr(SPRN_PTCR, + __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); +} + +void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* Finally limit subsequent allocations */ + memblock_set_current_limit(first_memblock_base + first_memblock_size); +} diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 1acb0c7..ee6430b 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -273,7 +273,10 @@ static int __init pnv_probe(void) if (!of_flat_dt_is_compatible(root, "ibm,powernv")) return 0; - hpte_init_native(); + if (IS_ENABLED(CONFIG_PPC_RADIX_MMU) && radix_enabled()) + radix_init_native(); + else if (IS_ENABLED(CONFIG_PPC_STD_MMU_64)) + hpte_init_native(); if (firmware_has_feature(FW_FEATURE_OPAL)) pnv_setup_machdep_opal(); -- cgit v0.10.2 From 31a14fae92b29b7a7fcc65600f072bf448d3b0eb Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:25:59 +1000 Subject: powerpc/mm: Abstraction for vmemmap and map_kernel_page() For hash we create vmemmap mapping using bolted hash page table entries. For radix we fill the radix page table. The next patch will add the radix details for creating vmemmap mappings. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 6271439..cd3e915 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -193,6 +193,14 @@ static inline void hpte_do_hugepage_flush(struct mm_struct *mm, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +extern int hash__map_kernel_page(unsigned long ea, unsigned long pa, + unsigned long flags); +extern int __meminit hash__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys); +extern void hash__vmemmap_remove_mapping(unsigned long start, + unsigned long page_size); #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_BOOK3S_64_HASH_H */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index a22b478..c1b787d 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -722,6 +722,26 @@ extern struct page *pgd_page(pgd_t pgd); void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); void pgtable_cache_init(void); +static inline int map_kernel_page(unsigned long ea, unsigned long pa, + unsigned long flags) +{ + return hash__map_kernel_page(ea, pa, flags); +} + +static inline int __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + return hash__vmemmap_create_mapping(start, page_size, phys); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static inline void vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ + return hash__vmemmap_remove_mapping(start, page_size); +} +#endif struct page *realmode_pfn_to_page(unsigned long pfn); #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 10debb9..f143d6f 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -362,6 +362,13 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); void pgtable_cache_init(void); +extern int map_kernel_page(unsigned long ea, unsigned long pa, + unsigned long flags); +extern int __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys); +extern void vmemmap_remove_mapping(unsigned long start, + unsigned long page_size); #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_NOHASH_64_PGTABLE_H */ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 09ca65e..33709bd 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -240,9 +240,6 @@ static __meminit void vmemmap_list_populate(unsigned long phys, vmemmap_list = vmem_back; } -extern int __meminit vmemmap_create_mapping(unsigned long start, - unsigned long page_size, - unsigned long phys); int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) { unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; @@ -281,8 +278,6 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) } #ifdef CONFIG_MEMORY_HOTPLUG -extern void vmemmap_remove_mapping(unsigned long start, - unsigned long page_size); static unsigned long vmemmap_list_free(unsigned long start) { struct vmemmap_backing *vmem_back, *vmem_back_prev; diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index bfb7c0b..6af6532 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -108,11 +108,6 @@ extern unsigned long Hash_size, Hash_mask; #endif /* CONFIG_PPC32 */ -#ifdef CONFIG_PPC64 -extern int map_kernel_page(unsigned long ea, unsigned long pa, - unsigned long flags); -#endif /* CONFIG_PPC64 */ - extern unsigned long ioremap_bot; extern unsigned long __max_low_memory; extern phys_addr_t __initial_memory_limit_addr; diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index 6f5fa68..64975eb 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -20,9 +20,9 @@ * On hash-based CPUs, the vmemmap is bolted in the hash table. * */ -int __meminit vmemmap_create_mapping(unsigned long start, - unsigned long page_size, - unsigned long phys) +int __meminit hash__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) { int rc = htab_bolt_mapping(start, start + page_size, phys, pgprot_val(PAGE_KERNEL), @@ -37,8 +37,8 @@ int __meminit vmemmap_create_mapping(unsigned long start, } #ifdef CONFIG_MEMORY_HOTPLUG -void vmemmap_remove_mapping(unsigned long start, - unsigned long page_size) +void hash__vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) { int rc = htab_remove_mapping(start, start + page_size, mmu_vmemmap_psize, @@ -54,7 +54,7 @@ void vmemmap_remove_mapping(unsigned long start, * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) +int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) { pgd_t *pgdp; pud_t *pudp; -- cgit v0.10.2 From d9225ad923514252ec8d10dac67e5a7a2f0aa345 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:00 +1000 Subject: powerpc/mm/radix: Add radix callbacks for vmemmap and map_kernel page() Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index c1b787d..9bc2471 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -725,6 +725,13 @@ void pgtable_cache_init(void); static inline int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) { + if (radix_enabled()) { +#if defined(CONFIG_PPC_RADIX_MMU) && defined(DEBUG_VM) + unsigned long page_size = 1 << mmu_psize_defs[mmu_io_psize].shift; + WARN((page_size != PAGE_SIZE), "I/O page size != PAGE_SIZE"); +#endif + return radix__map_kernel_page(ea, pa, __pgprot(flags), PAGE_SIZE); + } return hash__map_kernel_page(ea, pa, flags); } @@ -732,6 +739,8 @@ static inline int __meminit vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys) { + if (radix_enabled()) + return radix__vmemmap_create_mapping(start, page_size, phys); return hash__vmemmap_create_mapping(start, page_size, phys); } @@ -739,6 +748,9 @@ static inline int __meminit vmemmap_create_mapping(unsigned long start, static inline void vmemmap_remove_mapping(unsigned long start, unsigned long page_size) { + + if (radix_enabled()) + return radix__vmemmap_remove_mapping(start, page_size); return hash__vmemmap_remove_mapping(start, page_size); } #endif diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index a26259f..63eb629 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -130,6 +130,12 @@ static inline int radix__pmd_trans_huge(pmd_t pmd) #endif +extern int __meminit radix__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys); +extern void radix__vmemmap_remove_mapping(unsigned long start, + unsigned long page_size); + extern int radix__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t flags, unsigned int psz); #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 4918850..0472f8a 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -354,3 +354,23 @@ void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, /* Finally limit subsequent allocations */ memblock_set_current_limit(first_memblock_base + first_memblock_size); } + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +int __meminit radix__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + /* Create a PTE encoding */ + unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW; + + BUG_ON(radix__map_kernel_page(start, phys, __pgprot(flags), page_size)); + return 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) +{ + /* FIXME!! intel does more. We should free page tables mapping vmemmap ? */ +} +#endif +#endif -- cgit v0.10.2 From d2adba3fd137b46903dad813e2a4777b96c85e0f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:01 +1000 Subject: powerpc/mm: Abstraction for switch_mmu_context() How we switch MMU context differs between hash and radix. For hash we need to switch the SLB details and for radix we need to switch the PID SPR. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 4eaab40..4e2a294 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -33,16 +33,23 @@ extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem); extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem); #endif - -extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next); extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm); extern void set_context(unsigned long id, pgd_t *pgd); #ifdef CONFIG_PPC_BOOK3S_64 +static inline void switch_mmu_context(struct mm_struct *prev, + struct mm_struct *next, + struct task_struct *tsk) +{ + return switch_slb(tsk, next); +} + extern int __init_new_context(void); extern void __destroy_context(int context_id); static inline void mmu_context_init(void) { } #else +extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk); extern unsigned long __init_new_context(void); extern void __destroy_context(unsigned long context_id); extern void mmu_context_init(void); @@ -88,17 +95,11 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, if (cpu_has_feature(CPU_FTR_ALTIVEC)) asm volatile ("dssall"); #endif /* CONFIG_ALTIVEC */ - - /* The actual HW switching method differs between the various - * sub architectures. + /* + * The actual HW switching method differs between the various + * sub architectures. Out of line for now */ -#ifdef CONFIG_PPC_STD_MMU_64 - switch_slb(tsk, next); -#else - /* Out of line for now */ - switch_mmu_context(prev, next); -#endif - + switch_mmu_context(prev, next, tsk); } #define deactivate_mm(tsk,mm) do { } while (0) diff --git a/arch/powerpc/kernel/swsusp.c b/arch/powerpc/kernel/swsusp.c index 6669b17..6ae9bd5 100644 --- a/arch/powerpc/kernel/swsusp.c +++ b/arch/powerpc/kernel/swsusp.c @@ -31,6 +31,6 @@ void save_processor_state(void) void restore_processor_state(void) { #ifdef CONFIG_PPC32 - switch_mmu_context(current->active_mm, current->active_mm); + switch_mmu_context(current->active_mm, current->active_mm, NULL); #endif } diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 986afbc..a36c43a 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -226,7 +226,8 @@ static void context_check_map(void) static void context_check_map(void) { } #endif -void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) +void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) { unsigned int i, id, cpu = smp_processor_id(); unsigned long *map; diff --git a/drivers/cpufreq/pmac32-cpufreq.c b/drivers/cpufreq/pmac32-cpufreq.c index 1f49d97..bde503c 100644 --- a/drivers/cpufreq/pmac32-cpufreq.c +++ b/drivers/cpufreq/pmac32-cpufreq.c @@ -298,7 +298,7 @@ static int pmu_set_cpu_speed(int low_speed) _set_L3CR(save_l3cr); /* Restore userland MMU context */ - switch_mmu_context(NULL, current->active_mm); + switch_mmu_context(NULL, current->active_mm, NULL); #ifdef DEBUG_FREQ printk(KERN_DEBUG "HID1, after: %x\n", mfspr(SPRN_HID1)); diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index 01ee736..f8b6d14 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -1851,7 +1851,7 @@ static int powerbook_sleep_grackle(void) _set_L2CR(save_l2cr); /* Restore userland MMU context */ - switch_mmu_context(NULL, current->active_mm); + switch_mmu_context(NULL, current->active_mm, NULL); /* Power things up */ pmu_unlock(); @@ -1940,7 +1940,7 @@ powerbook_sleep_Core99(void) _set_L3CR(save_l3cr); /* Restore userland MMU context */ - switch_mmu_context(NULL, current->active_mm); + switch_mmu_context(NULL, current->active_mm, NULL); /* Tell PMU we are ready */ pmu_unlock(); -- cgit v0.10.2 From 7e381c0ff618cab3e687e953b72f59ef2a891ba9 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:02 +1000 Subject: powerpc/mm/radix: Add mmu context handling callback for radix Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 4e2a294..9d2cd0c 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -37,10 +37,14 @@ extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm); extern void set_context(unsigned long id, pgd_t *pgd); #ifdef CONFIG_PPC_BOOK3S_64 +extern void radix__switch_mmu_context(struct mm_struct *prev, + struct mm_struct *next); static inline void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { + if (radix_enabled()) + return radix__switch_mmu_context(prev, next); return switch_slb(tsk, next); } diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c index 9ca6fe1..b5288b4 100644 --- a/arch/powerpc/mm/mmu_context_hash64.c +++ b/arch/powerpc/mm/mmu_context_hash64.c @@ -58,6 +58,17 @@ again: return index; } EXPORT_SYMBOL_GPL(__init_new_context); +static int radix__init_new_context(struct mm_struct *mm, int index) +{ + unsigned long rts_field; + + /* + * set the process table entry, + */ + rts_field = 3ull << PPC_BITLSHIFT(2); + process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); + return 0; +} int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { @@ -67,13 +78,18 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) if (index < 0) return index; - /* The old code would re-promote on fork, we don't do that - * when using slices as it could cause problem promoting slices - * that have been forced down to 4K - */ - if (slice_mm_new_context(mm)) - slice_set_user_psize(mm, mmu_virtual_psize); - subpage_prot_init_new_context(mm); + if (radix_enabled()) { + radix__init_new_context(mm, index); + } else { + + /* The old code would re-promote on fork, we don't do that + * when using slices as it could cause problem promoting slices + * that have been forced down to 4K + */ + if (slice_mm_new_context(mm)) + slice_set_user_psize(mm, mmu_virtual_psize); + subpage_prot_init_new_context(mm); + } mm->context.id = index; #ifdef CONFIG_PPC_ICSWX mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL); @@ -144,8 +160,19 @@ void destroy_context(struct mm_struct *mm) mm->context.cop_lockp = NULL; #endif /* CONFIG_PPC_ICSWX */ + if (radix_enabled()) + process_tb[mm->context.id].prtb1 = 0; + else + subpage_prot_free(mm); destroy_pagetable_page(mm); __destroy_context(mm->context.id); - subpage_prot_free(mm); mm->context.id = MMU_NO_CONTEXT; } + +#ifdef CONFIG_PPC_RADIX_MMU +void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) +{ + mtspr(SPRN_PID, next->context.id); + asm volatile("isync": : :"memory"); +} +#endif -- cgit v0.10.2 From f5df4b4be9ae5abf748065d7f40d6027bf9af148 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:03 +1000 Subject: powerpc/mm: Rename mmu_context_hash64.c to mmu_context_book3s64.c This file now contains both hash and radix specific code. Rename it to indicate this better. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index d734934..9589236 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -14,11 +14,10 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o -obj-$(CONFIG_PPC_STD_MMU_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) +obj-$(CONFIG_PPC_STD_MMU_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o -obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o -obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(CONFIG_WORD_SIZE).o \ - mmu_context_hash$(CONFIG_WORD_SIZE).o +obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o +obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(CONFIG_WORD_SIZE).o ifeq ($(CONFIG_PPC_STD_MMU_64),y) obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c new file mode 100644 index 0000000..b5288b4 --- /dev/null +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -0,0 +1,178 @@ +/* + * MMU context allocation for 64-bit kernels. + * + * Copyright (C) 2004 Anton Blanchard, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "icswx.h" + +static DEFINE_SPINLOCK(mmu_context_lock); +static DEFINE_IDA(mmu_context_ida); + +int __init_new_context(void) +{ + int index; + int err; + +again: + if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL)) + return -ENOMEM; + + spin_lock(&mmu_context_lock); + err = ida_get_new_above(&mmu_context_ida, 1, &index); + spin_unlock(&mmu_context_lock); + + if (err == -EAGAIN) + goto again; + else if (err) + return err; + + if (index > MAX_USER_CONTEXT) { + spin_lock(&mmu_context_lock); + ida_remove(&mmu_context_ida, index); + spin_unlock(&mmu_context_lock); + return -ENOMEM; + } + + return index; +} +EXPORT_SYMBOL_GPL(__init_new_context); +static int radix__init_new_context(struct mm_struct *mm, int index) +{ + unsigned long rts_field; + + /* + * set the process table entry, + */ + rts_field = 3ull << PPC_BITLSHIFT(2); + process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); + return 0; +} + +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + int index; + + index = __init_new_context(); + if (index < 0) + return index; + + if (radix_enabled()) { + radix__init_new_context(mm, index); + } else { + + /* The old code would re-promote on fork, we don't do that + * when using slices as it could cause problem promoting slices + * that have been forced down to 4K + */ + if (slice_mm_new_context(mm)) + slice_set_user_psize(mm, mmu_virtual_psize); + subpage_prot_init_new_context(mm); + } + mm->context.id = index; +#ifdef CONFIG_PPC_ICSWX + mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL); + if (!mm->context.cop_lockp) { + __destroy_context(index); + subpage_prot_free(mm); + mm->context.id = MMU_NO_CONTEXT; + return -ENOMEM; + } + spin_lock_init(mm->context.cop_lockp); +#endif /* CONFIG_PPC_ICSWX */ + +#ifdef CONFIG_PPC_64K_PAGES + mm->context.pte_frag = NULL; +#endif +#ifdef CONFIG_SPAPR_TCE_IOMMU + mm_iommu_init(&mm->context); +#endif + return 0; +} + +void __destroy_context(int context_id) +{ + spin_lock(&mmu_context_lock); + ida_remove(&mmu_context_ida, context_id); + spin_unlock(&mmu_context_lock); +} +EXPORT_SYMBOL_GPL(__destroy_context); + +#ifdef CONFIG_PPC_64K_PAGES +static void destroy_pagetable_page(struct mm_struct *mm) +{ + int count; + void *pte_frag; + struct page *page; + + pte_frag = mm->context.pte_frag; + if (!pte_frag) + return; + + page = virt_to_page(pte_frag); + /* drop all the pending references */ + count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; + /* We allow PTE_FRAG_NR fragments from a PTE page */ + if (page_ref_sub_and_test(page, PTE_FRAG_NR - count)) { + pgtable_page_dtor(page); + free_hot_cold_page(page, 0); + } +} + +#else +static inline void destroy_pagetable_page(struct mm_struct *mm) +{ + return; +} +#endif + + +void destroy_context(struct mm_struct *mm) +{ +#ifdef CONFIG_SPAPR_TCE_IOMMU + mm_iommu_cleanup(&mm->context); +#endif + +#ifdef CONFIG_PPC_ICSWX + drop_cop(mm->context.acop, mm); + kfree(mm->context.cop_lockp); + mm->context.cop_lockp = NULL; +#endif /* CONFIG_PPC_ICSWX */ + + if (radix_enabled()) + process_tb[mm->context.id].prtb1 = 0; + else + subpage_prot_free(mm); + destroy_pagetable_page(mm); + __destroy_context(mm->context.id); + mm->context.id = MMU_NO_CONTEXT; +} + +#ifdef CONFIG_PPC_RADIX_MMU +void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) +{ + mtspr(SPRN_PID, next->context.id); + asm volatile("isync": : :"memory"); +} +#endif diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c deleted file mode 100644 index b5288b4..0000000 --- a/arch/powerpc/mm/mmu_context_hash64.c +++ /dev/null @@ -1,178 +0,0 @@ -/* - * MMU context allocation for 64-bit kernels. - * - * Copyright (C) 2004 Anton Blanchard, IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "icswx.h" - -static DEFINE_SPINLOCK(mmu_context_lock); -static DEFINE_IDA(mmu_context_ida); - -int __init_new_context(void) -{ - int index; - int err; - -again: - if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL)) - return -ENOMEM; - - spin_lock(&mmu_context_lock); - err = ida_get_new_above(&mmu_context_ida, 1, &index); - spin_unlock(&mmu_context_lock); - - if (err == -EAGAIN) - goto again; - else if (err) - return err; - - if (index > MAX_USER_CONTEXT) { - spin_lock(&mmu_context_lock); - ida_remove(&mmu_context_ida, index); - spin_unlock(&mmu_context_lock); - return -ENOMEM; - } - - return index; -} -EXPORT_SYMBOL_GPL(__init_new_context); -static int radix__init_new_context(struct mm_struct *mm, int index) -{ - unsigned long rts_field; - - /* - * set the process table entry, - */ - rts_field = 3ull << PPC_BITLSHIFT(2); - process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); - return 0; -} - -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - int index; - - index = __init_new_context(); - if (index < 0) - return index; - - if (radix_enabled()) { - radix__init_new_context(mm, index); - } else { - - /* The old code would re-promote on fork, we don't do that - * when using slices as it could cause problem promoting slices - * that have been forced down to 4K - */ - if (slice_mm_new_context(mm)) - slice_set_user_psize(mm, mmu_virtual_psize); - subpage_prot_init_new_context(mm); - } - mm->context.id = index; -#ifdef CONFIG_PPC_ICSWX - mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL); - if (!mm->context.cop_lockp) { - __destroy_context(index); - subpage_prot_free(mm); - mm->context.id = MMU_NO_CONTEXT; - return -ENOMEM; - } - spin_lock_init(mm->context.cop_lockp); -#endif /* CONFIG_PPC_ICSWX */ - -#ifdef CONFIG_PPC_64K_PAGES - mm->context.pte_frag = NULL; -#endif -#ifdef CONFIG_SPAPR_TCE_IOMMU - mm_iommu_init(&mm->context); -#endif - return 0; -} - -void __destroy_context(int context_id) -{ - spin_lock(&mmu_context_lock); - ida_remove(&mmu_context_ida, context_id); - spin_unlock(&mmu_context_lock); -} -EXPORT_SYMBOL_GPL(__destroy_context); - -#ifdef CONFIG_PPC_64K_PAGES -static void destroy_pagetable_page(struct mm_struct *mm) -{ - int count; - void *pte_frag; - struct page *page; - - pte_frag = mm->context.pte_frag; - if (!pte_frag) - return; - - page = virt_to_page(pte_frag); - /* drop all the pending references */ - count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; - /* We allow PTE_FRAG_NR fragments from a PTE page */ - if (page_ref_sub_and_test(page, PTE_FRAG_NR - count)) { - pgtable_page_dtor(page); - free_hot_cold_page(page, 0); - } -} - -#else -static inline void destroy_pagetable_page(struct mm_struct *mm) -{ - return; -} -#endif - - -void destroy_context(struct mm_struct *mm) -{ -#ifdef CONFIG_SPAPR_TCE_IOMMU - mm_iommu_cleanup(&mm->context); -#endif - -#ifdef CONFIG_PPC_ICSWX - drop_cop(mm->context.acop, mm); - kfree(mm->context.cop_lockp); - mm->context.cop_lockp = NULL; -#endif /* CONFIG_PPC_ICSWX */ - - if (radix_enabled()) - process_tb[mm->context.id].prtb1 = 0; - else - subpage_prot_free(mm); - destroy_pagetable_page(mm); - __destroy_context(mm->context.id); - mm->context.id = MMU_NO_CONTEXT; -} - -#ifdef CONFIG_PPC_RADIX_MMU -void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) -{ - mtspr(SPRN_PID, next->context.id); - asm volatile("isync": : :"memory"); -} -#endif -- cgit v0.10.2 From 676012a66f651a98808459bc8ab75661828ed96f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:04 +1000 Subject: powerpc/mm: Hash abstraction for tlbflush routines Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index 1b753f9..cc092ea 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -52,40 +52,42 @@ extern void flush_hash_range(unsigned long number, int local); extern void flush_hash_hugepage(unsigned long vsid, unsigned long addr, pmd_t *pmdp, unsigned int psize, int ssize, unsigned long flags); - -static inline void local_flush_tlb_mm(struct mm_struct *mm) +static inline void hash__local_flush_tlb_mm(struct mm_struct *mm) { } -static inline void flush_tlb_mm(struct mm_struct *mm) +static inline void hash__flush_tlb_mm(struct mm_struct *mm) { } -static inline void local_flush_tlb_page(struct vm_area_struct *vma, - unsigned long vmaddr) +static inline void hash__local_flush_tlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) { } -static inline void flush_tlb_page(struct vm_area_struct *vma, - unsigned long vmaddr) +static inline void hash__flush_tlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) { } -static inline void flush_tlb_page_nohash(struct vm_area_struct *vma, - unsigned long vmaddr) +static inline void hash__flush_tlb_page_nohash(struct vm_area_struct *vma, + unsigned long vmaddr) { } -static inline void flush_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) +static inline void hash__flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) { } -static inline void flush_tlb_kernel_range(unsigned long start, - unsigned long end) +static inline void hash__flush_tlb_kernel_range(unsigned long start, + unsigned long end) { } + +struct mmu_gather; +extern void hash__tlb_flush(struct mmu_gather *tlb); /* Private function for use by PCI IO mapping code */ extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, unsigned long end); diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h new file mode 100644 index 0000000..476ea24 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -0,0 +1,56 @@ +#ifndef _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H +#define _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H + +#include + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + return hash__flush_tlb_range(vma, start, end); +} + +static inline void flush_tlb_kernel_range(unsigned long start, + unsigned long end) +{ + return hash__flush_tlb_kernel_range(start, end); +} + +static inline void local_flush_tlb_mm(struct mm_struct *mm) +{ + return hash__local_flush_tlb_mm(mm); +} + +static inline void local_flush_tlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) +{ + return hash__local_flush_tlb_page(vma, vmaddr); +} + +static inline void flush_tlb_page_nohash(struct vm_area_struct *vma, + unsigned long vmaddr) +{ + return hash__flush_tlb_page_nohash(vma, vmaddr); +} + +static inline void tlb_flush(struct mmu_gather *tlb) +{ + return hash__tlb_flush(tlb); +} + +#ifdef CONFIG_SMP +static inline void flush_tlb_mm(struct mm_struct *mm) +{ + return hash__flush_tlb_mm(mm); +} + +static inline void flush_tlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) +{ + return hash__flush_tlb_page(vma, vmaddr); +} +#else +#define flush_tlb_mm(mm) local_flush_tlb_mm(mm) +#define flush_tlb_page(vma, addr) local_flush_tlb_page(vma, addr) +#endif /* CONFIG_SMP */ + +#endif /* _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H */ diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h index 9f77f85..2fc4331 100644 --- a/arch/powerpc/include/asm/tlbflush.h +++ b/arch/powerpc/include/asm/tlbflush.h @@ -78,7 +78,7 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm) } #elif defined(CONFIG_PPC_STD_MMU_64) -#include +#include #else #error Unsupported MMU type #endif diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 38497cf..4517aa4 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -155,7 +155,7 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch) batch->index = 0; } -void tlb_flush(struct mmu_gather *tlb) +void hash__tlb_flush(struct mmu_gather *tlb) { struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch); -- cgit v0.10.2 From 1a472c9dba6b9646fd36717968f6a531b4441c7d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:05 +1000 Subject: powerpc/mm/radix: Add tlbflush routines Core kernel doesn't track the page size of the VA range that we are invalidating. Hence we end up flushing TLB for the entire mm here. Later patches will improve this. We also don't flush page walk cache separetly instead use RIC=2 when flushing TLB, because we do a MMU gather flush after freeing page table. MMU_NO_CONTEXT is updated for hash. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 7da61b8..290157e 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -119,6 +119,7 @@ #define POWER7_TLB_SETS 128 /* # sets in POWER7 TLB */ #define POWER8_TLB_SETS 512 /* # sets in POWER8 TLB */ #define POWER9_TLB_SETS_HASH 256 /* # sets in POWER9 TLB Hash mode */ +#define POWER9_TLB_SETS_RADIX 128 /* # sets in POWER9 TLB Radix mode */ #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index cc092ea..f12ddf5 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -1,8 +1,6 @@ #ifndef _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H #define _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H -#define MMU_NO_CONTEXT 0 - /* * TLB flushing for 64-bit hash-MMU CPUs */ @@ -29,14 +27,21 @@ extern void __flush_tlb_pending(struct ppc64_tlb_batch *batch); static inline void arch_enter_lazy_mmu_mode(void) { - struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); + struct ppc64_tlb_batch *batch; + if (radix_enabled()) + return; + batch = this_cpu_ptr(&ppc64_tlb_batch); batch->active = 1; } static inline void arch_leave_lazy_mmu_mode(void) { - struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); + struct ppc64_tlb_batch *batch; + + if (radix_enabled()) + return; + batch = this_cpu_ptr(&ppc64_tlb_batch); if (batch->index) __flush_tlb_pending(batch); diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h new file mode 100644 index 0000000..13ef388 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -0,0 +1,33 @@ +#ifndef _ASM_POWERPC_TLBFLUSH_RADIX_H +#define _ASM_POWERPC_TLBFLUSH_RADIX_H + +struct vm_area_struct; +struct mm_struct; +struct mmu_gather; + +static inline int mmu_get_ap(int psize) +{ + return mmu_psize_defs[psize].ap; +} + +extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end); +extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end); + +extern void radix__local_flush_tlb_mm(struct mm_struct *mm); +extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +extern void radix___local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + unsigned long ap, int nid); +extern void radix__tlb_flush(struct mmu_gather *tlb); +#ifdef CONFIG_SMP +extern void radix__flush_tlb_mm(struct mm_struct *mm); +extern void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +extern void radix___flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + unsigned long ap, int nid); +#else +#define radix__flush_tlb_mm(mm) radix__local_flush_tlb_mm(mm) +#define radix__flush_tlb_page(vma,addr) radix__local_flush_tlb_page(vma,addr) +#define radix___flush_tlb_page(mm,addr,p,i) radix___local_flush_tlb_page(mm,addr,p,i) +#endif + +#endif diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index 476ea24..d98424a 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -1,51 +1,71 @@ #ifndef _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H #define _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H +#define MMU_NO_CONTEXT ~0UL + + #include +#include static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { + if (radix_enabled()) + return radix__flush_tlb_range(vma, start, end); return hash__flush_tlb_range(vma, start, end); } static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) { + if (radix_enabled()) + return radix__flush_tlb_kernel_range(start, end); return hash__flush_tlb_kernel_range(start, end); } static inline void local_flush_tlb_mm(struct mm_struct *mm) { + if (radix_enabled()) + return radix__local_flush_tlb_mm(mm); return hash__local_flush_tlb_mm(mm); } static inline void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { + if (radix_enabled()) + return radix__local_flush_tlb_page(vma, vmaddr); return hash__local_flush_tlb_page(vma, vmaddr); } static inline void flush_tlb_page_nohash(struct vm_area_struct *vma, unsigned long vmaddr) { + if (radix_enabled()) + return radix__flush_tlb_page(vma, vmaddr); return hash__flush_tlb_page_nohash(vma, vmaddr); } static inline void tlb_flush(struct mmu_gather *tlb) { + if (radix_enabled()) + return radix__tlb_flush(tlb); return hash__tlb_flush(tlb); } #ifdef CONFIG_SMP static inline void flush_tlb_mm(struct mm_struct *mm) { + if (radix_enabled()) + return radix__flush_tlb_mm(mm); return hash__flush_tlb_mm(mm); } static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { + if (radix_enabled()) + return radix__flush_tlb_page(vma, vmaddr); return hash__flush_tlb_page(vma, vmaddr); } #else diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h index 2fc4331..1b38eea 100644 --- a/arch/powerpc/include/asm/tlbflush.h +++ b/arch/powerpc/include/asm/tlbflush.h @@ -58,6 +58,7 @@ extern void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, #elif defined(CONFIG_PPC_STD_MMU_32) +#define MMU_NO_CONTEXT (0) /* * TLB flushing for "classic" hash-MMU 32-bit CPUs, 6xx, 7xx, 7xxx */ diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index ee62b19..f646602 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -72,6 +72,9 @@ void __flush_tlb_power8(unsigned int action) void __flush_tlb_power9(unsigned int action) { + if (radix_enabled()) + flush_tlb_206(POWER9_TLB_SETS_RADIX, action); + flush_tlb_206(POWER9_TLB_SETS_HASH, action); } diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 9589236..48aa11a 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -15,7 +15,7 @@ obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o obj-$(CONFIG_PPC_STD_MMU_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o -obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o +obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(CONFIG_WORD_SIZE).o ifeq ($(CONFIG_PPC_STD_MMU_64),y) diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c new file mode 100644 index 0000000..ecfa00f --- /dev/null +++ b/arch/powerpc/mm/tlb-radix.c @@ -0,0 +1,242 @@ +/* + * TLB flush routines for radix kernels. + * + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include +#include + +static DEFINE_RAW_SPINLOCK(native_tlbie_lock); + +static inline void __tlbiel_pid(unsigned long pid, int set) +{ + unsigned long rb,rs,ric,prs,r; + + rb = PPC_BIT(53); /* IS = 1 */ + rb |= set << PPC_BITLSHIFT(51); + rs = ((unsigned long)pid) << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* raidx format */ + ric = 2; /* invalidate all the caches */ + + asm volatile("ptesync": : :"memory"); + asm volatile(".long 0x7c000224 | (%0 << 11) | (%1 << 16) |" + "(%2 << 17) | (%3 << 18) | (%4 << 21)" + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + asm volatile("ptesync": : :"memory"); +} + +/* + * We use 128 set in radix mode and 256 set in hpt mode. + */ +static inline void _tlbiel_pid(unsigned long pid) +{ + int set; + + for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) { + __tlbiel_pid(pid, set); + } + return; +} + +static inline void _tlbie_pid(unsigned long pid) +{ + unsigned long rb,rs,ric,prs,r; + + rb = PPC_BIT(53); /* IS = 1 */ + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* raidx format */ + ric = 2; /* invalidate all the caches */ + + asm volatile("ptesync": : :"memory"); + asm volatile(".long 0x7c000264 | (%0 << 11) | (%1 << 16) |" + "(%2 << 17) | (%3 << 18) | (%4 << 21)" + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +static inline void _tlbiel_va(unsigned long va, unsigned long pid, + unsigned long ap) +{ + unsigned long rb,rs,ric,prs,r; + + rb = va & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* raidx format */ + ric = 0; /* no cluster flush yet */ + + asm volatile("ptesync": : :"memory"); + asm volatile(".long 0x7c000224 | (%0 << 11) | (%1 << 16) |" + "(%2 << 17) | (%3 << 18) | (%4 << 21)" + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + asm volatile("ptesync": : :"memory"); +} + +static inline void _tlbie_va(unsigned long va, unsigned long pid, + unsigned long ap) +{ + unsigned long rb,rs,ric,prs,r; + + rb = va & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* raidx format */ + ric = 0; /* no cluster flush yet */ + + asm volatile("ptesync": : :"memory"); + asm volatile(".long 0x7c000264 | (%0 << 11) | (%1 << 16) |" + "(%2 << 17) | (%3 << 18) | (%4 << 21)" + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +/* + * Base TLB flushing operations: + * + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes kernel pages + * + * - local_* variants of page and mm only apply to the current + * processor + */ +void radix__local_flush_tlb_mm(struct mm_struct *mm) +{ + unsigned int pid; + + preempt_disable(); + pid = mm->context.id; + if (pid != MMU_NO_CONTEXT) + _tlbiel_pid(pid); + preempt_enable(); +} +EXPORT_SYMBOL(radix__local_flush_tlb_mm); + +void radix___local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + unsigned long ap, int nid) +{ + unsigned int pid; + + preempt_disable(); + pid = mm ? mm->context.id : 0; + if (pid != MMU_NO_CONTEXT) + _tlbiel_va(vmaddr, pid, ap); + preempt_enable(); +} + +void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + radix___local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + mmu_get_ap(mmu_virtual_psize), 0); +} +EXPORT_SYMBOL(radix__local_flush_tlb_page); + +#ifdef CONFIG_SMP +static int mm_is_core_local(struct mm_struct *mm) +{ + return cpumask_subset(mm_cpumask(mm), + topology_sibling_cpumask(smp_processor_id())); +} + +void radix__flush_tlb_mm(struct mm_struct *mm) +{ + unsigned int pid; + + preempt_disable(); + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto no_context; + + if (!mm_is_core_local(mm)) { + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + _tlbie_pid(pid); + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); + } else + _tlbiel_pid(pid); +no_context: + preempt_enable(); +} +EXPORT_SYMBOL(radix__flush_tlb_mm); + +void radix___flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + unsigned long ap, int nid) +{ + unsigned int pid; + + preempt_disable(); + pid = mm ? mm->context.id : 0; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto bail; + if (!mm_is_core_local(mm)) { + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + _tlbie_va(vmaddr, pid, ap); + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); + } else + _tlbiel_va(vmaddr, pid, ap); +bail: + preempt_enable(); +} + +void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + radix___flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + mmu_get_ap(mmu_virtual_psize), 0); +} +EXPORT_SYMBOL(radix__flush_tlb_page); + +#endif /* CONFIG_SMP */ + +void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + _tlbie_pid(0); + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); +} +EXPORT_SYMBOL(radix__flush_tlb_kernel_range); + +/* + * Currently, for range flushing, we just do a full mm flush. Because + * we use this in code path where we don' track the page size. + */ +void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) + +{ + struct mm_struct *mm = vma->vm_mm; + radix__flush_tlb_mm(mm); +} +EXPORT_SYMBOL(radix__flush_tlb_range); + + +void radix__tlb_flush(struct mmu_gather *tlb) +{ + struct mm_struct *mm = tlb->mm; + radix__flush_tlb_mm(mm); +} -- cgit v0.10.2 From 773edeadf672546cd554e0797fc02a9e6edf01ae Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 11 May 2016 15:30:47 +1000 Subject: powerpc/mm: Add mask of possible MMU features Follow the example of the cpu feature code, and add a mask of possible MMU features, MMU_FTRS_POSSIBLE. This is used in mmu_has_feature(), which allows the possible mask to act as a shortcut for any features that are not possible, but still allows the feature bit itself to be defined. We will use this in the next commit to allow MMU_FTR_RADIX checks to be elided when MMU_FTR_RADIX is not possible. Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 34070e9..fbd6f35 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -110,9 +110,21 @@ DECLARE_PER_CPU(int, next_tlbcam_idx); #endif +enum { + MMU_FTRS_POSSIBLE = MMU_FTR_HPTE_TABLE | MMU_FTR_TYPE_8xx | + MMU_FTR_TYPE_40x | MMU_FTR_TYPE_44x | MMU_FTR_TYPE_FSL_E | + MMU_FTR_TYPE_47x | MMU_FTR_USE_HIGH_BATS | MMU_FTR_BIG_PHYS | + MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_USE_TLBILX | + MMU_FTR_LOCK_BCAST_INVAL | MMU_FTR_NEED_DTLB_SW_LRU | + MMU_FTR_USE_TLBRSRV | MMU_FTR_USE_PAIRED_MAS | + MMU_FTR_NO_SLBIE_B | MMU_FTR_16M_PAGE | MMU_FTR_TLBIEL | + MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE | + MMU_FTR_1T_SEGMENT, +}; + static inline int mmu_has_feature(unsigned long feature) { - return (cur_cpu_spec->mmu_features & feature); + return (MMU_FTRS_POSSIBLE & cur_cpu_spec->mmu_features & feature); } static inline void mmu_clear_feature(unsigned long feature) -- cgit v0.10.2 From a8ed87c92adf1fd45142323e04f15b522117d575 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:06 +1000 Subject: powerpc/mm/radix: Add MMU_FTR_RADIX We are going to add asm changes in the follow up patches. Add the feature bit now so that we can get it all build. mpe: When CONFIG_PPC_RADIX_MMU=n we omit MMU_FTR_RADIX from the MMU_FTRS_POSSIBLE mask. This allows the compiler to work out that those checks will always be false and so the code can be elided completely. Note we do *not* define MMU_FTR_RADIX to 0 in the RADIX_MMU=n case, because that doesn't work with the ASM_FTR patching. In particular an IF_SET section will result in a mask and value of zero, which is always true, meaning the section *won't* be patched, which is the opposite of what we want. Signed-off-by: Aneesh Kumar K.V Acked-by: Balbir Singh Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 94d52ff..5854263 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -23,7 +23,8 @@ struct mmu_psize_def { }; extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; -#define radix_enabled() (0) +#define radix_enabled() mmu_has_feature(MMU_FTR_RADIX) + #endif /* __ASSEMBLY__ */ /* 64-bit classic hash table MMU */ diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index fbd6f35..e53ebeb 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -88,6 +88,11 @@ */ #define MMU_FTR_1T_SEGMENT ASM_CONST(0x40000000) +/* + * Radix page table available + */ +#define MMU_FTR_RADIX ASM_CONST(0x80000000) + /* MMU feature bit sets for various CPUs */ #define MMU_FTRS_DEFAULT_HPTE_ARCH_V2 \ MMU_FTR_HPTE_TABLE | MMU_FTR_PPCAS_ARCH_V2 @@ -119,7 +124,11 @@ enum { MMU_FTR_USE_TLBRSRV | MMU_FTR_USE_PAIRED_MAS | MMU_FTR_NO_SLBIE_B | MMU_FTR_16M_PAGE | MMU_FTR_TLBIEL | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE | - MMU_FTR_1T_SEGMENT, + MMU_FTR_1T_SEGMENT | +#ifdef CONFIG_PPC_RADIX_MMU + MMU_FTR_RADIX | +#endif + 0, }; static inline int mmu_has_feature(unsigned long feature) -- cgit v0.10.2 From caca285e5ab4a7a19fede51688106ceed6fc45dd Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:07 +1000 Subject: powerpc/mm/radix: Use STD_MMU_64 to properly isolate hash related code We also use MMU_FTR_RADIX to branch out from code path specific to hash. No functionality change. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 4fd4922..73e461a 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -529,7 +529,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) std r6,PACACURRENT(r13) /* Set new 'current' */ ld r8,KSP(r4) /* new stack pointer */ -#ifdef CONFIG_PPC_BOOK3S +#ifdef CONFIG_PPC_STD_MMU_64 +BEGIN_MMU_FTR_SECTION + b 2f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX) BEGIN_FTR_SECTION clrrdi r6,r8,28 /* get its ESID */ clrrdi r9,r1,28 /* get current sp ESID */ @@ -575,7 +578,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) slbmte r7,r0 isync 2: -#endif /* !CONFIG_PPC_BOOK3S */ +#endif /* CONFIG_PPC_STD_MMU_64 */ CURRENT_THREAD_INFO(r7, r8) /* base of new stack */ /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 52d2d74..7272e6c 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -939,7 +939,13 @@ data_access_common: ld r3,PACA_EXGEN+EX_DAR(r13) lwz r4,PACA_EXGEN+EX_DSISR(r13) li r5,0x300 + std r3,_DAR(r1) + std r4,_DSISR(r1) +BEGIN_MMU_FTR_SECTION b do_hash_page /* Try to handle as hpte fault */ +MMU_FTR_SECTION_ELSE + b handle_page_fault +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX) .align 7 .globl h_data_storage_common @@ -964,7 +970,13 @@ instruction_access_common: ld r3,_NIP(r1) andis. r4,r12,0x5820 li r5,0x400 + std r3,_DAR(r1) + std r4,_DSISR(r1) +BEGIN_MMU_FTR_SECTION b do_hash_page /* Try to handle as hpte fault */ +MMU_FTR_SECTION_ELSE + b handle_page_fault +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX) STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception) @@ -1375,8 +1387,11 @@ slb_miss_realmode: stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ +#ifdef CONFIG_PPC_STD_MMU_64 +BEGIN_MMU_FTR_SECTION bl slb_allocate_realmode - +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_RADIX) +#endif /* All done -- return from exception. */ ld r10,PACA_EXSLB+EX_LR(r13) @@ -1384,7 +1399,9 @@ slb_miss_realmode: lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ mtlr r10 - +BEGIN_MMU_FTR_SECTION + b 2f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX) andi. r10,r12,MSR_RI /* check for unrecoverable exception */ beq- 2f @@ -1435,9 +1452,7 @@ power4_fixup_nap: */ .align 7 do_hash_page: - std r3,_DAR(r1) - std r4,_DSISR(r1) - +#ifdef CONFIG_PPC_STD_MMU_64 andis. r0,r4,0xa410 /* weird error? */ bne- handle_page_fault /* if not, try to insert a HPTE */ andis. r0,r4,DSISR_DABRMATCH@h @@ -1465,6 +1480,7 @@ do_hash_page: /* Error */ blt- 13f +#endif /* CONFIG_PPC_STD_MMU_64 */ /* Here we have a page fault that hash_page can't handle. */ handle_page_fault: @@ -1491,6 +1507,7 @@ handle_dabr_fault: 12: b ret_from_except_lite +#ifdef CONFIG_PPC_STD_MMU_64 /* We have a page fault that hash_page could handle but HV refused * the PTE insertion */ @@ -1500,6 +1517,7 @@ handle_dabr_fault: ld r4,_DAR(r1) bl low_hash_fault b ret_from_except +#endif /* * We come here as a result of a DSI at a point where we don't want diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 0fbd75d..1da864c 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -76,6 +76,7 @@ int default_machine_kexec_prepare(struct kimage *image) * end of the blocked region (begin >= high). Use the * boolean identity !(a || b) === (!a && !b). */ +#ifdef CONFIG_PPC_STD_MMU_64 if (htab_address) { low = __pa(htab_address); high = low + htab_size_bytes; @@ -88,6 +89,7 @@ int default_machine_kexec_prepare(struct kimage *image) return -ETXTBSY; } } +#endif /* CONFIG_PPC_STD_MMU_64 */ /* We also should not overwrite the tce tables */ for_each_node_by_type(node, "pci") { @@ -381,7 +383,7 @@ void default_machine_kexec(struct kimage *image) /* NOTREACHED */ } -#ifndef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_STD_MMU_64 /* Values we need to export to the second kernel via the device tree. */ static unsigned long htab_base; static unsigned long htab_size; @@ -428,4 +430,4 @@ static int __init export_htab_values(void) return 0; } late_initcall(export_htab_values); -#endif /* !CONFIG_PPC_BOOK3E */ +#endif /* CONFIG_PPC_STD_MMU_64 */ diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index f646602..bec7033 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -80,6 +80,7 @@ void __flush_tlb_power9(unsigned int action) /* flush SLBs and reload */ +#ifdef CONFIG_PPC_MMU_STD_64 static void flush_and_reload_slb(void) { struct slb_shadow *slb; @@ -113,6 +114,7 @@ static void flush_and_reload_slb(void) asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb)); } } +#endif static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits) { @@ -123,6 +125,7 @@ static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits) * reset the error bits whenever we handle them so that at the end * we can check whether we handled all of them or not. * */ +#ifdef CONFIG_PPC_MMU_STD_64 if (dsisr & slb_error_bits) { flush_and_reload_slb(); /* reset error bits */ @@ -134,6 +137,7 @@ static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits) /* reset error bits */ dsisr &= ~P7_DSISR_MC_TLB_MULTIHIT_MFTLB; } +#endif /* Any other errors we don't understand? */ if (dsisr & 0xffffffffUL) handled = 0; @@ -153,6 +157,7 @@ static long mce_handle_common_ierror(uint64_t srr1) switch (P7_SRR1_MC_IFETCH(srr1)) { case 0: break; +#ifdef CONFIG_PPC_MMU_STD_64 case P7_SRR1_MC_IFETCH_SLB_PARITY: case P7_SRR1_MC_IFETCH_SLB_MULTIHIT: /* flush and reload SLBs for SLB errors. */ @@ -165,6 +170,7 @@ static long mce_handle_common_ierror(uint64_t srr1) handled = 1; } break; +#endif default: break; } @@ -178,10 +184,12 @@ static long mce_handle_ierror_p7(uint64_t srr1) handled = mce_handle_common_ierror(srr1); +#ifdef CONFIG_PPC_MMU_STD_64 if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) { flush_and_reload_slb(); handled = 1; } +#endif return handled; } @@ -324,10 +332,12 @@ static long mce_handle_ierror_p8(uint64_t srr1) handled = mce_handle_common_ierror(srr1); +#ifdef CONFIG_PPC_MMU_STD_64 if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) { flush_and_reload_slb(); handled = 1; } +#endif return handled; } diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index ad79816..ea8a28f 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1079,7 +1079,7 @@ struct task_struct *__switch_to(struct task_struct *prev, } #endif /* CONFIG_PPC64 */ -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_STD_MMU_64 batch = this_cpu_ptr(&ppc64_tlb_batch); if (batch->active) { current_thread_info()->local_flags |= _TLF_LAZY_MMU; @@ -1087,7 +1087,7 @@ struct task_struct *__switch_to(struct task_struct *prev, __flush_tlb_pending(batch); batch->active = 0; } -#endif /* CONFIG_PPC_BOOK3S_64 */ +#endif /* CONFIG_PPC_STD_MMU_64 */ #ifdef CONFIG_PPC_ADV_DEBUG_REGS switch_booke_debug_regs(&new->thread.debug); @@ -1133,7 +1133,7 @@ struct task_struct *__switch_to(struct task_struct *prev, last = _switch(old_thread, new_thread); -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_STD_MMU_64 if (current_thread_info()->local_flags & _TLF_LAZY_MMU) { current_thread_info()->local_flags &= ~_TLF_LAZY_MMU; batch = this_cpu_ptr(&ppc64_tlb_batch); @@ -1142,8 +1142,7 @@ struct task_struct *__switch_to(struct task_struct *prev, if (current_thread_info()->task->thread.regs) restore_math(current_thread_info()->task->thread.regs); - -#endif /* CONFIG_PPC_BOOK3S_64 */ +#endif /* CONFIG_PPC_STD_MMU_64 */ return last; } @@ -1378,6 +1377,9 @@ static void setup_ksp_vsid(struct task_struct *p, unsigned long sp) unsigned long sp_vsid; unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp; + if (radix_enabled()) + return; + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) sp_vsid = get_kernel_vsid(sp, MMU_SEGSIZE_1T) << SLB_VSID_SHIFT_1T; @@ -1926,7 +1928,8 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) * the heap, we can put it above 1TB so it is backed by a 1TB * segment. Otherwise the heap will be in the bottom 1TB * which always uses 256MB segments and this may result in a - * performance penalty. + * performance penalty. We don't need to worry about radix. For + * radix, mmu_highuser_ssize remains unchanged from 256MB. */ if (!is_32bit_task() && (mmu_highuser_ssize == MMU_SEGSIZE_1T)) base = max_t(unsigned long, mm->brk, 1UL << SID_SHIFT_1T); diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 942796f..308283b 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2913,7 +2913,7 @@ static void xmon_print_symbol(unsigned long address, const char *mid, printf("%s", after); } -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_STD_MMU_64 void dump_segments(void) { int i; -- cgit v0.10.2 From d8c476eeb697c9aac46b390e9cbd7af7032814f1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:08 +1000 Subject: powerpc/mm/radix: Isolate hash table function from pseries guest code Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index e4ceba2..7f6100d 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -89,18 +89,21 @@ void vpa_init(int cpu) "%lx failed with %ld\n", cpu, hwcpu, addr, ret); return; } + +#ifdef CONFIG_PPC_STD_MMU_64 /* * PAPR says this feature is SLB-Buffer but firmware never * reports that. All SPLPAR support SLB shadow buffer. */ - addr = __pa(paca[cpu].slb_shadow_ptr); - if (firmware_has_feature(FW_FEATURE_SPLPAR)) { + if (!radix_enabled() && firmware_has_feature(FW_FEATURE_SPLPAR)) { + addr = __pa(paca[cpu].slb_shadow_ptr); ret = register_slb_shadow(hwcpu, addr); if (ret) pr_err("WARNING: SLB shadow buffer registration for " "cpu %d (hw %d) of area %lx failed with %ld\n", cpu, hwcpu, addr, ret); } +#endif /* CONFIG_PPC_STD_MMU_64 */ /* * Register dispatch trace log, if one has been allocated. @@ -123,6 +126,8 @@ void vpa_init(int cpu) } } +#ifdef CONFIG_PPC_STD_MMU_64 + static long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long vpn, unsigned long pa, unsigned long rflags, unsigned long vflags, @@ -655,6 +660,8 @@ static void pSeries_set_page_state(struct page *page, int order, void arch_free_page(struct page *page, int order) { + if (radix_enabled()) + return; if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO)) return; @@ -662,7 +669,8 @@ void arch_free_page(struct page *page, int order) } EXPORT_SYMBOL(arch_free_page); -#endif +#endif /* CONFIG_PPC_SMLPAR */ +#endif /* CONFIG_PPC_STD_MMU_64 */ #ifdef CONFIG_TRACEPOINTS #ifdef HAVE_JUMP_LABEL diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index c9fecf0..afa05a2 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -484,8 +484,9 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) seq_printf(m, "shared_processor_mode=%d\n", lppaca_shared_proc(get_lppaca())); +#ifdef CONFIG_PPC_STD_MMU_64 seq_printf(m, "slb_size=%d\n", mmu_slb_size); - +#endif parse_em_data(m); return 0; -- cgit v0.10.2 From 764041e0f43cc7846f6d8eb246d65b53cc06c764 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:09 +1000 Subject: powerpc/mm/radix: Add checks in slice code to catch radix usage Radix doesn't need slice support. Catch incorrect usage of slice code when radix is enabled. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index ee21b86..2b27458 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -395,6 +395,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* Sanity checks */ BUG_ON(mm->task_size == 0); + VM_BUG_ON(radix_enabled()); slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n", @@ -568,6 +569,16 @@ unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) unsigned char *hpsizes; int index, mask_index; + /* + * Radix doesn't use slice, but can get enabled along with MMU_SLICE + */ + if (radix_enabled()) { +#ifdef CONFIG_PPC_64K_PAGES + return MMU_PAGE_64K; +#else + return MMU_PAGE_4K; +#endif + } if (addr < SLICE_LOW_TOP) { u64 lpsizes; lpsizes = mm->context.low_slices_psize; @@ -605,6 +616,7 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize) slice_dbg("slice_set_user_psize(mm=%p, psize=%d)\n", mm, psize); + VM_BUG_ON(radix_enabled()); spin_lock_irqsave(&slice_convert_lock, flags); old_psize = mm->context.user_psize; @@ -649,6 +661,7 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start, { struct slice_mask mask = slice_range_to_mask(start, len); + VM_BUG_ON(radix_enabled()); slice_convert(mm, mask, psize); } @@ -678,6 +691,9 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, struct slice_mask mask, available; unsigned int psize = mm->context.user_psize; + if (radix_enabled()) + return 0; + mask = slice_range_to_mask(addr, len); available = slice_mask_for_size(mm, psize); #ifdef CONFIG_PPC_64K_PAGES -- cgit v0.10.2 From 177ba7c647f37bc3f31667192059ee794347d79d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:10 +1000 Subject: powerpc/mm/radix: Limit paca allocation in radix On return from RTAS we access the paca variables and we have 64 bit disabled. This requires us to limit paca in 32 bit range. Fix this by setting ppc64_rma_size to first_memblock_size/1G range. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 0472f8a..04ec976 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -351,7 +351,25 @@ void radix__early_init_mmu_secondary(void) void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { - /* Finally limit subsequent allocations */ + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + /* + * We limit the allocation that depend on ppc64_rma_size + * to first_memblock_size. We also clamp it to 1GB to + * avoid some funky things such as RTAS bugs. + * + * On radix config we really don't have a limitation + * on real mode access. But keeping it as above works + * well enough. + */ + ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000); + /* + * Finally limit subsequent allocations. We really don't want + * to limit the memblock allocations to rma_size. FIXME!! should + * we even limit at all ? + */ memblock_set_current_limit(first_memblock_base + first_memblock_size); } -- cgit v0.10.2 From 7a0eedeedd5d192f64fd4cd46634e409cb7a513b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:11 +1000 Subject: powerpc/mm/radix: Pick the address layout for radix config Hash needs special get_unmapped_area() handling because of limitations around base page size, so we have to set HAVE_ARCH_UNMAPPED_AREA. With radix we don't have such restrictions, so we could use the generic code. But because we've set HAVE_ARCH_UNMAPPED_AREA (for hash), we have to re-implement the same logic as the generic code. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index 30611f8..2f1e443 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -27,6 +27,8 @@ #include #include #include +#include +#include /* * Top of mmap area (just below the process stack). @@ -79,6 +81,111 @@ static inline unsigned long mmap_base(unsigned long rnd) return PAGE_ALIGN(TASK_SIZE - gap - rnd); } +#ifdef CONFIG_PPC_RADIX_MMU +/* + * Same function as generic code used only for radix, because we don't need to overload + * the generic one. But we will have to duplicate, because hash select + * HAVE_ARCH_UNMAPPED_AREA + */ +static unsigned long +radix__arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct vm_unmapped_area_info info; + + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + return addr; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + info.flags = 0; + info.length = len; + info.low_limit = mm->mmap_base; + info.high_limit = TASK_SIZE; + info.align_mask = 0; + return vm_unmapped_area(&info); +} + +static unsigned long +radix__arch_get_unmapped_area_topdown(struct file *filp, + const unsigned long addr0, + const unsigned long len, + const unsigned long pgoff, + const unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr = addr0; + struct vm_unmapped_area_info info; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + return addr; + + /* requesting a specific address */ + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.high_limit = mm->mmap_base; + info.align_mask = 0; + addr = vm_unmapped_area(&info); + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + if (addr & ~PAGE_MASK) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); + } + + return addr; +} + +static void radix__arch_pick_mmap_layout(struct mm_struct *mm, + unsigned long random_factor) +{ + if (mmap_is_legacy()) { + mm->mmap_base = TASK_UNMAPPED_BASE; + mm->get_unmapped_area = radix__arch_get_unmapped_area; + } else { + mm->mmap_base = mmap_base(random_factor); + mm->get_unmapped_area = radix__arch_get_unmapped_area_topdown; + } +} +#else +/* dummy */ +extern void radix__arch_pick_mmap_layout(struct mm_struct *mm, + unsigned long random_factor); +#endif /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: @@ -90,6 +197,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); + if (radix_enabled()) + return radix__arch_pick_mmap_layout(mm, random_factor); /* * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: -- cgit v0.10.2 From b5dcc6096971b7c11b443b79ae8b69ccb01ec04e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:12 +1000 Subject: powerpc/mm/radix: Update PTCR on secondary CPUs Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index e6b53cd..faaadef 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -896,9 +896,13 @@ void __init hash__early_init_mmu(void) void hash__early_init_mmu_secondary(void) { /* Initialize hash table for that CPU */ - if (!firmware_has_feature(FW_FEATURE_LPAR)) - mtspr(SPRN_SDR1, _SDR1); - + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + mtspr(SPRN_SDR1, _SDR1); + else + mtspr(SPRN_PTCR, + __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + } /* Initialize SLB */ slb_initialize(); } -- cgit v0.10.2 From 101ad5c65ef2631f402643c760574a6f0f7f64d0 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:13 +1000 Subject: powerpc/mm: Make a copy of pgalloc.h for 32 and 64 book3s This patch start to make a book3s variant for pgalloc headers. We have multiple book3s specific changes such as: * 4 level page table * store physical address in higher level table * use pte_t * for pgtable_t Having a book3s64 specific variant helps to keep code simpler and remove lots of #ifdef around code. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h new file mode 100644 index 0000000..76d6b9e --- /dev/null +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -0,0 +1,109 @@ +#ifndef _ASM_POWERPC_PGALLOC_32_H +#define _ASM_POWERPC_PGALLOC_32_H + +#include + +/* For 32-bit, all levels of page tables are just drawn from get_free_page() */ +#define MAX_PGTABLE_INDEX_SIZE 0 + +extern void __bad_pte(pmd_t *pmd); + +extern pgd_t *pgd_alloc(struct mm_struct *mm); +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); + +/* + * We don't have any real pmd's, and this code never triggers because + * the pgd will always be present.. + */ +/* #define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) */ +#define pmd_free(mm, x) do { } while (0) +#define __pmd_free_tlb(tlb,x,a) do { } while (0) +/* #define pgd_populate(mm, pmd, pte) BUG() */ + +#ifndef CONFIG_BOOKE + +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, + pte_t *pte) +{ + *pmdp = __pmd(__pa(pte) | _PMD_PRESENT); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pte_page) +{ + *pmdp = __pmd((page_to_pfn(pte_page) << PAGE_SHIFT) | _PMD_PRESENT); +} + +#define pmd_pgtable(pmd) pmd_page(pmd) +#else + +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, + pte_t *pte) +{ + *pmdp = __pmd((unsigned long)pte | _PMD_PRESENT); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pte_page) +{ + *pmdp = __pmd((unsigned long)lowmem_page_address(pte_page) | _PMD_PRESENT); +} + +#define pmd_pgtable(pmd) pmd_page(pmd) +#endif + +extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr); +extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr); + +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + free_page((unsigned long)pte); +} + +static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) +{ + pgtable_page_dtor(ptepage); + __free_page(ptepage); +} + +static inline void pgtable_free(void *table, unsigned index_size) +{ + BUG_ON(index_size); /* 32-bit doesn't use this */ + free_page((unsigned long)table); +} + +#define check_pgt_cache() do { } while (0) + +#ifdef CONFIG_SMP +static inline void pgtable_free_tlb(struct mmu_gather *tlb, + void *table, int shift) +{ + unsigned long pgf = (unsigned long)table; + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); + pgf |= shift; + tlb_remove_table(tlb, (void *)pgf); +} + +static inline void __tlb_remove_table(void *_table) +{ + void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); + unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; + + pgtable_free(table, shift); +} +#else +static inline void pgtable_free_tlb(struct mmu_gather *tlb, + void *table, int shift) +{ + pgtable_free(table, shift); +} +#endif + +static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, + unsigned long address) +{ + tlb_flush_pgtable(tlb, address); + pgtable_page_dtor(table); + pgtable_free_tlb(tlb, page_address(table), 0); +} +#endif /* _ASM_POWERPC_PGALLOC_32_H */ diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h new file mode 100644 index 0000000..8d5fc3a --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -0,0 +1,266 @@ +#ifndef _ASM_POWERPC_PGALLOC_64_H +#define _ASM_POWERPC_PGALLOC_64_H +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +struct vmemmap_backing { + struct vmemmap_backing *list; + unsigned long phys; + unsigned long virt_addr; +}; +extern struct vmemmap_backing *vmemmap_list; + +/* + * Functions that deal with pagetables that could be at any level of + * the table need to be passed an "index_size" so they know how to + * handle allocation. For PTE pages (which are linked to a struct + * page for now, and drawn from the main get_free_pages() pool), the + * allocation size will be (2^index_size * sizeof(pointer)) and + * allocations are drawn from the kmem_cache in PGT_CACHE(index_size). + * + * The maximum index size needs to be big enough to allow any + * pagetable sizes we need, but small enough to fit in the low bits of + * any page table pointer. In other words all pagetables, even tiny + * ones, must be aligned to allow at least enough low 0 bits to + * contain this value. This value is also used as a mask, so it must + * be one less than a power of two. + */ +#define MAX_PGTABLE_INDEX_SIZE 0xf + +extern struct kmem_cache *pgtable_cache[]; +#define PGT_CACHE(shift) ({ \ + BUG_ON(!(shift)); \ + pgtable_cache[(shift) - 1]; \ + }) + +static inline pgd_t *pgd_alloc(struct mm_struct *mm) +{ + return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL); +} + +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd); +} + +#ifndef CONFIG_PPC_64K_PAGES + +#define pgd_populate(MM, PGD, PUD) pgd_set(PGD, __pgtable_ptr_val(PUD)) + +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), + GFP_KERNEL|__GFP_REPEAT); +} + +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud); +} + +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + pud_set(pud, __pgtable_ptr_val(pmd)); +} + +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, + pte_t *pte) +{ + pmd_set(pmd, __pgtable_ptr_val(pte)); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, + pgtable_t pte_page) +{ + pmd_set(pmd, __pgtable_ptr_val(page_address(pte_page))); +} + +#define pmd_pgtable(pmd) pmd_page(pmd) + +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, + unsigned long address) +{ + return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); +} + +static inline pgtable_t pte_alloc_one(struct mm_struct *mm, + unsigned long address) +{ + struct page *page; + pte_t *pte; + + pte = pte_alloc_one_kernel(mm, address); + if (!pte) + return NULL; + page = virt_to_page(pte); + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } + return page; +} + +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + free_page((unsigned long)pte); +} + +static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) +{ + pgtable_page_dtor(ptepage); + __free_page(ptepage); +} + +static inline void pgtable_free(void *table, unsigned index_size) +{ + if (!index_size) + free_page((unsigned long)table); + else { + BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE); + kmem_cache_free(PGT_CACHE(index_size), table); + } +} + +#ifdef CONFIG_SMP +static inline void pgtable_free_tlb(struct mmu_gather *tlb, + void *table, int shift) +{ + unsigned long pgf = (unsigned long)table; + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); + pgf |= shift; + tlb_remove_table(tlb, (void *)pgf); +} + +static inline void __tlb_remove_table(void *_table) +{ + void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); + unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; + + pgtable_free(table, shift); +} +#else /* !CONFIG_SMP */ +static inline void pgtable_free_tlb(struct mmu_gather *tlb, + void *table, int shift) +{ + pgtable_free(table, shift); +} +#endif /* CONFIG_SMP */ + +static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, + unsigned long address) +{ + tlb_flush_pgtable(tlb, address); + pgtable_page_dtor(table); + pgtable_free_tlb(tlb, page_address(table), 0); +} + +#else /* if CONFIG_PPC_64K_PAGES */ + +extern pte_t *page_table_alloc(struct mm_struct *, unsigned long, int); +extern void page_table_free(struct mm_struct *, unsigned long *, int); +extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); +#ifdef CONFIG_SMP +extern void __tlb_remove_table(void *_table); +#endif + +#ifndef __PAGETABLE_PUD_FOLDED +/* book3s 64 is 4 level page table */ +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +{ + pgd_set(pgd, __pgtable_ptr_val(pud)); +} + +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), + GFP_KERNEL|__GFP_REPEAT); +} + +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud); +} +#endif + +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + pud_set(pud, __pgtable_ptr_val(pmd)); +} + +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, + pte_t *pte) +{ + pmd_set(pmd, __pgtable_ptr_val(pte)); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, + pgtable_t pte_page) +{ + pmd_set(pmd, __pgtable_ptr_val(pte_page)); +} + +static inline pgtable_t pmd_pgtable(pmd_t pmd) +{ + return (pgtable_t)pmd_page_vaddr(pmd); +} + +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, + unsigned long address) +{ + return (pte_t *)page_table_alloc(mm, address, 1); +} + +static inline pgtable_t pte_alloc_one(struct mm_struct *mm, + unsigned long address) +{ + return (pgtable_t)page_table_alloc(mm, address, 0); +} + +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + page_table_free(mm, (unsigned long *)pte, 1); +} + +static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) +{ + page_table_free(mm, (unsigned long *)ptepage, 0); +} + +static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, + unsigned long address) +{ + tlb_flush_pgtable(tlb, address); + pgtable_free_tlb(tlb, table, 0); +} +#endif /* CONFIG_PPC_64K_PAGES */ + +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), + GFP_KERNEL|__GFP_REPEAT); +} + +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) +{ + kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd); +} + +#define __pmd_free_tlb(tlb, pmd, addr) \ + pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX) +#ifndef __PAGETABLE_PUD_FOLDED +#define __pud_free_tlb(tlb, pud, addr) \ + pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE) + +#endif /* __PAGETABLE_PUD_FOLDED */ + +#define check_pgt_cache() do { } while (0) + +#endif /* _ASM_POWERPC_PGALLOC_64_H */ -- cgit v0.10.2 From 75a9b8a6c2533064a8d5d090ada7bf9e4c4c84c1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:14 +1000 Subject: powerpc/mm: Copy pgalloc (part 2) This moves the nohash variant of pgalloc headers to nohash/ directory Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index 76d6b9e..a235019 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -1,5 +1,5 @@ -#ifndef _ASM_POWERPC_PGALLOC_32_H -#define _ASM_POWERPC_PGALLOC_32_H +#ifndef _ASM_POWERPC_BOOK3S_32_PGALLOC_H +#define _ASM_POWERPC_BOOK3S_32_PGALLOC_H #include @@ -106,4 +106,4 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, pgtable_page_dtor(table); pgtable_free_tlb(tlb, page_address(table), 0); } -#endif /* _ASM_POWERPC_PGALLOC_32_H */ +#endif /* _ASM_POWERPC_BOOK3S_32_PGALLOC_H */ diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index 8d5fc3a..5401726 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -1,5 +1,5 @@ -#ifndef _ASM_POWERPC_PGALLOC_64_H -#define _ASM_POWERPC_PGALLOC_64_H +#ifndef _ASM_POWERPC_BOOK3S_64_PGALLOC_H +#define _ASM_POWERPC_BOOK3S_64_PGALLOC_H /* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -52,8 +52,10 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) } #ifndef CONFIG_PPC_64K_PAGES - -#define pgd_populate(MM, PGD, PUD) pgd_set(PGD, __pgtable_ptr_val(PUD)) +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +{ + pgd_set(pgd, __pgtable_ptr_val(pud)); +} static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { @@ -83,7 +85,10 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pmd_set(pmd, __pgtable_ptr_val(page_address(pte_page))); } -#define pmd_pgtable(pmd) pmd_page(pmd) +static inline pgtable_t pmd_pgtable(pmd_t pmd) +{ + return pmd_page(pmd); +} static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) @@ -263,4 +268,4 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #define check_pgt_cache() do { } while (0) -#endif /* _ASM_POWERPC_PGALLOC_64_H */ +#endif /* _ASM_POWERPC_BOOK3S_64_PGALLOC_H */ diff --git a/arch/powerpc/include/asm/book3s/pgalloc.h b/arch/powerpc/include/asm/book3s/pgalloc.h new file mode 100644 index 0000000..54f591e --- /dev/null +++ b/arch/powerpc/include/asm/book3s/pgalloc.h @@ -0,0 +1,19 @@ +#ifndef _ASM_POWERPC_BOOK3S_PGALLOC_H +#define _ASM_POWERPC_BOOK3S_PGALLOC_H + +#include + +extern void tlb_remove_table(struct mmu_gather *tlb, void *table); +static inline void tlb_flush_pgtable(struct mmu_gather *tlb, + unsigned long address) +{ + +} + +#ifdef CONFIG_PPC64 +#include +#else +#include +#endif + +#endif /* _ASM_POWERPC_BOOK3S_PGALLOC_H */ diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h new file mode 100644 index 0000000..76d6b9e --- /dev/null +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -0,0 +1,109 @@ +#ifndef _ASM_POWERPC_PGALLOC_32_H +#define _ASM_POWERPC_PGALLOC_32_H + +#include + +/* For 32-bit, all levels of page tables are just drawn from get_free_page() */ +#define MAX_PGTABLE_INDEX_SIZE 0 + +extern void __bad_pte(pmd_t *pmd); + +extern pgd_t *pgd_alloc(struct mm_struct *mm); +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); + +/* + * We don't have any real pmd's, and this code never triggers because + * the pgd will always be present.. + */ +/* #define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) */ +#define pmd_free(mm, x) do { } while (0) +#define __pmd_free_tlb(tlb,x,a) do { } while (0) +/* #define pgd_populate(mm, pmd, pte) BUG() */ + +#ifndef CONFIG_BOOKE + +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, + pte_t *pte) +{ + *pmdp = __pmd(__pa(pte) | _PMD_PRESENT); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pte_page) +{ + *pmdp = __pmd((page_to_pfn(pte_page) << PAGE_SHIFT) | _PMD_PRESENT); +} + +#define pmd_pgtable(pmd) pmd_page(pmd) +#else + +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, + pte_t *pte) +{ + *pmdp = __pmd((unsigned long)pte | _PMD_PRESENT); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pte_page) +{ + *pmdp = __pmd((unsigned long)lowmem_page_address(pte_page) | _PMD_PRESENT); +} + +#define pmd_pgtable(pmd) pmd_page(pmd) +#endif + +extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr); +extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr); + +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + free_page((unsigned long)pte); +} + +static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) +{ + pgtable_page_dtor(ptepage); + __free_page(ptepage); +} + +static inline void pgtable_free(void *table, unsigned index_size) +{ + BUG_ON(index_size); /* 32-bit doesn't use this */ + free_page((unsigned long)table); +} + +#define check_pgt_cache() do { } while (0) + +#ifdef CONFIG_SMP +static inline void pgtable_free_tlb(struct mmu_gather *tlb, + void *table, int shift) +{ + unsigned long pgf = (unsigned long)table; + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); + pgf |= shift; + tlb_remove_table(tlb, (void *)pgf); +} + +static inline void __tlb_remove_table(void *_table) +{ + void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); + unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; + + pgtable_free(table, shift); +} +#else +static inline void pgtable_free_tlb(struct mmu_gather *tlb, + void *table, int shift) +{ + pgtable_free(table, shift); +} +#endif + +static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, + unsigned long address) +{ + tlb_flush_pgtable(tlb, address); + pgtable_page_dtor(table); + pgtable_free_tlb(tlb, page_address(table), 0); +} +#endif /* _ASM_POWERPC_PGALLOC_32_H */ diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h new file mode 100644 index 0000000..8d5fc3a --- /dev/null +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -0,0 +1,266 @@ +#ifndef _ASM_POWERPC_PGALLOC_64_H +#define _ASM_POWERPC_PGALLOC_64_H +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +struct vmemmap_backing { + struct vmemmap_backing *list; + unsigned long phys; + unsigned long virt_addr; +}; +extern struct vmemmap_backing *vmemmap_list; + +/* + * Functions that deal with pagetables that could be at any level of + * the table need to be passed an "index_size" so they know how to + * handle allocation. For PTE pages (which are linked to a struct + * page for now, and drawn from the main get_free_pages() pool), the + * allocation size will be (2^index_size * sizeof(pointer)) and + * allocations are drawn from the kmem_cache in PGT_CACHE(index_size). + * + * The maximum index size needs to be big enough to allow any + * pagetable sizes we need, but small enough to fit in the low bits of + * any page table pointer. In other words all pagetables, even tiny + * ones, must be aligned to allow at least enough low 0 bits to + * contain this value. This value is also used as a mask, so it must + * be one less than a power of two. + */ +#define MAX_PGTABLE_INDEX_SIZE 0xf + +extern struct kmem_cache *pgtable_cache[]; +#define PGT_CACHE(shift) ({ \ + BUG_ON(!(shift)); \ + pgtable_cache[(shift) - 1]; \ + }) + +static inline pgd_t *pgd_alloc(struct mm_struct *mm) +{ + return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL); +} + +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd); +} + +#ifndef CONFIG_PPC_64K_PAGES + +#define pgd_populate(MM, PGD, PUD) pgd_set(PGD, __pgtable_ptr_val(PUD)) + +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), + GFP_KERNEL|__GFP_REPEAT); +} + +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud); +} + +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + pud_set(pud, __pgtable_ptr_val(pmd)); +} + +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, + pte_t *pte) +{ + pmd_set(pmd, __pgtable_ptr_val(pte)); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, + pgtable_t pte_page) +{ + pmd_set(pmd, __pgtable_ptr_val(page_address(pte_page))); +} + +#define pmd_pgtable(pmd) pmd_page(pmd) + +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, + unsigned long address) +{ + return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); +} + +static inline pgtable_t pte_alloc_one(struct mm_struct *mm, + unsigned long address) +{ + struct page *page; + pte_t *pte; + + pte = pte_alloc_one_kernel(mm, address); + if (!pte) + return NULL; + page = virt_to_page(pte); + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } + return page; +} + +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + free_page((unsigned long)pte); +} + +static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) +{ + pgtable_page_dtor(ptepage); + __free_page(ptepage); +} + +static inline void pgtable_free(void *table, unsigned index_size) +{ + if (!index_size) + free_page((unsigned long)table); + else { + BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE); + kmem_cache_free(PGT_CACHE(index_size), table); + } +} + +#ifdef CONFIG_SMP +static inline void pgtable_free_tlb(struct mmu_gather *tlb, + void *table, int shift) +{ + unsigned long pgf = (unsigned long)table; + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); + pgf |= shift; + tlb_remove_table(tlb, (void *)pgf); +} + +static inline void __tlb_remove_table(void *_table) +{ + void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); + unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; + + pgtable_free(table, shift); +} +#else /* !CONFIG_SMP */ +static inline void pgtable_free_tlb(struct mmu_gather *tlb, + void *table, int shift) +{ + pgtable_free(table, shift); +} +#endif /* CONFIG_SMP */ + +static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, + unsigned long address) +{ + tlb_flush_pgtable(tlb, address); + pgtable_page_dtor(table); + pgtable_free_tlb(tlb, page_address(table), 0); +} + +#else /* if CONFIG_PPC_64K_PAGES */ + +extern pte_t *page_table_alloc(struct mm_struct *, unsigned long, int); +extern void page_table_free(struct mm_struct *, unsigned long *, int); +extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); +#ifdef CONFIG_SMP +extern void __tlb_remove_table(void *_table); +#endif + +#ifndef __PAGETABLE_PUD_FOLDED +/* book3s 64 is 4 level page table */ +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +{ + pgd_set(pgd, __pgtable_ptr_val(pud)); +} + +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), + GFP_KERNEL|__GFP_REPEAT); +} + +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud); +} +#endif + +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + pud_set(pud, __pgtable_ptr_val(pmd)); +} + +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, + pte_t *pte) +{ + pmd_set(pmd, __pgtable_ptr_val(pte)); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, + pgtable_t pte_page) +{ + pmd_set(pmd, __pgtable_ptr_val(pte_page)); +} + +static inline pgtable_t pmd_pgtable(pmd_t pmd) +{ + return (pgtable_t)pmd_page_vaddr(pmd); +} + +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, + unsigned long address) +{ + return (pte_t *)page_table_alloc(mm, address, 1); +} + +static inline pgtable_t pte_alloc_one(struct mm_struct *mm, + unsigned long address) +{ + return (pgtable_t)page_table_alloc(mm, address, 0); +} + +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + page_table_free(mm, (unsigned long *)pte, 1); +} + +static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) +{ + page_table_free(mm, (unsigned long *)ptepage, 0); +} + +static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, + unsigned long address) +{ + tlb_flush_pgtable(tlb, address); + pgtable_free_tlb(tlb, table, 0); +} +#endif /* CONFIG_PPC_64K_PAGES */ + +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), + GFP_KERNEL|__GFP_REPEAT); +} + +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) +{ + kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd); +} + +#define __pmd_free_tlb(tlb, pmd, addr) \ + pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX) +#ifndef __PAGETABLE_PUD_FOLDED +#define __pud_free_tlb(tlb, pud, addr) \ + pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE) + +#endif /* __PAGETABLE_PUD_FOLDED */ + +#define check_pgt_cache() do { } while (0) + +#endif /* _ASM_POWERPC_PGALLOC_64_H */ diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h b/arch/powerpc/include/asm/nohash/pgalloc.h new file mode 100644 index 0000000..b39ec95 --- /dev/null +++ b/arch/powerpc/include/asm/nohash/pgalloc.h @@ -0,0 +1,23 @@ +#ifndef _ASM_POWERPC_NOHASH_PGALLOC_H +#define _ASM_POWERPC_NOHASH_PGALLOC_H + +#include + +extern void tlb_remove_table(struct mmu_gather *tlb, void *table); +#ifdef CONFIG_PPC64 +extern void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address); +#else +/* 44x etc which is BOOKE not BOOK3E */ +static inline void tlb_flush_pgtable(struct mmu_gather *tlb, + unsigned long address) +{ + +} +#endif /* !CONFIG_PPC_BOOK3E */ + +#ifdef CONFIG_PPC64 +#include +#else +#include +#endif +#endif /* _ASM_POWERPC_NOHASH_PGALLOC_H */ diff --git a/arch/powerpc/include/asm/pgalloc-32.h b/arch/powerpc/include/asm/pgalloc-32.h deleted file mode 100644 index 76d6b9e..0000000 --- a/arch/powerpc/include/asm/pgalloc-32.h +++ /dev/null @@ -1,109 +0,0 @@ -#ifndef _ASM_POWERPC_PGALLOC_32_H -#define _ASM_POWERPC_PGALLOC_32_H - -#include - -/* For 32-bit, all levels of page tables are just drawn from get_free_page() */ -#define MAX_PGTABLE_INDEX_SIZE 0 - -extern void __bad_pte(pmd_t *pmd); - -extern pgd_t *pgd_alloc(struct mm_struct *mm); -extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); - -/* - * We don't have any real pmd's, and this code never triggers because - * the pgd will always be present.. - */ -/* #define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) */ -#define pmd_free(mm, x) do { } while (0) -#define __pmd_free_tlb(tlb,x,a) do { } while (0) -/* #define pgd_populate(mm, pmd, pte) BUG() */ - -#ifndef CONFIG_BOOKE - -static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, - pte_t *pte) -{ - *pmdp = __pmd(__pa(pte) | _PMD_PRESENT); -} - -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pte_page) -{ - *pmdp = __pmd((page_to_pfn(pte_page) << PAGE_SHIFT) | _PMD_PRESENT); -} - -#define pmd_pgtable(pmd) pmd_page(pmd) -#else - -static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, - pte_t *pte) -{ - *pmdp = __pmd((unsigned long)pte | _PMD_PRESENT); -} - -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pte_page) -{ - *pmdp = __pmd((unsigned long)lowmem_page_address(pte_page) | _PMD_PRESENT); -} - -#define pmd_pgtable(pmd) pmd_page(pmd) -#endif - -extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr); -extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr); - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long)pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) -{ - pgtable_page_dtor(ptepage); - __free_page(ptepage); -} - -static inline void pgtable_free(void *table, unsigned index_size) -{ - BUG_ON(index_size); /* 32-bit doesn't use this */ - free_page((unsigned long)table); -} - -#define check_pgt_cache() do { } while (0) - -#ifdef CONFIG_SMP -static inline void pgtable_free_tlb(struct mmu_gather *tlb, - void *table, int shift) -{ - unsigned long pgf = (unsigned long)table; - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); - pgf |= shift; - tlb_remove_table(tlb, (void *)pgf); -} - -static inline void __tlb_remove_table(void *_table) -{ - void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); - unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; - - pgtable_free(table, shift); -} -#else -static inline void pgtable_free_tlb(struct mmu_gather *tlb, - void *table, int shift) -{ - pgtable_free(table, shift); -} -#endif - -static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, - unsigned long address) -{ - tlb_flush_pgtable(tlb, address); - pgtable_page_dtor(table); - pgtable_free_tlb(tlb, page_address(table), 0); -} -#endif /* _ASM_POWERPC_PGALLOC_32_H */ diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h deleted file mode 100644 index 8d5fc3a..0000000 --- a/arch/powerpc/include/asm/pgalloc-64.h +++ /dev/null @@ -1,266 +0,0 @@ -#ifndef _ASM_POWERPC_PGALLOC_64_H -#define _ASM_POWERPC_PGALLOC_64_H -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include - -struct vmemmap_backing { - struct vmemmap_backing *list; - unsigned long phys; - unsigned long virt_addr; -}; -extern struct vmemmap_backing *vmemmap_list; - -/* - * Functions that deal with pagetables that could be at any level of - * the table need to be passed an "index_size" so they know how to - * handle allocation. For PTE pages (which are linked to a struct - * page for now, and drawn from the main get_free_pages() pool), the - * allocation size will be (2^index_size * sizeof(pointer)) and - * allocations are drawn from the kmem_cache in PGT_CACHE(index_size). - * - * The maximum index size needs to be big enough to allow any - * pagetable sizes we need, but small enough to fit in the low bits of - * any page table pointer. In other words all pagetables, even tiny - * ones, must be aligned to allow at least enough low 0 bits to - * contain this value. This value is also used as a mask, so it must - * be one less than a power of two. - */ -#define MAX_PGTABLE_INDEX_SIZE 0xf - -extern struct kmem_cache *pgtable_cache[]; -#define PGT_CACHE(shift) ({ \ - BUG_ON(!(shift)); \ - pgtable_cache[(shift) - 1]; \ - }) - -static inline pgd_t *pgd_alloc(struct mm_struct *mm) -{ - return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL); -} - -static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd); -} - -#ifndef CONFIG_PPC_64K_PAGES - -#define pgd_populate(MM, PGD, PUD) pgd_set(PGD, __pgtable_ptr_val(PUD)) - -static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), - GFP_KERNEL|__GFP_REPEAT); -} - -static inline void pud_free(struct mm_struct *mm, pud_t *pud) -{ - kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud); -} - -static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) -{ - pud_set(pud, __pgtable_ptr_val(pmd)); -} - -static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, - pte_t *pte) -{ - pmd_set(pmd, __pgtable_ptr_val(pte)); -} - -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, - pgtable_t pte_page) -{ - pmd_set(pmd, __pgtable_ptr_val(page_address(pte_page))); -} - -#define pmd_pgtable(pmd) pmd_page(pmd) - -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) -{ - return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); -} - -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address) -{ - struct page *page; - pte_t *pte; - - pte = pte_alloc_one_kernel(mm, address); - if (!pte) - return NULL; - page = virt_to_page(pte); - if (!pgtable_page_ctor(page)) { - __free_page(page); - return NULL; - } - return page; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long)pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) -{ - pgtable_page_dtor(ptepage); - __free_page(ptepage); -} - -static inline void pgtable_free(void *table, unsigned index_size) -{ - if (!index_size) - free_page((unsigned long)table); - else { - BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE); - kmem_cache_free(PGT_CACHE(index_size), table); - } -} - -#ifdef CONFIG_SMP -static inline void pgtable_free_tlb(struct mmu_gather *tlb, - void *table, int shift) -{ - unsigned long pgf = (unsigned long)table; - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); - pgf |= shift; - tlb_remove_table(tlb, (void *)pgf); -} - -static inline void __tlb_remove_table(void *_table) -{ - void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); - unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; - - pgtable_free(table, shift); -} -#else /* !CONFIG_SMP */ -static inline void pgtable_free_tlb(struct mmu_gather *tlb, - void *table, int shift) -{ - pgtable_free(table, shift); -} -#endif /* CONFIG_SMP */ - -static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, - unsigned long address) -{ - tlb_flush_pgtable(tlb, address); - pgtable_page_dtor(table); - pgtable_free_tlb(tlb, page_address(table), 0); -} - -#else /* if CONFIG_PPC_64K_PAGES */ - -extern pte_t *page_table_alloc(struct mm_struct *, unsigned long, int); -extern void page_table_free(struct mm_struct *, unsigned long *, int); -extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); -#ifdef CONFIG_SMP -extern void __tlb_remove_table(void *_table); -#endif - -#ifndef __PAGETABLE_PUD_FOLDED -/* book3s 64 is 4 level page table */ -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) -{ - pgd_set(pgd, __pgtable_ptr_val(pud)); -} - -static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), - GFP_KERNEL|__GFP_REPEAT); -} - -static inline void pud_free(struct mm_struct *mm, pud_t *pud) -{ - kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud); -} -#endif - -static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) -{ - pud_set(pud, __pgtable_ptr_val(pmd)); -} - -static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, - pte_t *pte) -{ - pmd_set(pmd, __pgtable_ptr_val(pte)); -} - -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, - pgtable_t pte_page) -{ - pmd_set(pmd, __pgtable_ptr_val(pte_page)); -} - -static inline pgtable_t pmd_pgtable(pmd_t pmd) -{ - return (pgtable_t)pmd_page_vaddr(pmd); -} - -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, - unsigned long address) -{ - return (pte_t *)page_table_alloc(mm, address, 1); -} - -static inline pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address) -{ - return (pgtable_t)page_table_alloc(mm, address, 0); -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - page_table_free(mm, (unsigned long *)pte, 1); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) -{ - page_table_free(mm, (unsigned long *)ptepage, 0); -} - -static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, - unsigned long address) -{ - tlb_flush_pgtable(tlb, address); - pgtable_free_tlb(tlb, table, 0); -} -#endif /* CONFIG_PPC_64K_PAGES */ - -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), - GFP_KERNEL|__GFP_REPEAT); -} - -static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) -{ - kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd); -} - -#define __pmd_free_tlb(tlb, pmd, addr) \ - pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX) -#ifndef __PAGETABLE_PUD_FOLDED -#define __pud_free_tlb(tlb, pud, addr) \ - pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE) - -#endif /* __PAGETABLE_PUD_FOLDED */ - -#define check_pgt_cache() do { } while (0) - -#endif /* _ASM_POWERPC_PGALLOC_64_H */ diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index fc3ee06..0413457 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -1,25 +1,12 @@ #ifndef _ASM_POWERPC_PGALLOC_H #define _ASM_POWERPC_PGALLOC_H -#ifdef __KERNEL__ #include -#ifdef CONFIG_PPC_BOOK3E -extern void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address); -#else /* CONFIG_PPC_BOOK3E */ -static inline void tlb_flush_pgtable(struct mmu_gather *tlb, - unsigned long address) -{ -} -#endif /* !CONFIG_PPC_BOOK3E */ - -extern void tlb_remove_table(struct mmu_gather *tlb, void *table); - -#ifdef CONFIG_PPC64 -#include +#ifdef CONFIG_PPC_BOOK3S +#include #else -#include +#include #endif -#endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_PGALLOC_H */ -- cgit v0.10.2 From 27209206a60fa6ffc92e8d9886ae32abaad5b442 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:15 +1000 Subject: powerpc/mm: Revert changes made to nohash pgalloc-64.h This reverts pgalloc related changes WRT implementing 4-level page table for 64K Linux page size and storing of physical address in higher level page tables since they are only applicable to book3s64 variant and we now have a separate copy for book3s64. This helps to keep these headers simpler. Cc: Scott Wood Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index 8d5fc3a..69ef28a 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -53,7 +53,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) #ifndef CONFIG_PPC_64K_PAGES -#define pgd_populate(MM, PGD, PUD) pgd_set(PGD, __pgtable_ptr_val(PUD)) +#define pgd_populate(MM, PGD, PUD) pgd_set(PGD, (unsigned long)PUD) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { @@ -68,19 +68,19 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { - pud_set(pud, __pgtable_ptr_val(pmd)); + pud_set(pud, (unsigned long)pmd); } static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { - pmd_set(pmd, __pgtable_ptr_val(pte)); + pmd_set(pmd, (unsigned long)pte); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_page) { - pmd_set(pmd, __pgtable_ptr_val(page_address(pte_page))); + pmd_set(pmd, (unsigned long)page_address(pte_page)); } #define pmd_pgtable(pmd) pmd_page(pmd) @@ -171,45 +171,23 @@ extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); extern void __tlb_remove_table(void *_table); #endif -#ifndef __PAGETABLE_PUD_FOLDED -/* book3s 64 is 4 level page table */ -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) -{ - pgd_set(pgd, __pgtable_ptr_val(pud)); -} - -static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), - GFP_KERNEL|__GFP_REPEAT); -} - -static inline void pud_free(struct mm_struct *mm, pud_t *pud) -{ - kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud); -} -#endif - -static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) -{ - pud_set(pud, __pgtable_ptr_val(pmd)); -} +#define pud_populate(mm, pud, pmd) pud_set(pud, (unsigned long)pmd) static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { - pmd_set(pmd, __pgtable_ptr_val(pte)); + pmd_set(pmd, (unsigned long)pte); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_page) { - pmd_set(pmd, __pgtable_ptr_val(pte_page)); + pmd_set(pmd, (unsigned long)pte_page); } static inline pgtable_t pmd_pgtable(pmd_t pmd) { - return (pgtable_t)pmd_page_vaddr(pmd); + return (pgtable_t)(pmd_val(pmd) & ~PMD_MASKED_BITS); } static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, @@ -255,11 +233,11 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #define __pmd_free_tlb(tlb, pmd, addr) \ pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX) -#ifndef __PAGETABLE_PUD_FOLDED +#ifndef CONFIG_PPC_64K_PAGES #define __pud_free_tlb(tlb, pud, addr) \ pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE) -#endif /* __PAGETABLE_PUD_FOLDED */ +#endif /* CONFIG_PPC_64K_PAGES */ #define check_pgt_cache() do { } while (0) diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index f143d6f..d4d808c 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -108,9 +108,6 @@ #ifndef __ASSEMBLY__ /* pte_clear moved to later in this file */ -/* Pointers in the page table tree are virtual addresses */ -#define __pgtable_ptr_val(ptr) ((unsigned long)(ptr)) - #define PMD_BAD_BITS (PTE_TABLE_SIZE-1) #define PUD_BAD_BITS (PMD_TABLE_SIZE-1) -- cgit v0.10.2 From bcbe7f777eb396f9bbf4cb1cab3b206f50881191 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:16 +1000 Subject: powerpc/mm: Simplify the code dropping 4-level table #ifdef Simplify the code by dropping 4-level page table #ifdef. We are always 4-level now. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index 5401726..2a24624 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -51,7 +51,6 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd); } -#ifndef CONFIG_PPC_64K_PAGES static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) { pgd_set(pgd, __pgtable_ptr_val(pud)); @@ -78,7 +77,13 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, { pmd_set(pmd, __pgtable_ptr_val(pte)); } - +/* + * FIXME!! + * Between 4K and 64K pages, we differ in what is stored in pmd. ie. + * typedef pte_t *pgtable_t; -> 64K + * typedef struct page *pgtable_t; -> 4k + */ +#ifdef CONFIG_PPC_4K_PAGES static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_page) { @@ -176,36 +181,6 @@ extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); extern void __tlb_remove_table(void *_table); #endif -#ifndef __PAGETABLE_PUD_FOLDED -/* book3s 64 is 4 level page table */ -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) -{ - pgd_set(pgd, __pgtable_ptr_val(pud)); -} - -static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), - GFP_KERNEL|__GFP_REPEAT); -} - -static inline void pud_free(struct mm_struct *mm, pud_t *pud) -{ - kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud); -} -#endif - -static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) -{ - pud_set(pud, __pgtable_ptr_val(pmd)); -} - -static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, - pte_t *pte) -{ - pmd_set(pmd, __pgtable_ptr_val(pte)); -} - static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_page) { @@ -245,7 +220,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, tlb_flush_pgtable(tlb, address); pgtable_free_tlb(tlb, table, 0); } -#endif /* CONFIG_PPC_64K_PAGES */ +#endif /* CONFIG_PPC_4K_PAGES */ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { @@ -258,13 +233,17 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd); } -#define __pmd_free_tlb(tlb, pmd, addr) \ - pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX) -#ifndef __PAGETABLE_PUD_FOLDED -#define __pud_free_tlb(tlb, pud, addr) \ - pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE) +static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long address) +{ + return pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX); +} -#endif /* __PAGETABLE_PUD_FOLDED */ +static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, + unsigned long address) +{ + pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE); +} #define check_pgt_cache() do { } while (0) -- cgit v0.10.2 From 74701d5947a6fb38ece37c1ff1e5a77c36ee7b9c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:17 +1000 Subject: powerpc/mm: Rename function to indicate we are allocating fragments Only code cleanup. No functionality change. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index 2a24624..37283e3 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -174,8 +174,8 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, #else /* if CONFIG_PPC_64K_PAGES */ -extern pte_t *page_table_alloc(struct mm_struct *, unsigned long, int); -extern void page_table_free(struct mm_struct *, unsigned long *, int); +extern pte_t *pte_fragment_alloc(struct mm_struct *, unsigned long, int); +extern void pte_fragment_free(unsigned long *, int); extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); #ifdef CONFIG_SMP extern void __tlb_remove_table(void *_table); @@ -195,23 +195,23 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd) static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return (pte_t *)page_table_alloc(mm, address, 1); + return (pte_t *)pte_fragment_alloc(mm, address, 1); } static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { - return (pgtable_t)page_table_alloc(mm, address, 0); + return (pgtable_t)pte_fragment_alloc(mm, address, 0); } static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - page_table_free(mm, (unsigned long *)pte, 1); + pte_fragment_free((unsigned long *)pte, 1); } static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) { - page_table_free(mm, (unsigned long *)ptepage, 0); + pte_fragment_free((unsigned long *)ptepage, 0); } static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index 69ef28a..be0cce7 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -164,8 +164,8 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, #else /* if CONFIG_PPC_64K_PAGES */ -extern pte_t *page_table_alloc(struct mm_struct *, unsigned long, int); -extern void page_table_free(struct mm_struct *, unsigned long *, int); +extern pte_t *pte_fragment_alloc(struct mm_struct *, unsigned long, int); +extern void pte_fragment_free(unsigned long *, int); extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); #ifdef CONFIG_SMP extern void __tlb_remove_table(void *_table); @@ -193,23 +193,23 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd) static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return (pte_t *)page_table_alloc(mm, address, 1); + return (pte_t *)pte_fragment_alloc(mm, address, 1); } static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { - return (pgtable_t)page_table_alloc(mm, address, 0); + return (pgtable_t)pte_fragment_alloc(mm, address, 0); } static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - page_table_free(mm, (unsigned long *)pte, 1); + pte_fragment_fre((unsigned long *)pte, 1); } static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) { - page_table_free(mm, (unsigned long *)ptepage, 0); + pte_fragment_free((unsigned long *)ptepage, 0); } static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index dafcd9f..a9a2238 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -357,7 +357,7 @@ static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel) return (pte_t *)ret; } -pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) +pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) { pte_t *pte; @@ -368,7 +368,7 @@ pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) return __alloc_for_cache(mm, kernel); } -void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel) +void pte_fragment_free(unsigned long *table, int kernel) { struct page *page = virt_to_page(table); if (put_page_testzero(page)) { @@ -379,15 +379,6 @@ void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel) } #ifdef CONFIG_SMP -static void page_table_free_rcu(void *table) -{ - struct page *page = virt_to_page(table); - if (put_page_testzero(page)) { - pgtable_page_dtor(page); - free_hot_cold_page(page, 0); - } -} - void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) { unsigned long pgf = (unsigned long)table; @@ -404,7 +395,7 @@ void __tlb_remove_table(void *_table) if (!shift) /* PTE page needs special handling */ - page_table_free_rcu(table); + pte_fragment_free(table, 0); else { BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); kmem_cache_free(PGT_CACHE(shift), table); @@ -415,11 +406,7 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) { if (!shift) { /* PTE page needs special handling */ - struct page *page = virt_to_page(table); - if (put_page_testzero(page)) { - pgtable_page_dtor(page); - free_hot_cold_page(page, 0); - } + pte_fragment_free(table, 0); } else { BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); kmem_cache_free(PGT_CACHE(shift), table); -- cgit v0.10.2 From 934828edfadc43be07e53429ce501741bedf4a5e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:18 +1000 Subject: powerpc/mm: Make 4K and 64K use pte_t for pgtable_t This patch switches 4K Linux page size config to use pte_t * type instead of struct page * for pgtable_t. This simplifies the code a lot and helps in consolidating both 64K and 4K page allocator routines. The changes should not have any impact, because we already store physical address in the upper level page table tree and that implies we already do struct page * to physical address conversion. One change to note here is we move the pgtable_page_dtor() call for nohash to pte_fragment_free_mm(). The nohash related change is due to the related changes in pgtable_64.c. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index 37283e3..faad131 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -41,6 +41,15 @@ extern struct kmem_cache *pgtable_cache[]; pgtable_cache[(shift) - 1]; \ }) +#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO + +extern pte_t *pte_fragment_alloc(struct mm_struct *, unsigned long, int); +extern void pte_fragment_free(unsigned long *, int); +extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); +#ifdef CONFIG_SMP +extern void __tlb_remove_table(void *_table); +#endif + static inline pgd_t *pgd_alloc(struct mm_struct *mm) { return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL); @@ -72,29 +81,47 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) pud_set(pud, __pgtable_ptr_val(pmd)); } +static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, + unsigned long address) +{ + pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE); +} + +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), + GFP_KERNEL|__GFP_REPEAT); +} + +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) +{ + kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd); +} + +static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long address) +{ + return pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX); +} + static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { pmd_set(pmd, __pgtable_ptr_val(pte)); } -/* - * FIXME!! - * Between 4K and 64K pages, we differ in what is stored in pmd. ie. - * typedef pte_t *pgtable_t; -> 64K - * typedef struct page *pgtable_t; -> 4k - */ -#ifdef CONFIG_PPC_4K_PAGES + static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_page) { - pmd_set(pmd, __pgtable_ptr_val(page_address(pte_page))); + pmd_set(pmd, __pgtable_ptr_val(pte_page)); } static inline pgtable_t pmd_pgtable(pmd_t pmd) { - return pmd_page(pmd); + return (pgtable_t)pmd_page_vaddr(pmd); } +#ifdef CONFIG_PPC_4K_PAGES static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { @@ -115,83 +142,10 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, __free_page(page); return NULL; } - return page; -} - -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) -{ - free_page((unsigned long)pte); -} - -static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) -{ - pgtable_page_dtor(ptepage); - __free_page(ptepage); -} - -static inline void pgtable_free(void *table, unsigned index_size) -{ - if (!index_size) - free_page((unsigned long)table); - else { - BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE); - kmem_cache_free(PGT_CACHE(index_size), table); - } -} - -#ifdef CONFIG_SMP -static inline void pgtable_free_tlb(struct mmu_gather *tlb, - void *table, int shift) -{ - unsigned long pgf = (unsigned long)table; - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); - pgf |= shift; - tlb_remove_table(tlb, (void *)pgf); -} - -static inline void __tlb_remove_table(void *_table) -{ - void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); - unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; - - pgtable_free(table, shift); -} -#else /* !CONFIG_SMP */ -static inline void pgtable_free_tlb(struct mmu_gather *tlb, - void *table, int shift) -{ - pgtable_free(table, shift); -} -#endif /* CONFIG_SMP */ - -static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, - unsigned long address) -{ - tlb_flush_pgtable(tlb, address); - pgtable_page_dtor(table); - pgtable_free_tlb(tlb, page_address(table), 0); + return pte; } - #else /* if CONFIG_PPC_64K_PAGES */ -extern pte_t *pte_fragment_alloc(struct mm_struct *, unsigned long, int); -extern void pte_fragment_free(unsigned long *, int); -extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); -#ifdef CONFIG_SMP -extern void __tlb_remove_table(void *_table); -#endif - -static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, - pgtable_t pte_page) -{ - pmd_set(pmd, __pgtable_ptr_val(pte_page)); -} - -static inline pgtable_t pmd_pgtable(pmd_t pmd) -{ - return (pgtable_t)pmd_page_vaddr(pmd); -} - static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { @@ -199,10 +153,11 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, } static inline pgtable_t pte_alloc_one(struct mm_struct *mm, - unsigned long address) + unsigned long address) { return (pgtable_t)pte_fragment_alloc(mm, address, 0); } +#endif static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { @@ -220,30 +175,6 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, tlb_flush_pgtable(tlb, address); pgtable_free_tlb(tlb, table, 0); } -#endif /* CONFIG_PPC_4K_PAGES */ - -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), - GFP_KERNEL|__GFP_REPEAT); -} - -static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) -{ - kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd); -} - -static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, - unsigned long address) -{ - return pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX); -} - -static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, - unsigned long address) -{ - pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE); -} #define check_pgt_cache() do { } while (0) diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index be0cce7..0c12a3b 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -119,46 +119,14 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) __free_page(ptepage); } -static inline void pgtable_free(void *table, unsigned index_size) -{ - if (!index_size) - free_page((unsigned long)table); - else { - BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE); - kmem_cache_free(PGT_CACHE(index_size), table); - } -} - +extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); #ifdef CONFIG_SMP -static inline void pgtable_free_tlb(struct mmu_gather *tlb, - void *table, int shift) -{ - unsigned long pgf = (unsigned long)table; - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); - pgf |= shift; - tlb_remove_table(tlb, (void *)pgf); -} - -static inline void __tlb_remove_table(void *_table) -{ - void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); - unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; - - pgtable_free(table, shift); -} -#else /* !CONFIG_SMP */ -static inline void pgtable_free_tlb(struct mmu_gather *tlb, - void *table, int shift) -{ - pgtable_free(table, shift); -} -#endif /* CONFIG_SMP */ - +extern void __tlb_remove_table(void *_table); +#endif static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, unsigned long address) { tlb_flush_pgtable(tlb, address); - pgtable_page_dtor(table); pgtable_free_tlb(tlb, page_address(table), 0); } diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 158574d..51db3a3 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -316,12 +316,20 @@ void arch_free_page(struct page *page, int order); #endif struct vm_area_struct; - +#ifdef CONFIG_PPC_BOOK3S_64 +/* + * For BOOK3s 64 with 4k and 64K linux page size + * we want to use pointers, because the page table + * actually store pfn + */ +typedef pte_t *pgtable_t; +#else #if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC64) typedef pte_t *pgtable_t; #else typedef struct page *pgtable_t; #endif +#endif #include #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index a9a2238..326c3d4 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -367,6 +367,7 @@ pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel return __alloc_for_cache(mm, kernel); } +#endif /* CONFIG_PPC_64K_PAGES */ void pte_fragment_free(unsigned long *table, int kernel) { @@ -413,7 +414,6 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) } } #endif -#endif /* CONFIG_PPC_64K_PAGES */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -- cgit v0.10.2 From a2f41eb99208adf4732529c4df1fd12b39f4c333 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:19 +1000 Subject: powerpc/mm: Add radix pgalloc details Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index faad131..488279e 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -50,19 +50,45 @@ extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); extern void __tlb_remove_table(void *_table); #endif +static inline pgd_t *radix__pgd_alloc(struct mm_struct *mm) +{ +#ifdef CONFIG_PPC_64K_PAGES + return (pgd_t *)__get_free_page(PGALLOC_GFP); +#else + struct page *page; + page = alloc_pages(PGALLOC_GFP, 4); + if (!page) + return NULL; + return (pgd_t *) page_address(page); +#endif +} + +static inline void radix__pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ +#ifdef CONFIG_PPC_64K_PAGES + free_page((unsigned long)pgd); +#else + free_pages((unsigned long)pgd, 4); +#endif +} + static inline pgd_t *pgd_alloc(struct mm_struct *mm) { + if (radix_enabled()) + return radix__pgd_alloc(mm); return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { + if (radix_enabled()) + return radix__pgd_free(mm, pgd); kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd); } static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) { - pgd_set(pgd, __pgtable_ptr_val(pud)); + pgd_set(pgd, __pgtable_ptr_val(pud) | PGD_VAL_BITS); } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) @@ -78,7 +104,7 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { - pud_set(pud, __pgtable_ptr_val(pmd)); + pud_set(pud, __pgtable_ptr_val(pmd) | PUD_VAL_BITS); } static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, @@ -107,13 +133,13 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { - pmd_set(pmd, __pgtable_ptr_val(pte)); + pmd_set(pmd, __pgtable_ptr_val(pte) | PMD_VAL_BITS); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_page) { - pmd_set(pmd, __pgtable_ptr_val(pte_page)); + pmd_set(pmd, __pgtable_ptr_val(pte_page) | PMD_VAL_BITS); } static inline pgtable_t pmd_pgtable(pmd_t pmd) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 9bc2471..32a9756 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -170,11 +170,17 @@ extern unsigned long __pgd_table_size; #define PMD_TABLE_SIZE __pmd_table_size #define PUD_TABLE_SIZE __pud_table_size #define PGD_TABLE_SIZE __pgd_table_size + +extern unsigned long __pmd_val_bits; +extern unsigned long __pud_val_bits; +extern unsigned long __pgd_val_bits; +#define PMD_VAL_BITS __pmd_val_bits +#define PUD_VAL_BITS __pud_val_bits +#define PGD_VAL_BITS __pgd_val_bits /* * Pgtable size used by swapper, init in asm code - * We will switch this later to radix PGD */ -#define MAX_PGD_TABLE_SIZE (sizeof(pgd_t) << H_PGD_INDEX_SIZE) +#define MAX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE) #define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) #define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index faaadef..64165a7 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -882,6 +882,13 @@ void __init hash__early_init_mmu(void) __pmd_table_size = H_PMD_TABLE_SIZE; __pud_table_size = H_PUD_TABLE_SIZE; __pgd_table_size = H_PGD_TABLE_SIZE; + /* + * 4k use hugepd format, so for hash set then to + * zero + */ + __pmd_val_bits = 0; + __pud_val_bits = 0; + __pgd_val_bits = 0; /* Initialize the MMU Hash table and create the linear mapping * of memory. Has to be done before SLB initialization as this is * currently where the page size encoding is obtained. diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 04ec976..6182b6c 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -324,8 +324,11 @@ void __init radix__early_init_mmu(void) __pud_table_size = RADIX_PUD_TABLE_SIZE; __pgd_table_size = RADIX_PGD_TABLE_SIZE; - radix_init_page_sizes(); + __pmd_val_bits = RADIX_PMD_VAL_BITS; + __pud_val_bits = RADIX_PUD_VAL_BITS; + __pgd_val_bits = RADIX_PGD_VAL_BITS; + radix_init_page_sizes(); if (!firmware_has_feature(FW_FEATURE_LPAR)) radix_init_partition_table(); diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 326c3d4..b8c75e6 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -91,6 +91,12 @@ unsigned long __pud_table_size; EXPORT_SYMBOL(__pud_table_size); unsigned long __pgd_table_size; EXPORT_SYMBOL(__pgd_table_size); +unsigned long __pmd_val_bits; +EXPORT_SYMBOL(__pmd_val_bits); +unsigned long __pud_val_bits; +EXPORT_SYMBOL(__pud_val_bits); +unsigned long __pgd_val_bits; +EXPORT_SYMBOL(__pgd_val_bits); #endif unsigned long ioremap_bot = IOREMAP_BASE; -- cgit v0.10.2 From 4dfb88ca9b66690d21030ccacc1cca73db90655e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:20 +1000 Subject: powerpc/mm: Update pte filter for radix Signed-off-by: Aneesh Kumar K.V Acked-by: Balbir Singh Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index db277b6..88a3075 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -82,6 +82,9 @@ static struct page *maybe_pte_to_page(pte_t pte) static pte_t set_pte_filter(pte_t pte) { + if (radix_enabled()) + return pte; + pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || cpu_has_feature(CPU_FTR_NOEXECUTE))) { -- cgit v0.10.2 From d6a9996e84ac4beb7713e9485f4563e100a9b03e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:21 +1000 Subject: powerpc/mm: vmalloc abstraction in preparation for radix The vmalloc range differs between hash and radix config. Hence make VMALLOC_START and related constants a variable which will be runtime initialized depending on whether hash or radix mode is active. Signed-off-by: Aneesh Kumar K.V [mpe: Fix missing init of ioremap_bot in pgtable_64.c for ppc64e] Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index cd3e915..f61cad3 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -45,17 +45,17 @@ /* * Define the address range of the kernel non-linear virtual area */ -#define KERN_VIRT_START ASM_CONST(0xD000000000000000) -#define KERN_VIRT_SIZE ASM_CONST(0x0000100000000000) +#define H_KERN_VIRT_START ASM_CONST(0xD000000000000000) +#define H_KERN_VIRT_SIZE ASM_CONST(0x0000100000000000) /* * The vmalloc space starts at the beginning of that region, and * occupies half of it on hash CPUs and a quarter of it on Book3E * (we keep a quarter for the virtual memmap) */ -#define VMALLOC_START KERN_VIRT_START -#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) -#define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE) +#define H_VMALLOC_START H_KERN_VIRT_START +#define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE >> 1) +#define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) /* * Region IDs @@ -64,7 +64,7 @@ #define REGION_MASK (0xfUL << REGION_SHIFT) #define REGION_ID(ea) (((unsigned long)(ea)) >> REGION_SHIFT) -#define VMALLOC_REGION_ID (REGION_ID(VMALLOC_START)) +#define VMALLOC_REGION_ID (REGION_ID(H_VMALLOC_START)) #define KERNEL_REGION_ID (REGION_ID(PAGE_OFFSET)) #define VMEMMAP_REGION_ID (0xfUL) /* Server only */ #define USER_REGION_ID (0UL) @@ -73,7 +73,7 @@ * Defines the address of the vmemap area, in its own region on * hash table CPUs. */ -#define VMEMMAP_BASE (VMEMMAP_REGION_ID << REGION_SHIFT) +#define H_VMEMMAP_BASE (VMEMMAP_REGION_ID << REGION_SHIFT) #ifdef CONFIG_PPC_MM_SLICES #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 32a9756..f5628f9 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -208,6 +208,18 @@ extern unsigned long __pgd_val_bits; #define PUD_MASKED_BITS 0xc0000000000000ffUL /* Bits to mask out from a PGD to get to the PUD page */ #define PGD_MASKED_BITS 0xc0000000000000ffUL + +extern unsigned long __vmalloc_start; +extern unsigned long __vmalloc_end; +#define VMALLOC_START __vmalloc_start +#define VMALLOC_END __vmalloc_end + +extern unsigned long __kernel_virt_start; +extern unsigned long __kernel_virt_size; +#define KERN_VIRT_START __kernel_virt_start +#define KERN_VIRT_SIZE __kernel_virt_size +extern struct page *vmemmap; +extern unsigned long ioremap_bot; #endif /* __ASSEMBLY__ */ #include @@ -220,7 +232,6 @@ extern unsigned long __pgd_val_bits; #endif #include - /* * The second half of the kernel virtual space is used for IO mappings, * it's itself carved into the PIO region (ISA and PHB IO space) and @@ -239,8 +250,6 @@ extern unsigned long __pgd_val_bits; #define IOREMAP_BASE (PHB_IO_END) #define IOREMAP_END (KERN_VIRT_START + KERN_VIRT_SIZE) -#define vmemmap ((struct page *)VMEMMAP_BASE) - /* Advertise special mapping type for AGP */ #define HAVE_PAGE_AGP diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 63eb629..f470902 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -31,6 +31,74 @@ RADIX_PUD_INDEX_SIZE + RADIX_PGD_INDEX_SIZE + PAGE_SHIFT) #define RADIX_PGTABLE_RANGE (ASM_CONST(1) << RADIX_PGTABLE_EADDR_SIZE) +/* + * We support 52 bit address space, Use top bit for kernel + * virtual mapping. Also make sure kernel fit in the top + * quadrant. + * + * +------------------+ + * +------------------+ Kernel virtual map (0xc008000000000000) + * | | + * | | + * | | + * 0b11......+------------------+ Kernel linear map (0xc....) + * | | + * | 2 quadrant | + * | | + * 0b10......+------------------+ + * | | + * | 1 quadrant | + * | | + * 0b01......+------------------+ + * | | + * | 0 quadrant | + * | | + * 0b00......+------------------+ + * + * + * 3rd quadrant expanded: + * +------------------------------+ + * | | + * | | + * | | + * +------------------------------+ Kernel IO map end (0xc010000000000000) + * | | + * | | + * | 1/2 of virtual map | + * | | + * | | + * +------------------------------+ Kernel IO map start + * | | + * | 1/4 of virtual map | + * | | + * +------------------------------+ Kernel vmemap start + * | | + * | 1/4 of virtual map | + * | | + * +------------------------------+ Kernel virt start (0xc008000000000000) + * | | + * | | + * | | + * +------------------------------+ Kernel linear (0xc.....) + */ + +#define RADIX_KERN_VIRT_START ASM_CONST(0xc008000000000000) +#define RADIX_KERN_VIRT_SIZE ASM_CONST(0x0008000000000000) + +/* + * The vmalloc space starts at the beginning of that region, and + * occupies a quarter of it on radix config. + * (we keep a quarter for the virtual memmap) + */ +#define RADIX_VMALLOC_START RADIX_KERN_VIRT_START +#define RADIX_VMALLOC_SIZE (RADIX_KERN_VIRT_SIZE >> 2) +#define RADIX_VMALLOC_END (RADIX_VMALLOC_START + RADIX_VMALLOC_SIZE) +/* + * Defines the address of the vmemap area, in its own region on + * hash table CPUs. + */ +#define RADIX_VMEMMAP_BASE (RADIX_VMALLOC_END) + #ifndef __ASSEMBLY__ #define RADIX_PTE_TABLE_SIZE (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE) #define RADIX_PMD_TABLE_SIZE (sizeof(pmd_t) << RADIX_PMD_INDEX_SIZE) diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 41503d7..3759df5 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -38,7 +38,7 @@ * ISA drivers use hard coded offsets. If no ISA bus exists nothing * is mapped on the first 64K of IO space */ -unsigned long pci_io_base = ISA_IO_BASE; +unsigned long pci_io_base; EXPORT_SYMBOL(pci_io_base); static int __init pcibios_init(void) @@ -47,6 +47,7 @@ static int __init pcibios_init(void) printk(KERN_INFO "PCI: Probing PCI hardware\n"); + pci_io_base = ISA_IO_BASE; /* For now, override phys_mem_access_prot. If we need it,g * later, we may move that initialization to each ppc_md */ diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 64165a7..68aee43 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -889,6 +889,14 @@ void __init hash__early_init_mmu(void) __pmd_val_bits = 0; __pud_val_bits = 0; __pgd_val_bits = 0; + + __kernel_virt_start = H_KERN_VIRT_START; + __kernel_virt_size = H_KERN_VIRT_SIZE; + __vmalloc_start = H_VMALLOC_START; + __vmalloc_end = H_VMALLOC_END; + vmemmap = (struct page *)H_VMEMMAP_BASE; + ioremap_bot = IOREMAP_BASE; + /* Initialize the MMU Hash table and create the linear mapping * of memory. Has to be done before SLB initialization as this is * currently where the page size encoding is obtained. diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 6182b6c..13afacd 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -328,6 +328,13 @@ void __init radix__early_init_mmu(void) __pud_val_bits = RADIX_PUD_VAL_BITS; __pgd_val_bits = RADIX_PGD_VAL_BITS; + __kernel_virt_start = RADIX_KERN_VIRT_START; + __kernel_virt_size = RADIX_KERN_VIRT_SIZE; + __vmalloc_start = RADIX_VMALLOC_START; + __vmalloc_end = RADIX_VMALLOC_END; + vmemmap = (struct page *)RADIX_VMEMMAP_BASE; + ioremap_bot = IOREMAP_BASE; + radix_init_page_sizes(); if (!firmware_has_feature(FW_FEATURE_LPAR)) radix_init_partition_table(); diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index b8c75e6..216e2bd 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -97,9 +97,20 @@ unsigned long __pud_val_bits; EXPORT_SYMBOL(__pud_val_bits); unsigned long __pgd_val_bits; EXPORT_SYMBOL(__pgd_val_bits); - -#endif +unsigned long __kernel_virt_start; +EXPORT_SYMBOL(__kernel_virt_start); +unsigned long __kernel_virt_size; +EXPORT_SYMBOL(__kernel_virt_size); +unsigned long __vmalloc_start; +EXPORT_SYMBOL(__vmalloc_start); +unsigned long __vmalloc_end; +EXPORT_SYMBOL(__vmalloc_end); +struct page *vmemmap; +EXPORT_SYMBOL(vmemmap); +unsigned long ioremap_bot; +#else /* !CONFIG_PPC_BOOK3S_64 */ unsigned long ioremap_bot = IOREMAP_BASE; +#endif /** * __ioremap_at - Low level function to establish the page tables diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index 15b8f71..dfdb90c 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -91,7 +91,7 @@ slb_miss_kernel_load_vmemmap: * can be demoted from 64K -> 4K dynamically on some machines */ clrldi r11,r10,48 - cmpldi r11,(VMALLOC_SIZE >> 28) - 1 + cmpldi r11,(H_VMALLOC_SIZE >> 28) - 1 bgt 5f lhz r11,PACAVMALLOCSLLP(r13) b 6f -- cgit v0.10.2 From a3dece6d69b0ad21b64104dff508c67a1a1f14dd Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:22 +1000 Subject: powerpc/radix: Update MMU cache With radix there is no MMU cache. Hence we don't need to do anything in update_mmu_cache(). Signed-off-by: Aneesh Kumar K.V Acked-by: Balbir Singh Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index ac79dbd..8967788 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -499,6 +499,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, * called with either mm->page_table_lock held or ptl lock held */ unsigned long access = 0, trap; + if (radix_enabled()) + return; /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ if (!pte_young(*ptep) || address >= TASK_SIZE) -- cgit v0.10.2 From 5ed7ecd08a0807d6d616c3d958402f9c723bb048 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:23 +1000 Subject: powerpc/mm: pte_frag abstraction In this patch we make the number of pte fragments per level 4 page table page a variable. Radix level 4 table size is 256 bytes and hence we can have 256 fragments per level 4 page. We don't update the fragment count in this patch. We need to do performance measurements to find the right value for fragment count. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index dcb9d6e..99f360e 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -28,6 +28,8 @@ #define H_PAGE_4K_PFN 0x0 #define H_PAGE_THP_HUGE 0x0 #define H_PAGE_COMBO 0x0 +#define H_PTE_FRAG_NR 0 +#define H_PTE_FRAG_SIZE_SHIFT 0 /* * On all 4K setups, remap_4k_pfn() equates to remap_pfn_range() */ diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index c00e39f..80d2abe 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -29,12 +29,12 @@ /* * we support 16 fragments per PTE page of 64K size. */ -#define PTE_FRAG_NR 16 +#define H_PTE_FRAG_NR 16 /* * We use a 2K PTE page fragment and another 2K for storing * real_pte_t hash index */ -#define PTE_FRAG_SIZE_SHIFT 12 +#define H_PTE_FRAG_SIZE_SHIFT 12 #define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT) #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index f5628f9..9175a68 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -177,6 +177,12 @@ extern unsigned long __pgd_val_bits; #define PMD_VAL_BITS __pmd_val_bits #define PUD_VAL_BITS __pud_val_bits #define PGD_VAL_BITS __pgd_val_bits + +extern unsigned long __pte_frag_nr; +#define PTE_FRAG_NR __pte_frag_nr +extern unsigned long __pte_frag_size_shift; +#define PTE_FRAG_SIZE_SHIFT __pte_frag_size_shift +#define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT) /* * Pgtable size used by swapper, init in asm code */ diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 68aee43..262082e 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -873,6 +873,9 @@ void __init hash__early_init_mmu(void) /* * initialize page table size */ + __pte_frag_nr = H_PTE_FRAG_NR; + __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT; + __pte_index_size = H_PTE_INDEX_SIZE; __pmd_index_size = H_PMD_INDEX_SIZE; __pud_index_size = H_PUD_INDEX_SIZE; diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 13afacd..a5a5253 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -334,6 +334,11 @@ void __init radix__early_init_mmu(void) __vmalloc_end = RADIX_VMALLOC_END; vmemmap = (struct page *)RADIX_VMEMMAP_BASE; ioremap_bot = IOREMAP_BASE; + /* + * For now radix also use the same frag size + */ + __pte_frag_nr = H_PTE_FRAG_NR; + __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT; radix_init_page_sizes(); if (!firmware_has_feature(FW_FEATURE_LPAR)) diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 216e2bd..95df08e 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -107,6 +107,10 @@ unsigned long __vmalloc_end; EXPORT_SYMBOL(__vmalloc_end); struct page *vmemmap; EXPORT_SYMBOL(vmemmap); +unsigned long __pte_frag_nr; +EXPORT_SYMBOL(__pte_frag_nr); +unsigned long __pte_frag_size_shift; +EXPORT_SYMBOL(__pte_frag_size_shift); unsigned long ioremap_bot; #else /* !CONFIG_PPC_BOOK3S_64 */ unsigned long ioremap_bot = IOREMAP_BASE; -- cgit v0.10.2 From 2f5f0dfd1e5f9840560e85ad26cb4a6f06c19e07 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:24 +1000 Subject: powerpc/mm: Fix vma_mmu_pagesize() for radix Radix doesn't use the slice framework to find the page size. Hence use vma to find the page size. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 6d91096..7d677ef 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -719,14 +719,14 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { #ifdef CONFIG_PPC_MM_SLICES unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); - - return 1UL << mmu_psize_to_shift(psize); -#else + /* With radix we don't use slice, so derive it from vma*/ + if (!radix_enabled()) + return 1UL << mmu_psize_to_shift(psize); +#endif if (!is_vm_hugetlb_page(vma)) return PAGE_SIZE; return huge_page_size(hstate_vma(vma)); -#endif } static inline bool is_power_of_4(unsigned long x) -- cgit v0.10.2 From 484837601d4dd048d1f8d2cedd39934bb3ad6d59 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:25 +1000 Subject: powerpc/mm: Add radix support for hugetlb Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h b/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h new file mode 100644 index 0000000..60f4764 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h @@ -0,0 +1,14 @@ +#ifndef _ASM_POWERPC_BOOK3S_64_HUGETLB_RADIX_H +#define _ASM_POWERPC_BOOK3S_64_HUGETLB_RADIX_H +/* + * For radix we want generic code to handle hugetlb. But then if we want + * both hash and radix to be enabled together we need to workaround the + * limitations. + */ +void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +extern unsigned long +radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); +#endif diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 42814f0..e2d9f49 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -8,6 +8,8 @@ extern struct kmem_cache *hugepte_cache; #ifdef CONFIG_PPC_BOOK3S_64 + +#include /* * This should work for other subarchs too. But right now we use the * new format only for 64bit book3s @@ -31,7 +33,19 @@ static inline unsigned int hugepd_shift(hugepd_t hpd) { return mmu_psize_to_shift(hugepd_mmu_psize(hpd)); } +static inline void flush_hugetlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) +{ + if (radix_enabled()) + return radix__flush_hugetlb_page(vma, vmaddr); +} +static inline void __local_flush_hugetlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) +{ + if (radix_enabled()) + return radix__local_flush_hugetlb_page(vma, vmaddr); +} #else static inline pte_t *hugepd_page(hugepd_t hpd) diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 48aa11a..47511dd 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-y += hugetlbpage.o ifeq ($(CONFIG_HUGETLB_PAGE),y) obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o +obj-$(CONFIG_PPC_RADIX_MMU) += hugetlbpage-radix.o obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o endif obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c new file mode 100644 index 0000000..1e11559 --- /dev/null +++ b/arch/powerpc/mm/hugetlbpage-radix.c @@ -0,0 +1,87 @@ +#include +#include +#include +#include +#include +#include +#include + +void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + unsigned long ap, shift; + struct hstate *hstate = hstate_file(vma->vm_file); + + shift = huge_page_shift(hstate); + if (shift == mmu_psize_defs[MMU_PAGE_2M].shift) + ap = mmu_get_ap(MMU_PAGE_2M); + else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift) + ap = mmu_get_ap(MMU_PAGE_1G); + else { + WARN(1, "Wrong huge page shift\n"); + return ; + } + radix___flush_tlb_page(vma->vm_mm, vmaddr, ap, 0); +} + +void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + unsigned long ap, shift; + struct hstate *hstate = hstate_file(vma->vm_file); + + shift = huge_page_shift(hstate); + if (shift == mmu_psize_defs[MMU_PAGE_2M].shift) + ap = mmu_get_ap(MMU_PAGE_2M); + else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift) + ap = mmu_get_ap(MMU_PAGE_1G); + else { + WARN(1, "Wrong huge page shift\n"); + return ; + } + radix___local_flush_tlb_page(vma->vm_mm, vmaddr, ap, 0); +} + +/* + * A vairant of hugetlb_get_unmapped_area doing topdown search + * FIXME!! should we do as x86 does or non hugetlb area does ? + * ie, use topdown or not based on mmap_is_legacy check ? + */ +unsigned long +radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + + if (len & ~huge_page_mask(h)) + return -EINVAL; + if (len > TASK_SIZE) + return -ENOMEM; + + if (flags & MAP_FIXED) { + if (prepare_hugepage_range(file, addr, len)) + return -EINVAL; + return addr; + } + + if (addr) { + addr = ALIGN(addr, huge_page_size(h)); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + /* + * We are always doing an topdown search here. Slice code + * does that too. + */ + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = current->mm->mmap_base; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); +} diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 7d677ef..790c5de 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -711,6 +711,9 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct hstate *hstate = hstate_file(file); int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); + if (radix_enabled()) + return radix__hugetlb_get_unmapped_area(file, addr, len, + pgoff, flags); return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1); } #endif @@ -823,7 +826,7 @@ static int __init hugetlbpage_init(void) { int psize; - if (!mmu_has_feature(MMU_FTR_16M_PAGE)) + if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE)) return -ENODEV; for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { @@ -863,6 +866,9 @@ static int __init hugetlbpage_init(void) HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; else if (mmu_psize_defs[MMU_PAGE_1M].shift) HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; + else if (mmu_psize_defs[MMU_PAGE_2M].shift) + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift; + return 0; } diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index ecfa00f..0fdaf93 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -141,6 +141,11 @@ void radix___local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { +#ifdef CONFIG_HUGETLB_PAGE + /* need the return fix for nohash.c */ + if (vma && is_vm_hugetlb_page(vma)) + return __local_flush_hugetlb_page(vma, vmaddr); +#endif radix___local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, mmu_get_ap(mmu_virtual_psize), 0); } @@ -202,6 +207,10 @@ bail: void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { +#ifdef CONFIG_HUGETLB_PAGE + if (vma && is_vm_hugetlb_page(vma)) + return flush_hugetlb_page(vma, vmaddr); +#endif radix___flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, mmu_get_ap(mmu_virtual_psize), 0); } -- cgit v0.10.2 From 43a5c684270ee9b5b13c91ec048831dd5b7e0cdc Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:26 +1000 Subject: powerpc/mm/radix: Make sure swapper pgdir is properly aligned With 4K page size radix config our level 1 page table size is 64K and it should be naturally aligned. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 4286775..2d14774 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -973,13 +973,16 @@ start_here_common: * This stuff goes at the beginning of the bss, which is page-aligned. */ .section ".bss" +/* + * pgd dir should be aligned to PGD_TABLE_SIZE which is 64K. + * We will need to find a better way to fix this + */ + .align 16 - .align PAGE_SHIFT + .globl swapper_pg_dir +swapper_pg_dir: + .space PGD_TABLE_SIZE .globl empty_zero_page empty_zero_page: .space PAGE_SIZE - - .globl swapper_pg_dir -swapper_pg_dir: - .space PGD_TABLE_SIZE -- cgit v0.10.2 From c0a6c719d2a153abe5d8c3b42f0379a5bd752517 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:27 +1000 Subject: powerpc/mm/radix: Add hugetlb support 4K page size We have hugepage at the pmd level with 4K radix config. Hence we don't need to use hugepd format with radix. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 99f360e..6a10b0d 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -37,26 +37,7 @@ remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot)) #ifdef CONFIG_HUGETLB_PAGE -/* - * For 4k page size, we support explicit hugepage via hugepd - */ -static inline int pmd_huge(pmd_t pmd) -{ - return 0; -} - -static inline int pud_huge(pud_t pud) -{ - return 0; -} - -static inline int pgd_huge(pgd_t pgd) -{ - return 0; -} -#define pgd_huge pgd_huge - -static inline int hugepd_ok(hugepd_t hpd) +static inline int hash__hugepd_ok(hugepd_t hpd) { /* * if it is not a pte and have hugepd shift mask @@ -67,7 +48,6 @@ static inline int hugepd_ok(hugepd_t hpd) return true; return false; } -#define is_hugepd(hpd) (hugepd_ok(hpd)) #endif #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h index 423735f..71e9abc 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h @@ -3,5 +3,51 @@ /* * hash 4k can't share hugetlb and also doesn't support THP */ +#ifndef __ASSEMBLY__ +#ifdef CONFIG_HUGETLB_PAGE +static inline int pmd_huge(pmd_t pmd) +{ + /* + * leaf pte for huge page + */ + if (radix_enabled()) + return !!(pmd_val(pmd) & _PAGE_PTE); + return 0; +} + +static inline int pud_huge(pud_t pud) +{ + /* + * leaf pte for huge page + */ + if (radix_enabled()) + return !!(pud_val(pud) & _PAGE_PTE); + return 0; +} + +static inline int pgd_huge(pgd_t pgd) +{ + /* + * leaf pte for huge page + */ + if (radix_enabled()) + return !!(pgd_val(pgd) & _PAGE_PTE); + return 0; +} +#define pgd_huge pgd_huge +/* + * With radix , we have hugepage ptes in the pud and pmd entries. We don't + * need to setup hugepage directory for them. Our pte and page directory format + * enable us to have this enabled. + */ +static inline int hugepd_ok(hugepd_t hpd) +{ + if (radix_enabled()) + return 0; + return hash__hugepd_ok(hpd); +} +#define is_hugepd(hpd) (hugepd_ok(hpd)) +#endif /* CONFIG_HUGETLB_PAGE */ +#endif /* __ASSEMBLY__ */ #endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H */ -- cgit v0.10.2 From 6a1ea36260f69f1aea85bbf8afcd6a8c193485b9 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:28 +1000 Subject: powerpc/mm: THP is only available on hash64 as of now Only code movement in this patch. No functionality change. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 9175a68..dfdd1f4 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -777,18 +777,6 @@ static inline void vmemmap_remove_mapping(unsigned long start, #endif struct page *realmode_pfn_to_page(unsigned long pfn); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); -extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); -extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); -extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd); -extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd); -extern int has_transparent_hugepage(void); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - - static inline pte_t pmd_pte(pmd_t pmd) { return __pte(pmd_val(pmd)); @@ -803,7 +791,6 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd) { return (pte_t *)pmd; } - #define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd)) #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_young(pmd) pte_young(pmd_pte(pmd)) @@ -830,6 +817,16 @@ static inline int pmd_protnone(pmd_t pmd) #define __HAVE_ARCH_PMD_WRITE #define pmd_write(pmd) pte_write(pmd_pte(pmd)) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); +extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); +extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); +extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd); +extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd); +extern int has_transparent_hugepage(void); + static inline pmd_t pmd_mkhuge(pmd_t pmd) { return __pmd(pmd_val(pmd) | (_PAGE_PTE | H_PAGE_THP_HUGE)); @@ -878,5 +875,6 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, */ return true; } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index 64975eb..9699a1c 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -15,6 +15,9 @@ #include "mmu_decl.h" +#define CREATE_TRACE_POINTS +#include + #ifdef CONFIG_SPARSEMEM_VMEMMAP /* * On hash-based CPUs, the vmemmap is bolted in the hash table. @@ -93,3 +96,358 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flag smp_wmb(); return 0; } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +/* + * This is called when relaxing access to a hugepage. It's also called in the page + * fault path when we don't hit any of the major fault cases, ie, a minor + * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have + * handled those two for us, we additionally deal with missing execute + * permission here on some processors + */ +int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp, pmd_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp)); + assert_spin_locked(&vma->vm_mm->page_table_lock); +#endif + changed = !pmd_same(*(pmdp), entry); + if (changed) { + __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); + /* + * Since we are not supporting SW TLB systems, we don't + * have any thing similar to flush_tlb_page_nohash() + */ + } + return changed; +} + +unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set) +{ + __be64 old_be, tmp; + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp)); + assert_spin_locked(&mm->page_table_lock); +#endif + + __asm__ __volatile__( + "1: ldarx %0,0,%3\n\ + and. %1,%0,%6\n\ + bne- 1b \n\ + andc %1,%0,%4 \n\ + or %1,%1,%7\n\ + stdcx. %1,0,%3 \n\ + bne- 1b" + : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp) + : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp), + "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) + : "cc" ); + + old = be64_to_cpu(old_be); + + trace_hugepage_update(addr, old, clr, set); + if (old & H_PAGE_HASHPTE) + hpte_do_hugepage_flush(mm, addr, pmdp, old); + return old; +} + +pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(pmd_trans_huge(*pmdp)); + + pmd = *pmdp; + pmd_clear(pmdp); + /* + * Wait for all pending hash_page to finish. This is needed + * in case of subpage collapse. When we collapse normal pages + * to hugepage, we first clear the pmd, then invalidate all + * the PTE entries. The assumption here is that any low level + * page fault will see a none pmd and take the slow path that + * will wait on mmap_sem. But we could very well be in a + * hash_page with local ptep pointer value. Such a hash page + * can result in adding new HPTE entries for normal subpages. + * That means we could be modifying the page content as we + * copy them to a huge page. So wait for parallel hash_page + * to finish before invalidating HPTE entries. We can do this + * by sending an IPI to all the cpus and executing a dummy + * function there. + */ + kick_all_cpus_sync(); + /* + * Now invalidate the hpte entries in the range + * covered by pmd. This make sure we take a + * fault and will find the pmd as none, which will + * result in a major fault which takes mmap_sem and + * hence wait for collapse to complete. Without this + * the __collapse_huge_page_copy can result in copying + * the old content. + */ + flush_tlb_pmd_range(vma->vm_mm, &pmd, address); + return pmd; +} + +/* + * We currently remove entries from the hashtable regardless of whether + * the entry was young or dirty. + * + * We should be more intelligent about this but for the moment we override + * these functions and force a tlb flush unconditionally + */ +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); +} + +/* + * We want to put the pgtable in pmd and use pgtable for tracking + * the base page size hptes + */ +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + pgtable_t *pgtable_slot; + assert_spin_locked(&mm->page_table_lock); + /* + * we store the pgtable in the second half of PMD + */ + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + *pgtable_slot = pgtable; + /* + * expose the deposited pgtable to other cpus. + * before we set the hugepage PTE at pmd level + * hash fault code looks at the deposted pgtable + * to store hash index values. + */ + smp_wmb(); +} + +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + pgtable_t pgtable; + pgtable_t *pgtable_slot; + + assert_spin_locked(&mm->page_table_lock); + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + pgtable = *pgtable_slot; + /* + * Once we withdraw, mark the entry NULL. + */ + *pgtable_slot = NULL; + /* + * We store HPTE information in the deposited PTE fragment. + * zero out the content on withdraw. + */ + memset(pgtable, 0, PTE_FRAG_SIZE); + return pgtable; +} + +void pmdp_huge_split_prepare(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(REGION_ID(address) != USER_REGION_ID); + + /* + * We can't mark the pmd none here, because that will cause a race + * against exit_mmap. We need to continue mark pmd TRANS HUGE, while + * we spilt, but at the same time we wan't rest of the ppc64 code + * not to insert hash pte on this, because we will be modifying + * the deposited pgtable in the caller of this function. Hence + * clear the _PAGE_USER so that we move the fault handling to + * higher level function and that will serialize against ptl. + * We need to flush existing hash pte entries here even though, + * the translation is still valid, because we will withdraw + * pgtable_t after this. + */ + pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED); +} + + +/* + * set a new huge pmd. We should not be called for updating + * an existing pmd entry. That should go via pmd_hugepage_update. + */ +void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ +#ifdef CONFIG_DEBUG_VM + WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); + assert_spin_locked(&mm->page_table_lock); + WARN_ON(!pmd_trans_huge(pmd)); +#endif + trace_hugepage_set_pmd(addr, pmd_val(pmd)); + return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); +} + +/* + * We use this to invalidate a pmdp entry before switching from a + * hugepte to regular pmd entry. + */ +void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); + + /* + * This ensures that generic code that rely on IRQ disabling + * to prevent a parallel THP split work as expected. + */ + kick_all_cpus_sync(); +} + +/* + * A linux hugepage PMD was changed and the corresponding hash table entries + * neesd to be flushed. + */ +void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long old_pmd) +{ + int ssize; + unsigned int psize; + unsigned long vsid; + unsigned long flags = 0; + const struct cpumask *tmp; + + /* get the base page size,vsid and segment size */ +#ifdef CONFIG_DEBUG_VM + psize = get_slice_psize(mm, addr); + BUG_ON(psize == MMU_PAGE_16M); +#endif + if (old_pmd & H_PAGE_COMBO) + psize = MMU_PAGE_4K; + else + psize = MMU_PAGE_64K; + + if (!is_kernel_addr(addr)) { + ssize = user_segment_size(addr); + vsid = get_vsid(mm->context.id, addr, ssize); + WARN_ON(vsid == 0); + } else { + vsid = get_kernel_vsid(addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } + + tmp = cpumask_of(smp_processor_id()); + if (cpumask_equal(mm_cpumask(mm), tmp)) + flags |= HPTE_LOCAL_UPDATE; + + return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); +} + +static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) +{ + return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); +} + +pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) +{ + unsigned long pmdv; + + pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; + return pmd_set_protbits(__pmd(pmdv), pgprot); +} + +pmd_t mk_pmd(struct page *page, pgprot_t pgprot) +{ + return pfn_pmd(page_to_pfn(page), pgprot); +} + +pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + unsigned long pmdv; + + pmdv = pmd_val(pmd); + pmdv &= _HPAGE_CHG_MASK; + return pmd_set_protbits(__pmd(pmdv), newprot); +} + +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a HUGE PMD entry in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux HUGE PMD entry. + */ +void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd) +{ + return; +} + +pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old_pmd; + pgtable_t pgtable; + unsigned long old; + pgtable_t *pgtable_slot; + + old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); + old_pmd = __pmd(old); + /* + * We have pmd == none and we are holding page_table_lock. + * So we can safely go and clear the pgtable hash + * index info. + */ + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + pgtable = *pgtable_slot; + /* + * Let's zero out old valid and hash index details + * hash fault look at them. + */ + memset(pgtable, 0, PTE_FRAG_SIZE); + /* + * Serialize against find_linux_pte_or_hugepte which does lock-less + * lookup in page tables with local interrupts disabled. For huge pages + * it casts pmd_t to pte_t. Since format of pte_t is different from + * pmd_t we want to prevent transit from pmd pointing to page table + * to pmd pointing to huge page (and back) while interrupts are disabled. + * We clear pmd to possibly replace it with page table pointer in + * different code paths. So make sure we wait for the parallel + * find_linux_pte_or_hugepage to finish. + */ + kick_all_cpus_sync(); + return old_pmd; +} + +int has_transparent_hugepage(void) +{ + + if (!mmu_has_feature(MMU_FTR_16M_PAGE)) + return 0; + /* + * We support THP only if PMD_SIZE is 16MB. + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) + return 0; + /* + * We need to make sure that we support 16MB hugepage in a segement + * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE + * of 64K. + */ + /* + * If we have 64K HPTE, we will be using that by default + */ + if (mmu_psize_defs[MMU_PAGE_64K].shift && + (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) + return 0; + /* + * Ok we only have 4K HPTE + */ + if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) + return 0; + + return 1; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 95df08e..e009e06 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -55,9 +55,6 @@ #include "mmu_decl.h" -#define CREATE_TRACE_POINTS -#include - #ifdef CONFIG_PPC_STD_MMU_64 #if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT)) #error TASK_SIZE_USER64 exceeds user VSID range @@ -435,359 +432,3 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) } } #endif - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - -/* - * This is called when relaxing access to a hugepage. It's also called in the page - * fault path when we don't hit any of the major fault cases, ie, a minor - * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have - * handled those two for us, we additionally deal with missing execute - * permission here on some processors - */ -int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp, pmd_t entry, int dirty) -{ - int changed; -#ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_trans_huge(*pmdp)); - assert_spin_locked(&vma->vm_mm->page_table_lock); -#endif - changed = !pmd_same(*(pmdp), entry); - if (changed) { - __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); - /* - * Since we are not supporting SW TLB systems, we don't - * have any thing similar to flush_tlb_page_nohash() - */ - } - return changed; -} - -unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long clr, - unsigned long set) -{ - - __be64 old_be, tmp; - unsigned long old; - -#ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_trans_huge(*pmdp)); - assert_spin_locked(&mm->page_table_lock); -#endif - - __asm__ __volatile__( - "1: ldarx %0,0,%3\n\ - and. %1,%0,%6\n\ - bne- 1b \n\ - andc %1,%0,%4 \n\ - or %1,%1,%7\n\ - stdcx. %1,0,%3 \n\ - bne- 1b" - : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp) - : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp), - "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) - : "cc" ); - - old = be64_to_cpu(old_be); - - trace_hugepage_update(addr, old, clr, set); - if (old & H_PAGE_HASHPTE) - hpte_do_hugepage_flush(mm, addr, pmdp, old); - return old; -} - -pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - pmd_t pmd; - - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(pmd_trans_huge(*pmdp)); - - pmd = *pmdp; - pmd_clear(pmdp); - /* - * Wait for all pending hash_page to finish. This is needed - * in case of subpage collapse. When we collapse normal pages - * to hugepage, we first clear the pmd, then invalidate all - * the PTE entries. The assumption here is that any low level - * page fault will see a none pmd and take the slow path that - * will wait on mmap_sem. But we could very well be in a - * hash_page with local ptep pointer value. Such a hash page - * can result in adding new HPTE entries for normal subpages. - * That means we could be modifying the page content as we - * copy them to a huge page. So wait for parallel hash_page - * to finish before invalidating HPTE entries. We can do this - * by sending an IPI to all the cpus and executing a dummy - * function there. - */ - kick_all_cpus_sync(); - /* - * Now invalidate the hpte entries in the range - * covered by pmd. This make sure we take a - * fault and will find the pmd as none, which will - * result in a major fault which takes mmap_sem and - * hence wait for collapse to complete. Without this - * the __collapse_huge_page_copy can result in copying - * the old content. - */ - flush_tlb_pmd_range(vma->vm_mm, &pmd, address); - return pmd; -} - -/* - * We currently remove entries from the hashtable regardless of whether - * the entry was young or dirty. - * - * We should be more intelligent about this but for the moment we override - * these functions and force a tlb flush unconditionally - */ -int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); -} - -/* - * We want to put the pgtable in pmd and use pgtable for tracking - * the base page size hptes - */ -void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable) -{ - pgtable_t *pgtable_slot; - assert_spin_locked(&mm->page_table_lock); - /* - * we store the pgtable in the second half of PMD - */ - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - *pgtable_slot = pgtable; - /* - * expose the deposited pgtable to other cpus. - * before we set the hugepage PTE at pmd level - * hash fault code looks at the deposted pgtable - * to store hash index values. - */ - smp_wmb(); -} - -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) -{ - pgtable_t pgtable; - pgtable_t *pgtable_slot; - - assert_spin_locked(&mm->page_table_lock); - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - pgtable = *pgtable_slot; - /* - * Once we withdraw, mark the entry NULL. - */ - *pgtable_slot = NULL; - /* - * We store HPTE information in the deposited PTE fragment. - * zero out the content on withdraw. - */ - memset(pgtable, 0, PTE_FRAG_SIZE); - return pgtable; -} - -void pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(REGION_ID(address) != USER_REGION_ID); - - /* - * We can't mark the pmd none here, because that will cause a race - * against exit_mmap. We need to continue mark pmd TRANS HUGE, while - * we spilt, but at the same time we wan't rest of the ppc64 code - * not to insert hash pte on this, because we will be modifying - * the deposited pgtable in the caller of this function. Hence - * clear the _PAGE_USER so that we move the fault handling to - * higher level function and that will serialize against ptl. - * We need to flush existing hash pte entries here even though, - * the translation is still valid, because we will withdraw - * pgtable_t after this. - */ - pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED); -} - - -/* - * set a new huge pmd. We should not be called for updating - * an existing pmd entry. That should go via pmd_hugepage_update. - */ -void set_pmd_at(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd) -{ -#ifdef CONFIG_DEBUG_VM - WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); - assert_spin_locked(&mm->page_table_lock); - WARN_ON(!pmd_trans_huge(pmd)); -#endif - trace_hugepage_set_pmd(addr, pmd_val(pmd)); - return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); -} - -/* - * We use this to invalidate a pmdp entry before switching from a - * hugepte to regular pmd entry. - */ -void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); - - /* - * This ensures that generic code that rely on IRQ disabling - * to prevent a parallel THP split work as expected. - */ - kick_all_cpus_sync(); -} - -/* - * A linux hugepage PMD was changed and the corresponding hash table entries - * neesd to be flushed. - */ -void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long old_pmd) -{ - int ssize; - unsigned int psize; - unsigned long vsid; - unsigned long flags = 0; - const struct cpumask *tmp; - - /* get the base page size,vsid and segment size */ -#ifdef CONFIG_DEBUG_VM - psize = get_slice_psize(mm, addr); - BUG_ON(psize == MMU_PAGE_16M); -#endif - if (old_pmd & H_PAGE_COMBO) - psize = MMU_PAGE_4K; - else - psize = MMU_PAGE_64K; - - if (!is_kernel_addr(addr)) { - ssize = user_segment_size(addr); - vsid = get_vsid(mm->context.id, addr, ssize); - WARN_ON(vsid == 0); - } else { - vsid = get_kernel_vsid(addr, mmu_kernel_ssize); - ssize = mmu_kernel_ssize; - } - - tmp = cpumask_of(smp_processor_id()); - if (cpumask_equal(mm_cpumask(mm), tmp)) - flags |= HPTE_LOCAL_UPDATE; - - return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); -} - -static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) -{ - return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); -} - -pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) -{ - unsigned long pmdv; - - pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; - return pmd_set_protbits(__pmd(pmdv), pgprot); -} - -pmd_t mk_pmd(struct page *page, pgprot_t pgprot) -{ - return pfn_pmd(page_to_pfn(page), pgprot); -} - -pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) -{ - unsigned long pmdv; - - pmdv = pmd_val(pmd); - pmdv &= _HPAGE_CHG_MASK; - return pmd_set_protbits(__pmd(pmdv), newprot); -} - -/* - * This is called at the end of handling a user page fault, when the - * fault has been handled by updating a HUGE PMD entry in the linux page tables. - * We use it to preload an HPTE into the hash table corresponding to - * the updated linux HUGE PMD entry. - */ -void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd) -{ - return; -} - -pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) -{ - pmd_t old_pmd; - pgtable_t pgtable; - unsigned long old; - pgtable_t *pgtable_slot; - - old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); - old_pmd = __pmd(old); - /* - * We have pmd == none and we are holding page_table_lock. - * So we can safely go and clear the pgtable hash - * index info. - */ - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - pgtable = *pgtable_slot; - /* - * Let's zero out old valid and hash index details - * hash fault look at them. - */ - memset(pgtable, 0, PTE_FRAG_SIZE); - /* - * Serialize against find_linux_pte_or_hugepte which does lock-less - * lookup in page tables with local interrupts disabled. For huge pages - * it casts pmd_t to pte_t. Since format of pte_t is different from - * pmd_t we want to prevent transit from pmd pointing to page table - * to pmd pointing to huge page (and back) while interrupts are disabled. - * We clear pmd to possibly replace it with page table pointer in - * different code paths. So make sure we wait for the parallel - * find_linux_pte_or_hugepage to finish. - */ - kick_all_cpus_sync(); - return old_pmd; -} - -int has_transparent_hugepage(void) -{ - - if (!mmu_has_feature(MMU_FTR_16M_PAGE)) - return 0; - /* - * We support THP only if PMD_SIZE is 16MB. - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) - return 0; - /* - * We need to make sure that we support 16MB hugepage in a segement - * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE - * of 64K. - */ - /* - * If we have 64K HPTE, we will be using that by default - */ - if (mmu_psize_defs[MMU_PAGE_64K].shift && - (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) - return 0; - /* - * Ok we only have 4K HPTE - */ - if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) - return 0; - - return 1; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -- cgit v0.10.2 From 3df33f12be2f6fa88dff12564ab1d069482e3224 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:29 +1000 Subject: powerpc/mm/thp: Abstraction for THP functions Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 80d2abe..5aae4f5 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -119,11 +119,6 @@ static inline int hash__remap_4k_pfn(struct vm_area_struct *vma, unsigned long a #define H_PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) #ifdef CONFIG_TRANSPARENT_HUGEPAGE -extern unsigned long pmd_hugepage_update(struct mm_struct *mm, - unsigned long addr, - pmd_t *pmdp, - unsigned long clr, - unsigned long set); static inline char *get_hpte_slot_array(pmd_t *pmdp) { /* @@ -193,6 +188,24 @@ static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b) return (((pmd_raw(pmd_a) ^ pmd_raw(pmd_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0); } +static inline pmd_t hash__pmd_mkhuge(pmd_t pmd) +{ + return __pmd(pmd_val(pmd) | (_PAGE_PTE | H_PAGE_THP_HUGE)); +} + +extern unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp, + unsigned long clr, unsigned long set); +extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable); +extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); +extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp); +extern int hash__has_transparent_hugepage(void); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h index 27b5e34..79331cf 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h @@ -54,41 +54,6 @@ static inline int hugepd_ok(hugepd_t hpd) #endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int pmd_large(pmd_t pmd) -{ - return !!(pmd_val(pmd) & _PAGE_PTE); -} - -static inline pmd_t pmd_mknotpresent(pmd_t pmd) -{ - return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT); -} -/* - * For radix we should always find H_PAGE_HASHPTE zero. Hence - * the below will work for radix too - */ -static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) -{ - unsigned long old; - - if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) - return 0; - old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0); - return ((old & _PAGE_ACCESSED) != 0); -} - -#define __HAVE_ARCH_PMDP_SET_WRPROTECT -static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - - if ((pmd_val(*pmdp) & _PAGE_WRITE) == 0) - return; - - pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0); -} - static inline int pmd_trans_huge(pmd_t pmd) { if (radix_enabled()) @@ -103,6 +68,12 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) return radix__pmd_same(pmd_a, pmd_b); return hash__pmd_same(pmd_a, pmd_b); } + +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + return hash__pmd_mkhuge(pmd); +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, @@ -111,7 +82,6 @@ static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, if (radix_enabled()) BUG(); return hash__remap_4k_pfn(vma, addr, pfn, prot); - } #endif /* __ASSEMBLY__ */ #endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index dfdd1f4..5f290d3 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -769,7 +769,6 @@ static inline int __meminit vmemmap_create_mapping(unsigned long start, static inline void vmemmap_remove_mapping(unsigned long start, unsigned long page_size) { - if (radix_enabled()) return radix__vmemmap_remove_mapping(start, page_size); return hash__vmemmap_remove_mapping(start, page_size); @@ -825,11 +824,52 @@ extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd); -extern int has_transparent_hugepage(void); +extern int hash__has_transparent_hugepage(void); +static inline int has_transparent_hugepage(void) +{ + return hash__has_transparent_hugepage(); +} -static inline pmd_t pmd_mkhuge(pmd_t pmd) +static inline unsigned long +pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, + unsigned long clr, unsigned long set) { - return __pmd(pmd_val(pmd) | (_PAGE_PTE | H_PAGE_THP_HUGE)); + return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set); +} + +static inline int pmd_large(pmd_t pmd) +{ + return !!(pmd_val(pmd) & _PAGE_PTE); +} + +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT); +} +/* + * For radix we should always find H_PAGE_HASHPTE zero. Hence + * the below will work for radix too + */ +static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + unsigned long old; + + if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) + return 0; + old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0); + return ((old & _PAGE_ACCESSED) != 0); +} + +#define __HAVE_ARCH_PMDP_SET_WRPROTECT +static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + + if ((pmd_val(*pmdp) & _PAGE_WRITE) == 0) + return; + + pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0); } #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS @@ -842,26 +882,43 @@ extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR -extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp); +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + return hash__pmdp_huge_get_and_clear(mm, addr, pmdp); +} -extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); +static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + return hash__pmdp_collapse_flush(vma, address, pmdp); +} #define pmdp_collapse_flush pmdp_collapse_flush #define __HAVE_ARCH_PGTABLE_DEPOSIT -extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable); +static inline void pgtable_trans_huge_deposit(struct mm_struct *mm, + pmd_t *pmdp, pgtable_t pgtable) +{ + return hash__pgtable_trans_huge_deposit(mm, pmdp, pgtable); +} + #define __HAVE_ARCH_PGTABLE_WITHDRAW -extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); +static inline pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, + pmd_t *pmdp) +{ + return hash__pgtable_trans_huge_withdraw(mm, pmdp); +} #define __HAVE_ARCH_PMDP_INVALIDATE extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE -extern void pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); +static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + return hash__pmdp_huge_split_prepare(vma, address, pmdp); +} #define pmd_move_must_withdraw pmd_move_must_withdraw struct spinlock; diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 47511dd..f2cea6d 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o -obj-$(CONFIG_PPC_STD_MMU_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o +obj-$(CONFIG_PPC_STD_MMU_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(CONFIG_WORD_SIZE).o diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c new file mode 100644 index 0000000..d566a25 --- /dev/null +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -0,0 +1,118 @@ +/* + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include "mmu_decl.h" +#include + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * This is called when relaxing access to a hugepage. It's also called in the page + * fault path when we don't hit any of the major fault cases, ie, a minor + * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have + * handled those two for us, we additionally deal with missing execute + * permission here on some processors + */ +int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp, pmd_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp)); + assert_spin_locked(&vma->vm_mm->page_table_lock); +#endif + changed = !pmd_same(*(pmdp), entry); + if (changed) { + __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); + /* + * Since we are not supporting SW TLB systems, we don't + * have any thing similar to flush_tlb_page_nohash() + */ + } + return changed; +} + +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); +} +/* + * set a new huge pmd. We should not be called for updating + * an existing pmd entry. That should go via pmd_hugepage_update. + */ +void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ +#ifdef CONFIG_DEBUG_VM + WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); + assert_spin_locked(&mm->page_table_lock); + WARN_ON(!pmd_trans_huge(pmd)); +#endif + trace_hugepage_set_pmd(addr, pmd_val(pmd)); + return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); +} +/* + * We use this to invalidate a pmdp entry before switching from a + * hugepte to regular pmd entry. + */ +void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); + + /* + * This ensures that generic code that rely on IRQ disabling + * to prevent a parallel THP split work as expected. + */ + kick_all_cpus_sync(); +} + +static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) +{ + return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); +} + +pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) +{ + unsigned long pmdv; + + pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; + return pmd_set_protbits(__pmd(pmdv), pgprot); +} + +pmd_t mk_pmd(struct page *page, pgprot_t pgprot) +{ + return pfn_pmd(page_to_pfn(page), pgprot); +} + +pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + unsigned long pmdv; + + pmdv = pmd_val(pmd); + pmdv &= _HPAGE_CHG_MASK; + return pmd_set_protbits(__pmd(pmdv), newprot); +} + +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a HUGE PMD entry in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux HUGE PMD entry. + */ +void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd) +{ + return; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index 9699a1c..c23e286 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -99,35 +99,9 @@ int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flag #ifdef CONFIG_TRANSPARENT_HUGEPAGE -/* - * This is called when relaxing access to a hugepage. It's also called in the page - * fault path when we don't hit any of the major fault cases, ie, a minor - * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have - * handled those two for us, we additionally deal with missing execute - * permission here on some processors - */ -int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp, pmd_t entry, int dirty) -{ - int changed; -#ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_trans_huge(*pmdp)); - assert_spin_locked(&vma->vm_mm->page_table_lock); -#endif - changed = !pmd_same(*(pmdp), entry); - if (changed) { - __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); - /* - * Since we are not supporting SW TLB systems, we don't - * have any thing similar to flush_tlb_page_nohash() - */ - } - return changed; -} - -unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long clr, - unsigned long set) +unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set) { __be64 old_be, tmp; unsigned long old; @@ -158,8 +132,8 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, return old; } -pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) +pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) { pmd_t pmd; @@ -198,24 +172,11 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, } /* - * We currently remove entries from the hashtable regardless of whether - * the entry was young or dirty. - * - * We should be more intelligent about this but for the moment we override - * these functions and force a tlb flush unconditionally - */ -int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); -} - -/* * We want to put the pgtable in pmd and use pgtable for tracking * the base page size hptes */ -void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable) +void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) { pgtable_t *pgtable_slot; assert_spin_locked(&mm->page_table_lock); @@ -233,7 +194,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, smp_wmb(); } -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { pgtable_t pgtable; pgtable_t *pgtable_slot; @@ -253,8 +214,8 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) return pgtable; } -void pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) +void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) { VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(REGION_ID(address) != USER_REGION_ID); @@ -274,39 +235,6 @@ void pmdp_huge_split_prepare(struct vm_area_struct *vma, pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED); } - -/* - * set a new huge pmd. We should not be called for updating - * an existing pmd entry. That should go via pmd_hugepage_update. - */ -void set_pmd_at(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd) -{ -#ifdef CONFIG_DEBUG_VM - WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); - assert_spin_locked(&mm->page_table_lock); - WARN_ON(!pmd_trans_huge(pmd)); -#endif - trace_hugepage_set_pmd(addr, pmd_val(pmd)); - return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); -} - -/* - * We use this to invalidate a pmdp entry before switching from a - * hugepte to regular pmd entry. - */ -void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); - - /* - * This ensures that generic code that rely on IRQ disabling - * to prevent a parallel THP split work as expected. - */ - kick_all_cpus_sync(); -} - /* * A linux hugepage PMD was changed and the corresponding hash table entries * neesd to be flushed. @@ -346,47 +274,8 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); } -static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) -{ - return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); -} - -pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) -{ - unsigned long pmdv; - - pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; - return pmd_set_protbits(__pmd(pmdv), pgprot); -} - -pmd_t mk_pmd(struct page *page, pgprot_t pgprot) -{ - return pfn_pmd(page_to_pfn(page), pgprot); -} - -pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) -{ - unsigned long pmdv; - - pmdv = pmd_val(pmd); - pmdv &= _HPAGE_CHG_MASK; - return pmd_set_protbits(__pmd(pmdv), newprot); -} - -/* - * This is called at the end of handling a user page fault, when the - * fault has been handled by updating a HUGE PMD entry in the linux page tables. - * We use it to preload an HPTE into the hash table corresponding to - * the updated linux HUGE PMD entry. - */ -void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd) -{ - return; -} - -pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) +pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) { pmd_t old_pmd; pgtable_t pgtable; @@ -421,7 +310,7 @@ pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, return old_pmd; } -int has_transparent_hugepage(void) +int hash__has_transparent_hugepage(void) { if (!mmu_has_feature(MMU_FTR_16M_PAGE)) -- cgit v0.10.2 From bde3eb6222e49673d2ee9dfdc6ab6e7b6ff69e91 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:30 +1000 Subject: powerpc/mm/radix: Add radix THP callbacks The deposited pgtable_t is a pte fragment hence we cannot use page->lru for linking then together. We use the first two 64 bits for pte fragment as list_head type to link all deposited fragments together. On withdraw we properly zero then out. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h index 79331cf..793ecd5 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h @@ -71,6 +71,8 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) static inline pmd_t pmd_mkhuge(pmd_t pmd) { + if (radix_enabled()) + return radix__pmd_mkhuge(pmd); return hash__pmd_mkhuge(pmd); } diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 5f290d3..1b09ba0 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -827,6 +827,8 @@ extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, extern int hash__has_transparent_hugepage(void); static inline int has_transparent_hugepage(void) { + if (radix_enabled()) + return radix__has_transparent_hugepage(); return hash__has_transparent_hugepage(); } @@ -834,6 +836,8 @@ static inline unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, unsigned long clr, unsigned long set) { + if (radix_enabled()) + return radix__pmd_hugepage_update(mm, addr, pmdp, clr, set); return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set); } @@ -885,12 +889,16 @@ extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { + if (radix_enabled()) + return radix__pmdp_huge_get_and_clear(mm, addr, pmdp); return hash__pmdp_huge_get_and_clear(mm, addr, pmdp); } static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { + if (radix_enabled()) + return radix__pmdp_collapse_flush(vma, address, pmdp); return hash__pmdp_collapse_flush(vma, address, pmdp); } #define pmdp_collapse_flush pmdp_collapse_flush @@ -899,6 +907,8 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, static inline void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) { + if (radix_enabled()) + return radix__pgtable_trans_huge_deposit(mm, pmdp, pgtable); return hash__pgtable_trans_huge_deposit(mm, pmdp, pgtable); } @@ -906,6 +916,8 @@ static inline void pgtable_trans_huge_deposit(struct mm_struct *mm, static inline pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { + if (radix_enabled()) + return radix__pgtable_trans_huge_withdraw(mm, pmdp); return hash__pgtable_trans_huge_withdraw(mm, pmdp); } @@ -917,6 +929,8 @@ extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { + if (radix_enabled()) + return radix__pmdp_huge_split_prepare(vma, address, pmdp); return hash__pmdp_huge_split_prepare(vma, address, pmdp); } @@ -925,6 +939,8 @@ struct spinlock; static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, struct spinlock *old_pmd_ptl) { + if (radix_enabled()) + return false; /* * Archs like ppc64 use pgtable to store per pmd * specific information. So when we switch the pmd, diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index f470902..937d4e2 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -196,6 +196,28 @@ static inline int radix__pmd_trans_huge(pmd_t pmd) return !!(pmd_val(pmd) & _PAGE_PTE); } +static inline pmd_t radix__pmd_mkhuge(pmd_t pmd) +{ + return __pmd(pmd_val(pmd) | _PAGE_PTE); +} +static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + /* Nothing to do for radix. */ + return; +} + +extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set); +extern pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable); +extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); +extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp); +extern int radix__has_transparent_hugepage(void); #endif extern int __meminit radix__vmemmap_create_mapping(unsigned long start, diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index d566a25..eb44511 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -69,7 +69,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); - + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); /* * This ensures that generic code that rely on IRQ disabling * to prevent a parallel THP split work as expected. diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index a5a5253..18b2c11 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -19,6 +19,8 @@ #include #include +#include + static int native_update_partition_table(u64 patb1) { partition_tb->patb1 = cpu_to_be64(patb1); @@ -407,3 +409,118 @@ void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) } #endif #endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set) +{ + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!radix__pmd_trans_huge(*pmdp)); + assert_spin_locked(&mm->page_table_lock); +#endif + + old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1); + trace_hugepage_update(addr, old, clr, set); + + return old; +} + +pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) + +{ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(radix__pmd_trans_huge(*pmdp)); + /* + * khugepaged calls this for normal pmd + */ + pmd = *pmdp; + pmd_clear(pmdp); + /*FIXME!! Verify whether we need this kick below */ + kick_all_cpus_sync(); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return pmd; +} + +/* + * For us pgtable_t is pte_t *. Inorder to save the deposisted + * page table, we consider the allocated page table as a list + * head. On withdraw we need to make sure we zero out the used + * list_head memory area. + */ +void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + struct list_head *lh = (struct list_head *) pgtable; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + if (!pmd_huge_pte(mm, pmdp)) + INIT_LIST_HEAD(lh); + else + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; +} + +pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + pte_t *ptep; + pgtable_t pgtable; + struct list_head *lh; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + pgtable = pmd_huge_pte(mm, pmdp); + lh = (struct list_head *) pgtable; + if (list_empty(lh)) + pmd_huge_pte(mm, pmdp) = NULL; + else { + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; + list_del(lh); + } + ptep = (pte_t *) pgtable; + *ptep = __pte(0); + ptep++; + *ptep = __pte(0); + return pgtable; +} + + +pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old_pmd; + unsigned long old; + + old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); + old_pmd = __pmd(old); + /* + * Serialize against find_linux_pte_or_hugepte which does lock-less + * lookup in page tables with local interrupts disabled. For huge pages + * it casts pmd_t to pte_t. Since format of pte_t is different from + * pmd_t we want to prevent transit from pmd pointing to page table + * to pmd pointing to huge page (and back) while interrupts are disabled. + * We clear pmd to possibly replace it with page table pointer in + * different code paths. So make sure we wait for the parallel + * find_linux_pte_or_hugepage to finish. + */ + kick_all_cpus_sync(); + return old_pmd; +} + +int radix__has_transparent_hugepage(void) +{ + /* For radix 2M at PMD level means thp */ + if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT) + return 1; + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -- cgit v0.10.2 From ab62476240d59a9d7d78ca5ff32499dd50db2069 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:31 +1000 Subject: powerpc/mm/radix: Add THP support for 4K linux page size This adds THP support for 4K Linux page size config with radix. We still don't do THP with 4K Linux page size and hash page table. Hash page table needs a 16MB hugepage and we can't do THP with 16MM hugepage and 4K Linux page size. We add missing functions to 4K hash config to get it to build and hash__has_transparent_hugepage() makes sure we don't enable THP for 4K hash config. To catch wrong usage of THP related with 4K config, we add BUG() in those dummy functions we added to get it compile. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 6a10b0d..1af837c 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -50,6 +50,65 @@ static inline int hash__hugepd_ok(hugepd_t hpd) } #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +static inline char *get_hpte_slot_array(pmd_t *pmdp) +{ + BUG(); + return NULL; +} + +static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index) +{ + BUG(); + return 0; +} + +static inline unsigned int hpte_hash_index(unsigned char *hpte_slot_array, + int index) +{ + BUG(); + return 0; +} + +static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array, + unsigned int index, unsigned int hidx) +{ + BUG(); +} + +static inline int hash__pmd_trans_huge(pmd_t pmd) +{ + return 0; +} + +static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + BUG(); + return 0; +} + +static inline pmd_t hash__pmd_mkhuge(pmd_t pmd) +{ + BUG(); + return pmd; +} + +extern unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp, + unsigned long clr, unsigned long set); +extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable); +extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); +extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp); +extern int hash__has_transparent_hugepage(void); +#endif + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_HASH_4K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h index 793ecd5..cb2d0a5 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h @@ -53,31 +53,6 @@ static inline int hugepd_ok(hugepd_t hpd) #endif /* CONFIG_HUGETLB_PAGE */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int pmd_trans_huge(pmd_t pmd) -{ - if (radix_enabled()) - return radix__pmd_trans_huge(pmd); - return hash__pmd_trans_huge(pmd); -} - -#define __HAVE_ARCH_PMD_SAME -static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) -{ - if (radix_enabled()) - return radix__pmd_same(pmd_a, pmd_b); - return hash__pmd_same(pmd_a, pmd_b); -} - -static inline pmd_t pmd_mkhuge(pmd_t pmd) -{ - if (radix_enabled()) - return radix__pmd_mkhuge(pmd); - return hash__pmd_mkhuge(pmd); -} - -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t prot) { diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 1b09ba0..898222a 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -876,6 +876,28 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0); } +static inline int pmd_trans_huge(pmd_t pmd) +{ + if (radix_enabled()) + return radix__pmd_trans_huge(pmd); + return hash__pmd_trans_huge(pmd); +} + +#define __HAVE_ARCH_PMD_SAME +static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + if (radix_enabled()) + return radix__pmd_same(pmd_a, pmd_b); + return hash__pmd_same(pmd_a, pmd_b); +} + +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + if (radix_enabled()) + return radix__pmd_mkhuge(pmd); + return hash__pmd_mkhuge(pmd); +} + #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 8d2cfb8..77e9b8d 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -72,7 +72,7 @@ config PPC_BOOK3S_64 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select SYS_SUPPORTS_HUGETLBFS - select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES + select HAVE_ARCH_TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_NUMA_BALANCING select IRQ_WORK -- cgit v0.10.2 From 17a3dd2f5fc75fc12d0a2ebf05327cf965bdf84b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:32 +1000 Subject: powerpc/mm/radix: Use firmware feature to enable Radix MMU We use the existing "ibm,pa-features" device-tree property to enable Radix MMU mode. This means we default to hash mode unless firmware tells us it's OK to start using Radix mode. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 1b082c7..d924cd6 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -166,6 +166,7 @@ static struct ibm_pa_feature { * which is 0 if the kernel doesn't support TM. */ {CPU_FTR_TM_COMP, 0, 0, 22, 0, 0}, + {0, MMU_FTR_RADIX, 0, 40, 0, 0}, }; static void __init scan_features(unsigned long node, const unsigned char *ftrs, -- cgit v0.10.2 From 69dfbaeb657bad13dddb3204603490206b859a7e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 29 Apr 2016 23:26:33 +1000 Subject: powerpc/mm/radix: Document software bits for radix Add #defines for Power ISA 3.0 software defined bits. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 898222a..48dc76c 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -20,12 +20,16 @@ /* * Software bits */ +#define _RPAGE_SW0 0x2000000000000000UL +#define _RPAGE_SW1 0x00800 +#define _RPAGE_SW2 0x00400 +#define _RPAGE_SW3 0x00200 #ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SOFT_DIRTY 0x00200 /* software: software dirty tracking */ +#define _PAGE_SOFT_DIRTY _RPAGE_SW3 /* software: software dirty tracking */ #else #define _PAGE_SOFT_DIRTY 0x00000 #endif -#define _PAGE_SPECIAL 0x00400 /* software: special page */ +#define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */ #define _PAGE_PTE (1ul << 62) /* distinguishes PTEs from pointers */ -- cgit v0.10.2 From bb03efe2b7409d83a279ea1ef2317c26dbfb0c6c Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Tue, 3 May 2016 08:59:27 +0200 Subject: powerpc/mm/radix: Fix CONFIG_PPC_MMU_STD_64 typo It's CONFIG_PPC_STD_MMU_64 not ... CONFIG_PPC_MMU_STD_64. Fixes: 11ffc1cfa4c2 ("powerpc/mm/radix: Use STD_MMU_64 to properly isolate hash related code") Signed-off-by: Valentin Rothberg Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index bec7033..7353991 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -80,7 +80,7 @@ void __flush_tlb_power9(unsigned int action) /* flush SLBs and reload */ -#ifdef CONFIG_PPC_MMU_STD_64 +#ifdef CONFIG_PPC_STD_MMU_64 static void flush_and_reload_slb(void) { struct slb_shadow *slb; @@ -125,7 +125,7 @@ static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits) * reset the error bits whenever we handle them so that at the end * we can check whether we handled all of them or not. * */ -#ifdef CONFIG_PPC_MMU_STD_64 +#ifdef CONFIG_PPC_STD_MMU_64 if (dsisr & slb_error_bits) { flush_and_reload_slb(); /* reset error bits */ @@ -157,7 +157,7 @@ static long mce_handle_common_ierror(uint64_t srr1) switch (P7_SRR1_MC_IFETCH(srr1)) { case 0: break; -#ifdef CONFIG_PPC_MMU_STD_64 +#ifdef CONFIG_PPC_STD_MMU_64 case P7_SRR1_MC_IFETCH_SLB_PARITY: case P7_SRR1_MC_IFETCH_SLB_MULTIHIT: /* flush and reload SLBs for SLB errors. */ @@ -184,7 +184,7 @@ static long mce_handle_ierror_p7(uint64_t srr1) handled = mce_handle_common_ierror(srr1); -#ifdef CONFIG_PPC_MMU_STD_64 +#ifdef CONFIG_PPC_STD_MMU_64 if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) { flush_and_reload_slb(); handled = 1; @@ -332,7 +332,7 @@ static long mce_handle_ierror_p8(uint64_t srr1) handled = mce_handle_common_ierror(srr1); -#ifdef CONFIG_PPC_MMU_STD_64 +#ifdef CONFIG_PPC_STD_MMU_64 if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) { flush_and_reload_slb(); handled = 1; -- cgit v0.10.2 From 2d566537dd2c69bd6898ef5d95ecccdfbab97eef Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 2 May 2016 16:21:50 +0530 Subject: powerpc/mm/subpage: Initialise user psize correctly As part of the radix support we switched Book3s64 to use a value of ~0 for MMU_NO_CONTEXT. That is because id 0 is special on radix. However that broke the logic in init_new_context(). The code there needs to differentiate between a newly allocated context and one inherited via fork. Previously it worked because a newly allocated context has an id of zero (because it was just memset() to zero), which used to match MMU_NO_CONTEXT, and therefore slice_mm_new_context() did the right thing. Instead check against a context.id value of zero instead of using slice_mm_new_context(). Without this patch we never call slice_set_user_psize(), and end up with a slice psize value of zero and we always end up using 4K HPTE. Fixes: 1a472c9dba6b ("powerpc/mm/radix: Add tlbflush routines") Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index b5288b4..227b2a6 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -85,8 +85,17 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) /* The old code would re-promote on fork, we don't do that * when using slices as it could cause problem promoting slices * that have been forced down to 4K + * + * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check + * explicitly against context.id == 0. This ensures that we + * properly initialize context slice details for newly allocated + * mm's (which will have id == 0) and don't alter context slice + * inherited via fork (which will have id != 0). + * + * We should not be calling init_new_context() on init_mm. Hence a + * check against 0 is ok. */ - if (slice_mm_new_context(mm)) + if (mm->context.id == 0) slice_set_user_psize(mm, mmu_virtual_psize); subpage_prot_init_new_context(mm); } -- cgit v0.10.2 From 62ccf5bf1f4704986844440069df70b6509a9213 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 2 May 2016 18:26:07 +0530 Subject: powerpc/mm/slice: Remove slice_mm_new_context() The usage in mm mmu_context_nohash.c is bogus, because we set the context.id value to MMU_NO_CONTEXT 4 lines previously in the same function, meaning slice_mm_new_context() will always be true. The book3s 64 usage was removed in the previous commit. So remove it as unused. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index 7748885..dd5f071 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -128,8 +128,6 @@ extern void slice_set_user_psize(struct mm_struct *mm, unsigned int psize); extern void slice_set_range_psize(struct mm_struct *mm, unsigned long start, unsigned long len, unsigned int psize); -#define slice_mm_new_context(mm) ((mm)->context.id == MMU_NO_CONTEXT) - #endif /* __ASSEMBLY__ */ #else #define slice_init() @@ -151,7 +149,6 @@ do { \ #define slice_set_range_psize(mm, start, len, psize) \ slice_set_user_psize((mm), (psize)) -#define slice_mm_new_context(mm) 1 #endif /* CONFIG_PPC_MM_SLICES */ #ifdef CONFIG_HUGETLB_PAGE diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index a36c43a..7d95bc4 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -335,8 +335,7 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm) mm->context.active = 0; #ifdef CONFIG_PPC_MM_SLICES - if (slice_mm_new_context(mm)) - slice_set_user_psize(mm, mmu_virtual_psize); + slice_set_user_psize(mm, mmu_virtual_psize); #endif return 0; -- cgit v0.10.2 From c415c9cb8ab2c852a56f5d592057fc14c585301d Mon Sep 17 00:00:00 2001 From: Rashmica Gupta Date: Tue, 12 Apr 2016 15:33:58 +1000 Subject: powerpc: Remove unused remnants from A2 cpu Support for the A2 cpu was removed in commit fb5a515704d7 ("powerpc: Remove platforms/wsp and associated pieces"), and the externs: __setup_cpu_a2 and __restore_cpu_a2 are still around and unused, so remove them. Signed-off-by: Rashmica Gupta Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 6c662b8..eeeacf6 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -63,7 +63,6 @@ extern void __setup_cpu_745x(unsigned long offset, struct cpu_spec* spec); extern void __setup_cpu_ppc970(unsigned long offset, struct cpu_spec* spec); extern void __setup_cpu_ppc970MP(unsigned long offset, struct cpu_spec* spec); extern void __setup_cpu_pa6t(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_a2(unsigned long offset, struct cpu_spec* spec); extern void __restore_cpu_pa6t(void); extern void __restore_cpu_ppc970(void); extern void __setup_cpu_power7(unsigned long offset, struct cpu_spec* spec); @@ -72,7 +71,6 @@ extern void __setup_cpu_power8(unsigned long offset, struct cpu_spec* spec); extern void __restore_cpu_power8(void); extern void __setup_cpu_power9(unsigned long offset, struct cpu_spec* spec); extern void __restore_cpu_power9(void); -extern void __restore_cpu_a2(void); extern void __flush_tlb_power7(unsigned int action); extern void __flush_tlb_power8(unsigned int action); extern void __flush_tlb_power9(unsigned int action); -- cgit v0.10.2 From e44c1b15cf32e33cd4521f7278333d448eda170b Mon Sep 17 00:00:00 2001 From: Chris Smart Date: Mon, 2 May 2016 16:00:58 +1000 Subject: powerpc: Remove unnecessary CONFIG_SMP #ifdefs The code in machine_restart/power_off/halt() includes #ifdefs around calls to smp_send_stop(), however these are not required as include/linux/smp.h includes an empty version of this function for CONFIG_SMP=n builds. Signed-off-by: Chris Smart Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 44c8d03..8ca79b7 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -128,9 +128,7 @@ void machine_restart(char *cmd) machine_shutdown(); if (ppc_md.restart) ppc_md.restart(cmd); -#ifdef CONFIG_SMP smp_send_stop(); -#endif printk(KERN_EMERG "System Halted, OK to turn off power\n"); local_irq_disable(); while (1) ; @@ -141,9 +139,7 @@ void machine_power_off(void) machine_shutdown(); if (pm_power_off) pm_power_off(); -#ifdef CONFIG_SMP smp_send_stop(); -#endif printk(KERN_EMERG "System Halted, OK to turn off power\n"); local_irq_disable(); while (1) ; @@ -159,9 +155,7 @@ void machine_halt(void) machine_shutdown(); if (ppc_md.halt) ppc_md.halt(); -#ifdef CONFIG_SMP smp_send_stop(); -#endif printk(KERN_EMERG "System Halted, OK to turn off power\n"); local_irq_disable(); while (1) ; -- cgit v0.10.2 From 438517ec78fa4e9db9892e749ed50e2a21f5f17f Mon Sep 17 00:00:00 2001 From: Chris Smart Date: Mon, 2 May 2016 13:51:38 +1000 Subject: selftests/powerpc: Test cp_abort during context switch Test that performing a copy paste sequence in userspace on P9 does not result in a leak of the copy into the paste of another process. This is based on Anton Blanchard's context_switch benchmarking code. It sets up two processes tied to the same CPU, one which copies and one which pastes. The paste should never succeed and the test fails if it does. This is a test for commit, "8a64904 powerpc: Add support for userspace P9 copy paste." Patch created with much assistance from Michael Neuling Signed-off-by: Chris Smart Reviewed-by: Cyril Bur Signed-off-by: Michael Ellerman diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index b08f77c..4ca83fe 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -14,6 +14,7 @@ export CFLAGS SUB_DIRS = benchmarks \ copyloops \ + context_switch \ dscr \ mm \ pmu \ diff --git a/tools/testing/selftests/powerpc/context_switch/.gitignore b/tools/testing/selftests/powerpc/context_switch/.gitignore new file mode 100644 index 0000000..c1431af --- /dev/null +++ b/tools/testing/selftests/powerpc/context_switch/.gitignore @@ -0,0 +1 @@ +cp_abort diff --git a/tools/testing/selftests/powerpc/context_switch/Makefile b/tools/testing/selftests/powerpc/context_switch/Makefile new file mode 100644 index 0000000..e164d14 --- /dev/null +++ b/tools/testing/selftests/powerpc/context_switch/Makefile @@ -0,0 +1,10 @@ +TEST_PROGS := cp_abort + +all: $(TEST_PROGS) + +$(TEST_PROGS): ../harness.c ../utils.c + +include ../../lib.mk + +clean: + rm -f $(TEST_PROGS) diff --git a/tools/testing/selftests/powerpc/context_switch/cp_abort.c b/tools/testing/selftests/powerpc/context_switch/cp_abort.c new file mode 100644 index 0000000..5a5b55a --- /dev/null +++ b/tools/testing/selftests/powerpc/context_switch/cp_abort.c @@ -0,0 +1,110 @@ +/* + * Adapted from Anton Blanchard's context switch microbenchmark. + * + * Copyright 2009, Anton Blanchard, IBM Corporation. + * Copyright 2016, Mikey Neuling, Chris Smart, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * This program tests the copy paste abort functionality of a P9 + * (or later) by setting up two processes on the same CPU, one + * which executes the copy instruction and the other which + * executes paste. + * + * The paste instruction should never succeed, as the cp_abort + * instruction is called by the kernel during a context switch. + * + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include "utils.h" +#include + +#define READ_FD 0 +#define WRITE_FD 1 + +#define NUM_LOOPS 1000 + +/* This defines the "paste" instruction from Power ISA 3.0 Book II, section 4.4. */ +#define PASTE(RA, RB, L, RC) \ + .long (0x7c00070c | (RA) << (31-15) | (RB) << (31-20) | (L) << (31-10) | (RC) << (31-31)) + +int paste(void *i) +{ + int cr; + + asm volatile(str(PASTE(0, %1, 1, 1))";" + "mfcr %0;" + : "=r" (cr) + : "b" (i) + : "memory" + ); + return cr; +} + +/* This defines the "copy" instruction from Power ISA 3.0 Book II, section 4.4. */ +#define COPY(RA, RB, L) \ + .long (0x7c00060c | (RA) << (31-15) | (RB) << (31-20) | (L) << (31-10)) + +void copy(void *i) +{ + asm volatile(str(COPY(0, %0, 1))";" + : + : "b" (i) + : "memory" + ); +} + +int test_cp_abort(void) +{ + /* 128 bytes for a full cache line */ + char buf[128] __cacheline_aligned; + cpu_set_t cpuset; + int fd1[2], fd2[2], pid; + char c; + + /* only run this test on a P9 or later */ + SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_00)); + + /* + * Run both processes on the same CPU, so that copy is more likely + * to leak into a paste. + */ + CPU_ZERO(&cpuset); + CPU_SET(pick_online_cpu(), &cpuset); + FAIL_IF(sched_setaffinity(0, sizeof(cpuset), &cpuset)); + + FAIL_IF(pipe(fd1) || pipe(fd2)); + + pid = fork(); + FAIL_IF(pid < 0); + + if (!pid) { + for (int i = 0; i < NUM_LOOPS; i++) { + FAIL_IF((write(fd1[WRITE_FD], &c, 1)) != 1); + FAIL_IF((read(fd2[READ_FD], &c, 1)) != 1); + /* A paste succeeds if CR0 EQ bit is set */ + FAIL_IF(paste(buf) & 0x20000000); + } + } else { + for (int i = 0; i < NUM_LOOPS; i++) { + FAIL_IF((read(fd1[READ_FD], &c, 1)) != 1); + copy(buf); + FAIL_IF((write(fd2[WRITE_FD], &c, 1) != 1)); + } + } + return 0; + +} + +int main(int argc, char *argv[]) +{ + return test_harness(test_cp_abort, "cp_abort"); +} diff --git a/tools/testing/selftests/powerpc/utils.h b/tools/testing/selftests/powerpc/utils.h index 175ac6a..3b370de 100644 --- a/tools/testing/selftests/powerpc/utils.h +++ b/tools/testing/selftests/powerpc/utils.h @@ -6,6 +6,8 @@ #ifndef _SELFTESTS_POWERPC_UTILS_H #define _SELFTESTS_POWERPC_UTILS_H +#define __cacheline_aligned __attribute__((aligned(128))) + #include #include #include @@ -54,4 +56,9 @@ do { \ #define _str(s) #s #define str(s) _str(s) +/* POWER9 feature */ +#ifndef PPC_FEATURE2_ARCH_3_00 +#define PPC_FEATURE2_ARCH_3_00 0x00800000 +#endif + #endif /* _SELFTESTS_POWERPC_UTILS_H */ -- cgit v0.10.2 From 2f67798c1f9f35ba576ac0639b9b648b9b2033f6 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 2 May 2016 14:54:29 +1000 Subject: selftests/powerpc: Fix subpage_prot test to return !0 on failure It's helpful for automated testing if the test returns error codes back to the calling program. While we're here fix all the usages of %p to remove the double 0x, ie. %p already includes 0x. Signed-off-by: Michael Ellerman Reviewed-by: Aneesh Kumar K.V diff --git a/tools/testing/selftests/powerpc/mm/subpage_prot.c b/tools/testing/selftests/powerpc/mm/subpage_prot.c index 440180f..35ade74 100644 --- a/tools/testing/selftests/powerpc/mm/subpage_prot.c +++ b/tools/testing/selftests/powerpc/mm/subpage_prot.c @@ -73,7 +73,7 @@ static inline void check_faulted(void *addr, long page, long subpage, int write) want_fault |= (subpage == ((page + 1) % 16)); if (faulted != want_fault) { - printf("Failed at 0x%p (p=%ld,sp=%ld,w=%d), want=%s, got=%s !\n", + printf("Failed at %p (p=%ld,sp=%ld,w=%d), want=%s, got=%s !\n", addr, page, subpage, write, want_fault ? "fault" : "pass", faulted ? "fault" : "pass"); @@ -82,7 +82,7 @@ static inline void check_faulted(void *addr, long page, long subpage, int write) if (faulted) { if (dar != addr) { - printf("Fault expected at 0x%p and happened at 0x%p !\n", + printf("Fault expected at %p and happened at %p !\n", addr, dar); } faulted = 0; @@ -162,7 +162,7 @@ int test_anon(void) mallocblock = (void *)align; - printf("allocated malloc block of 0x%lx bytes at 0x%p\n", + printf("allocated malloc block of 0x%lx bytes at %p\n", mallocsize, mallocblock); printf("testing malloc block...\n"); @@ -197,7 +197,7 @@ int test_file(void) perror("failed to map file"); return 1; } - printf("allocated %s for 0x%lx bytes at 0x%p\n", + printf("allocated %s for 0x%lx bytes at %p\n", file_name, filesize, fileblock); printf("testing file map...\n"); @@ -207,14 +207,16 @@ int test_file(void) int main(int argc, char *argv[]) { - test_harness(test_anon, "subpage_prot_anon"); + int rc; + + rc = test_harness(test_anon, "subpage_prot_anon"); + if (rc) + return rc; if (argc > 1) file_name = argv[1]; else file_name = "tempfile"; - test_harness(test_file, "subpage_prot_file"); - - return 0; + return test_harness(test_file, "subpage_prot_file"); } -- cgit v0.10.2 From 9e44754755528f78dbc3a5e64cd48dadf4773777 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 2 May 2016 17:06:12 +1000 Subject: powerpc/powernv/pci: Fix cfg_dbg() & replace with pr_devel() When cfg_dbg() is enabled (i.e. mapped to printk()), gcc produces errors as the __func__ parameter is missing (pnv_pci_cfg_read() has one); this adds the missing parameter. cfg_dbg() is just an inferior version of pr_devel() so use the latter instead. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 73c8dc2..0db20ae 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -39,9 +39,6 @@ /* Delay in usec */ #define PCI_RESET_DELAY_US 3000000 -#define cfg_dbg(fmt...) do { } while(0) -//#define cfg_dbg(fmt...) printk(fmt) - #ifdef CONFIG_PCI_MSI int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) { @@ -402,8 +399,8 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn) } } - cfg_dbg(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n", - (pdn->busno << 8) | (pdn->devfn), pe_no, fstate); + pr_devel(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n", + (pdn->busno << 8) | (pdn->devfn), pe_no, fstate); /* Clear the frozen state if applicable */ if (fstate == OPAL_EEH_STOPPED_MMIO_FREEZE || @@ -451,8 +448,8 @@ int pnv_pci_cfg_read(struct pci_dn *pdn, return PCIBIOS_FUNC_NOT_SUPPORTED; } - cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n", - __func__, pdn->busno, pdn->devfn, where, size, *val); + pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n", + __func__, pdn->busno, pdn->devfn, where, size, *val); return PCIBIOS_SUCCESSFUL; } @@ -462,8 +459,8 @@ int pnv_pci_cfg_write(struct pci_dn *pdn, struct pnv_phb *phb = pdn->phb->private_data; u32 bdfn = (pdn->busno << 8) | pdn->devfn; - cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n", - pdn->busno, pdn->devfn, where, size, val); + pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n", + __func__, pdn->busno, pdn->devfn, where, size, val); switch (size) { case 1: opal_pci_config_write_byte(phb->opal_id, bdfn, where, val); -- cgit v0.10.2 From b2ed059642875a699a195cb136b2e57e7e07cedf Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Thu, 28 Apr 2016 15:53:45 +1000 Subject: powerpc/pseries: Add null property check to pseries_discover_pic() The return value of of_get_property() isn't checked before it is passed to the strstr() function, if it happens that the return value is null then this will result in a null pointer being dereferenced. Add a check to see if the return value of of_get_property() is null and if it is continue straight on to the next node. Signed-off-by: Suraj Jitindar Singh Reviewed-by: Chris Smart Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 6e944fc..fa73494 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -235,6 +235,8 @@ static void __init pseries_discover_pic(void) for_each_node_by_name(np, "interrupt-controller") { typep = of_get_property(np, "compatible", NULL); + if (!typep) + continue; if (strstr(typep, "open-pic")) { pSeries_mpic_node = of_node_get(np); ppc_md.init_IRQ = pseries_mpic_init_IRQ; -- cgit v0.10.2 From 201b3fe586361c0241105f6e3de2c4749010f897 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Thu, 28 Apr 2016 15:34:54 +1000 Subject: drivers/of: Add check for null property in of_remove_property() The validity of the property input argument to of_remove_property() is never checked within the function and thus it is possible to pass a null value. It happens that this will be picked up in __of_remove_property() as no matching property of the device node will be found and thus an error will be returned, however once again there is no explicit check for a null value. By the time this is detected 2 locks have already been acquired which is completely unnecessary if the property to remove is null. Add an explicit check in the function of_remove_property() for a null property value and return -ENODEV in this case, this is consistent with what the previous return value would have been when the null value was not detected and passed to __of_remove_property(). By moving an explicit check for the property paramenter into the of_remove_property() function, this will remove the need to perform this check in calling code before invocation of the of_remove_property() function. Signed-off-by: Suraj Jitindar Singh Acked-by: Rob Herring Signed-off-by: Michael Ellerman diff --git a/drivers/of/base.c b/drivers/of/base.c index b299de2..64018eb 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -1777,6 +1777,9 @@ int of_remove_property(struct device_node *np, struct property *prop) unsigned long flags; int rc; + if (!prop) + return -ENODEV; + mutex_lock(&of_mutex); raw_spin_lock_irqsave(&devtree_lock, flags); -- cgit v0.10.2 From 925e2d1ded80fcc92210c9a9e5009887553dd83f Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Thu, 28 Apr 2016 15:34:55 +1000 Subject: powerpc: Update of_remove_property() call sites to remove null checking After obtaining a property from of_find_property() and before calling of_remove_property() most code checks to ensure that the property returned from of_find_property() is not null. The previous patch moved this check to the start of the function of_remove_property() in order to avoid the case where this check isn't done and a null value is passed. This ensures the check is always conducted before taking locks and attempting to remove the property. Thus it is no longer necessary to perform a check for null values before invoking of_remove_property(). Update of_remove_property() call sites in order to remove redundant checking for null property value as check is now performed within the of_remove_property function(). Signed-off-by: Suraj Jitindar Singh [mpe: Unbreak some lines which are just >80 chars for readability] Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c index 015ae55..2694d07 100644 --- a/arch/powerpc/kernel/machine_kexec.c +++ b/arch/powerpc/kernel/machine_kexec.c @@ -228,17 +228,12 @@ static struct property memory_limit_prop = { static void __init export_crashk_values(struct device_node *node) { - struct property *prop; - /* There might be existing crash kernel properties, but we can't * be sure what's in them, so remove them. */ - prop = of_find_property(node, "linux,crashkernel-base", NULL); - if (prop) - of_remove_property(node, prop); - - prop = of_find_property(node, "linux,crashkernel-size", NULL); - if (prop) - of_remove_property(node, prop); + of_remove_property(node, of_find_property(node, + "linux,crashkernel-base", NULL)); + of_remove_property(node, of_find_property(node, + "linux,crashkernel-size", NULL)); if (crashk_res.start != 0) { crashk_base = cpu_to_be_ulong(crashk_res.start), @@ -258,16 +253,13 @@ static void __init export_crashk_values(struct device_node *node) static int __init kexec_setup(void) { struct device_node *node; - struct property *prop; node = of_find_node_by_path("/chosen"); if (!node) return -ENOENT; /* remove any stale properties so ours can be found */ - prop = of_find_property(node, kernel_end_prop.name, NULL); - if (prop) - of_remove_property(node, prop); + of_remove_property(node, of_find_property(node, kernel_end_prop.name, NULL)); /* information needed by userspace when using default_machine_kexec */ kernel_end = cpu_to_be_ulong(__pa(_end)); diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 1da864c..b8c202d 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -403,7 +403,6 @@ static struct property htab_size_prop = { static int __init export_htab_values(void) { struct device_node *node; - struct property *prop; /* On machines with no htab htab_address is NULL */ if (!htab_address) @@ -414,12 +413,8 @@ static int __init export_htab_values(void) return -ENODEV; /* remove any stale propertys so ours can be found */ - prop = of_find_property(node, htab_base_prop.name, NULL); - if (prop) - of_remove_property(node, prop); - prop = of_find_property(node, htab_size_prop.name, NULL); - if (prop) - of_remove_property(node, prop); + of_remove_property(node, of_find_property(node, htab_base_prop.name, NULL)); + of_remove_property(node, of_find_property(node, htab_size_prop.name, NULL)); htab_base = cpu_to_be64(__pa(htab_address)); of_add_property(node, &htab_base_prop); diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index ceb18d3..a560a98 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -191,8 +191,8 @@ static int update_dt_node(__be32 phandle, s32 scope) break; case 0x80000000: - prop = of_find_property(dn, prop_name, NULL); - of_remove_property(dn, prop); + of_remove_property(dn, of_find_property(dn, + prop_name, NULL)); prop = NULL; break; diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c index 7c7fcc0..cc66c49 100644 --- a/arch/powerpc/platforms/pseries/reconfig.c +++ b/arch/powerpc/platforms/pseries/reconfig.c @@ -303,7 +303,6 @@ static int do_remove_property(char *buf, size_t bufsize) { struct device_node *np; char *tmp; - struct property *prop; buf = parse_node(buf, bufsize, &np); if (!np) @@ -316,9 +315,7 @@ static int do_remove_property(char *buf, size_t bufsize) if (strlen(buf) == 0) return -EINVAL; - prop = of_find_property(np, buf, NULL); - - return of_remove_property(np, prop); + return of_remove_property(np, of_find_property(np, buf, NULL)); } static int do_update_property(char *buf, size_t bufsize) -- cgit v0.10.2 From 8bbc9b7b001eaab8abf7e9e24edf1bb285c8d825 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 6 May 2016 16:46:00 +1000 Subject: powerpc/mm/hash64: Factor out hash preload psize check Currently we have a check in hash_preload() against the psize, which is only included when CONFIG_PPC_MM_SLICES is enabled. We want to expand this check in a subsequent patch, so factor it out to allow that. As a bonus it removes the #ifdef in the C code. Unfortunately we can't put this in the existing CONFIG_PPC_MM_SLICES block because it would require a forward declaration. Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 262082e..fbe747e 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1322,6 +1322,22 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, return hash_page_mm(mm, ea, access, trap, flags); } +#ifdef CONFIG_PPC_MM_SLICES +static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) +{ + /* We only prefault standard pages for now */ + if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize)) + return false; + + return true; +} +#else +static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) +{ + return true; +} +#endif + void hash_preload(struct mm_struct *mm, unsigned long ea, unsigned long access, unsigned long trap) { @@ -1334,11 +1350,8 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, BUG_ON(REGION_ID(ea) != USER_REGION_ID); -#ifdef CONFIG_PPC_MM_SLICES - /* We only prefault standard pages for now */ - if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize)) + if (!should_hash_preload(mm, ea)) return; -#endif DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx," " trap=%lx\n", mm, mm->pgd, ea, access, trap); -- cgit v0.10.2 From aac55d7573c5d46ed9a62818d5d3e69dd2060105 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 6 May 2016 16:47:12 +1000 Subject: powerpc/mm/hash64: Fix subpage protection with 4K HPTE config With Linux page size of 64K and hardware only supporting 4K HPTE, if we use subpage protection, we always fail for the subpage 0 as shown below (using the selftest subpage_prot test): 520175565: (4520111850): Failed at 0x3fffad4b0000 (p=13,sp=0,w=0), want=fault, got=pass ! 4520890210: (4520826495): Failed at 0x3fffad5b0000 (p=29,sp=0,w=0), want=fault, got=pass ! 4521574251: (4521510536): Failed at 0x3fffad6b0000 (p=45,sp=0,w=0), want=fault, got=pass ! 4522258324: (4522194609): Failed at 0x3fffad7b0000 (p=61,sp=0,w=0), want=fault, got=pass ! This is because hash preload wrongly inserts the HPTE entry for subpage 0 without looking at the subpage protection information. Fix it by teaching should_hash_preload() not to preload if we have subpage protection configured for that range. It appears this has been broken since it was introduced in 2008. Fixes: fa28237cfcc5 ("[POWERPC] Provide a way to protect 4k subpages when using 64k pages") Signed-off-by: Aneesh Kumar K.V [mpe: Rework into should_hash_preload() to avoid build fails w/SLICES=n] Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index fbe747e..5926896 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1325,8 +1325,16 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, #ifdef CONFIG_PPC_MM_SLICES static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) { + int psize = get_slice_psize(mm, ea); + /* We only prefault standard pages for now */ - if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize)) + if (unlikely(psize != mm->context.user_psize)) + return false; + + /* + * Don't prefault if subpage protection is enabled for the EA. + */ + if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea))) return false; return true; -- cgit v0.10.2 From 17ed7c38427ff84ef77eac15724b6faf35e5492e Mon Sep 17 00:00:00 2001 From: Chandan Kumar Date: Thu, 28 Apr 2016 15:01:08 +0530 Subject: powerpc: Add HAVE_PERF_USER_STACK_DUMP support With perf regs support enabled for powerpc, in commit ed4a4ef85cf5 ("powerpc/perf: Add support for sampling interrupt register state"), the support for obtaining perf user stack dump is already enabled. This patch declares the support for same and also updates documentation to mark the support for perf-regs and perf-stackdump. Signed-off-by: Chandan Kumar Signed-off-by: Michael Ellerman diff --git a/Documentation/features/perf/perf-regs/arch-support.txt b/Documentation/features/perf/perf-regs/arch-support.txt index e2b4a78..f179b1f 100644 --- a/Documentation/features/perf/perf-regs/arch-support.txt +++ b/Documentation/features/perf/perf-regs/arch-support.txt @@ -27,7 +27,7 @@ | nios2: | TODO | | openrisc: | TODO | | parisc: | TODO | - | powerpc: | TODO | + | powerpc: | ok | | s390: | TODO | | score: | TODO | | sh: | TODO | diff --git a/Documentation/features/perf/perf-stackdump/arch-support.txt b/Documentation/features/perf/perf-stackdump/arch-support.txt index 3dc24b0..85777c5 100644 --- a/Documentation/features/perf/perf-stackdump/arch-support.txt +++ b/Documentation/features/perf/perf-stackdump/arch-support.txt @@ -27,7 +27,7 @@ | nios2: | TODO | | openrisc: | TODO | | parisc: | TODO | - | powerpc: | TODO | + | powerpc: | ok | | s390: | TODO | | score: | TODO | | sh: | TODO | diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b14966d..a630ac2 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -117,6 +117,7 @@ config PPC select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select HAVE_PERF_EVENTS select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64 select ARCH_WANT_IPC_PARSE_VERSION -- cgit v0.10.2 From c4522469e6418fb547fbcd9d3aeb73bea0f6f859 Mon Sep 17 00:00:00 2001 From: Chandan Kumar Date: Thu, 28 Apr 2016 15:01:09 +0530 Subject: perf/powerpc: Add support for unwinding perf-stackdump Adds support for unwinding user stack dump by linking with libunwind. Signed-off-by: Chandan Kumar Reviewed-by: Naveen N. Rao Tested-by: Naveen N. Rao Acked-by: Arnaldo Carvalho de Melo Signed-off-by: Michael Ellerman diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build index 9ee3350..90ad64b 100644 --- a/tools/perf/arch/powerpc/util/Build +++ b/tools/perf/arch/powerpc/util/Build @@ -5,3 +5,4 @@ libperf-y += perf_regs.o libperf-$(CONFIG_DWARF) += dwarf-regs.o libperf-$(CONFIG_DWARF) += skip-callchain-idx.o +libperf-$(CONFIG_LIBUNWIND) += unwind-libunwind.o diff --git a/tools/perf/arch/powerpc/util/unwind-libunwind.c b/tools/perf/arch/powerpc/util/unwind-libunwind.c new file mode 100644 index 0000000..9e15f92 --- /dev/null +++ b/tools/perf/arch/powerpc/util/unwind-libunwind.c @@ -0,0 +1,96 @@ +/* + * Copyright 2016 Chandan Kumar, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include "../../util/unwind.h" +#include "../../util/debug.h" + +int libunwind__arch_reg_id(int regnum) +{ + switch (regnum) { + case UNW_PPC64_R0: + return PERF_REG_POWERPC_R0; + case UNW_PPC64_R1: + return PERF_REG_POWERPC_R1; + case UNW_PPC64_R2: + return PERF_REG_POWERPC_R2; + case UNW_PPC64_R3: + return PERF_REG_POWERPC_R3; + case UNW_PPC64_R4: + return PERF_REG_POWERPC_R4; + case UNW_PPC64_R5: + return PERF_REG_POWERPC_R5; + case UNW_PPC64_R6: + return PERF_REG_POWERPC_R6; + case UNW_PPC64_R7: + return PERF_REG_POWERPC_R7; + case UNW_PPC64_R8: + return PERF_REG_POWERPC_R8; + case UNW_PPC64_R9: + return PERF_REG_POWERPC_R9; + case UNW_PPC64_R10: + return PERF_REG_POWERPC_R10; + case UNW_PPC64_R11: + return PERF_REG_POWERPC_R11; + case UNW_PPC64_R12: + return PERF_REG_POWERPC_R12; + case UNW_PPC64_R13: + return PERF_REG_POWERPC_R13; + case UNW_PPC64_R14: + return PERF_REG_POWERPC_R14; + case UNW_PPC64_R15: + return PERF_REG_POWERPC_R15; + case UNW_PPC64_R16: + return PERF_REG_POWERPC_R16; + case UNW_PPC64_R17: + return PERF_REG_POWERPC_R17; + case UNW_PPC64_R18: + return PERF_REG_POWERPC_R18; + case UNW_PPC64_R19: + return PERF_REG_POWERPC_R19; + case UNW_PPC64_R20: + return PERF_REG_POWERPC_R20; + case UNW_PPC64_R21: + return PERF_REG_POWERPC_R21; + case UNW_PPC64_R22: + return PERF_REG_POWERPC_R22; + case UNW_PPC64_R23: + return PERF_REG_POWERPC_R23; + case UNW_PPC64_R24: + return PERF_REG_POWERPC_R24; + case UNW_PPC64_R25: + return PERF_REG_POWERPC_R25; + case UNW_PPC64_R26: + return PERF_REG_POWERPC_R26; + case UNW_PPC64_R27: + return PERF_REG_POWERPC_R27; + case UNW_PPC64_R28: + return PERF_REG_POWERPC_R28; + case UNW_PPC64_R29: + return PERF_REG_POWERPC_R29; + case UNW_PPC64_R30: + return PERF_REG_POWERPC_R30; + case UNW_PPC64_R31: + return PERF_REG_POWERPC_R31; + case UNW_PPC64_LR: + return PERF_REG_POWERPC_LINK; + case UNW_PPC64_CTR: + return PERF_REG_POWERPC_CTR; + case UNW_PPC64_XER: + return PERF_REG_POWERPC_XER; + case UNW_PPC64_NIP: + return PERF_REG_POWERPC_NIP; + default: + pr_err("unwind: invalid reg id %d\n", regnum); + return -EINVAL; + } + return -EINVAL; +} diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index eda1cfb..220536c 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -26,6 +26,7 @@ NO_PERF_REGS := 1 # Additional ARCH settings for ppc ifeq ($(ARCH),powerpc) NO_PERF_REGS := 0 + LIBUNWIND_LIBS := -lunwind -lunwind-ppc64 endif # Additional ARCH settings for x86 -- cgit v0.10.2 From f47822078dece7189cad0a5f472f148e5e916736 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 28 Apr 2016 15:01:10 +0530 Subject: perf tools: Fix perf regs mask generation On some architectures (powerpc in particular), the number of registers exceeds what can be represented in an integer bitmask. Ensure we generate the proper bitmask on such platforms. Fixes: 71ad0f5e4 ("perf tools: Support for DWARF CFI unwinding on post processing") Signed-off-by: Naveen N. Rao Acked-by: Arnaldo Carvalho de Melo Signed-off-by: Michael Ellerman diff --git a/tools/perf/util/perf_regs.c b/tools/perf/util/perf_regs.c index 6b8eb13..c4023f2 100644 --- a/tools/perf/util/perf_regs.c +++ b/tools/perf/util/perf_regs.c @@ -12,18 +12,18 @@ int perf_reg_value(u64 *valp, struct regs_dump *regs, int id) int i, idx = 0; u64 mask = regs->mask; - if (regs->cache_mask & (1 << id)) + if (regs->cache_mask & (1ULL << id)) goto out; - if (!(mask & (1 << id))) + if (!(mask & (1ULL << id))) return -EINVAL; for (i = 0; i < id; i++) { - if (mask & (1 << i)) + if (mask & (1ULL << i)) idx++; } - regs->cache_mask |= (1 << id); + regs->cache_mask |= (1ULL << id); regs->cache_regs[id] = regs->regs[idx]; out: -- cgit v0.10.2 From 31cdd0c39c7544ced79da53aa0b7e989f3a39582 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 13 Apr 2016 21:31:24 +1000 Subject: powerpc/xmon: Fix SPR read/write commands and add command to dump SPRs xmon has commands for reading and writing SPRs, but they don't work currently for several reasons. They attempt to synthesize a small function containing an mfspr or mtspr instruction and call it. However, the instructions are on the stack, which is usually not executable. Also, for 64-bit we set up a procedure descriptor, which is fine for the big-endian ABIv1, but not correct for ABIv2. Finally, the code uses the infrastructure for catching memory errors, but that only catches data storage interrupts and machine check interrupts, but a failed mfspr/mtspr can generate a program interrupt or a hypervisor emulation assist interrupt, or be a no-op. Instead of trying to synthesize a function on the fly, this adds two new functions, xmon_mfspr() and xmon_mtspr(), which take an SPR number as an argument and read or write the SPR. Because there is no Power ISA instruction which takes an SPR number in a register, we have to generate one of each possible mfspr and mtspr instruction, for all 1024 possible SPRs. Thus we get just over 8k bytes of code for each of xmon_mfspr() and xmon_mtspr(). However, this 16kB of code pales in comparison to the > 130kB of PPC opcode tables used by the xmon disassembler. To catch interrupts caused by the mfspr/mtspr instructions, we add a new 'catch_spr_faults' flag. If an interrupt occurs while it is set, we come back into xmon() via program_check_interrupt(), _exception() and die(), see that catch_spr_faults is set and do a longjmp to bus_error_jmp, back into read_spr() or write_spr(). This adds a couple of other nice features: first, a "Sa" command that attempts to read and print out the value of all 1024 SPRs. If any mfspr instruction acts as a no-op, then the SPR is not implemented and not printed. Secondly, the Sr and Sw commands detect when an SPR is not implemented (i.e. mfspr is a no-op) and print a message to that effect rather than printing a bogus value. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 436062d..0b2f771 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -7,7 +7,7 @@ UBSAN_SANITIZE := n ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) -obj-y += xmon.o nonstdio.o +obj-y += xmon.o nonstdio.o spr_access.o ifdef CONFIG_XMON_DISASSEMBLY obj-y += ppc-dis.o ppc-opc.o diff --git a/arch/powerpc/xmon/spr_access.S b/arch/powerpc/xmon/spr_access.S new file mode 100644 index 0000000..84ad742 --- /dev/null +++ b/arch/powerpc/xmon/spr_access.S @@ -0,0 +1,45 @@ +#include + +/* unsigned long xmon_mfspr(sprn, default_value) */ +_GLOBAL(xmon_mfspr) + ld r5, .Lmfspr_table@got(r2) + b xmon_mxspr + +/* void xmon_mtspr(sprn, new_value) */ +_GLOBAL(xmon_mtspr) + ld r5, .Lmtspr_table@got(r2) + b xmon_mxspr + +/* + * r3 = sprn + * r4 = default or new value + * r5 = table base + */ +xmon_mxspr: + /* + * To index into the table of mxsprs we need: + * i = (sprn & 0x3ff) * 8 + * or using rwlinm: + * i = (sprn << 3) & (0x3ff << 3) + */ + rlwinm r3, r3, 3, 0x3ff << 3 + add r5, r5, r3 + mtctr r5 + mr r3, r4 /* put default_value in r3 for mfspr */ + bctr + +.Lmfspr_table: + spr = 0 + .rept 1024 + mfspr r3, spr + blr + spr = spr + 1 + .endr + +.Lmtspr_table: + spr = 0 + .rept 1024 + mtspr spr, r4 + blr + spr = spr + 1 + .endr diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 308283b..c5e1551 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -86,6 +86,7 @@ static char tmpstr[128]; static long bus_error_jmp[JMP_BUF_LEN]; static int catch_memory_errors; +static int catch_spr_faults; static long *xmon_fault_jmp[NR_CPUS]; /* Breakpoint stuff */ @@ -147,7 +148,7 @@ void getstring(char *, int); static void flush_input(void); static int inchar(void); static void take_input(char *); -static unsigned long read_spr(int); +static int read_spr(int, unsigned long *); static void write_spr(int, unsigned long); static void super_regs(void); static void remove_bpts(void); @@ -250,6 +251,9 @@ Commands:\n\ sdi # disassemble spu local store for spu # (in hex)\n" #endif " S print special registers\n\ + Sa print all SPRs\n\ + Sr # read SPR #\n\ + Sw #v write v to SPR #\n\ t print backtrace\n\ x exit monitor and recover\n\ X exit monitor and don't recover\n" @@ -442,6 +446,12 @@ static int xmon_core(struct pt_regs *regs, int fromipi) #ifdef CONFIG_SMP cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, &cpus_in_xmon)) { + /* + * We catch SPR read/write faults here because the 0x700, 0xf60 + * etc. handlers don't call debugger_fault_handler(). + */ + if (catch_spr_faults) + longjmp(bus_error_jmp, 1); get_output_lock(); excprint(regs); printf("cpu 0x%x: Exception %lx %s in xmon, " @@ -1635,89 +1645,87 @@ static void cacheflush(void) catch_memory_errors = 0; } -static unsigned long -read_spr(int n) +extern unsigned long xmon_mfspr(int spr, unsigned long default_value); +extern void xmon_mtspr(int spr, unsigned long value); + +static int +read_spr(int n, unsigned long *vp) { - unsigned int instrs[2]; - unsigned long (*code)(void); unsigned long ret = -1UL; -#ifdef CONFIG_PPC64 - unsigned long opd[3]; - - opd[0] = (unsigned long)instrs; - opd[1] = 0; - opd[2] = 0; - code = (unsigned long (*)(void)) opd; -#else - code = (unsigned long (*)(void)) instrs; -#endif - - /* mfspr r3,n; blr */ - instrs[0] = 0x7c6002a6 + ((n & 0x1F) << 16) + ((n & 0x3e0) << 6); - instrs[1] = 0x4e800020; - store_inst(instrs); - store_inst(instrs+1); + int ok = 0; if (setjmp(bus_error_jmp) == 0) { - catch_memory_errors = 1; + catch_spr_faults = 1; sync(); - ret = code(); + ret = xmon_mfspr(n, *vp); sync(); - /* wait a little while to see if we get a machine check */ - __delay(200); - n = size; + *vp = ret; + ok = 1; } + catch_spr_faults = 0; - return ret; + return ok; } static void write_spr(int n, unsigned long val) { - unsigned int instrs[2]; - unsigned long (*code)(unsigned long); -#ifdef CONFIG_PPC64 - unsigned long opd[3]; - - opd[0] = (unsigned long)instrs; - opd[1] = 0; - opd[2] = 0; - code = (unsigned long (*)(unsigned long)) opd; -#else - code = (unsigned long (*)(unsigned long)) instrs; -#endif - - instrs[0] = 0x7c6003a6 + ((n & 0x1F) << 16) + ((n & 0x3e0) << 6); - instrs[1] = 0x4e800020; - store_inst(instrs); - store_inst(instrs+1); - if (setjmp(bus_error_jmp) == 0) { - catch_memory_errors = 1; + catch_spr_faults = 1; sync(); - code(val); + xmon_mtspr(n, val); sync(); - /* wait a little while to see if we get a machine check */ - __delay(200); - n = size; + } else { + printf("SPR 0x%03x (%4d) Faulted during write\n", n, n); } + catch_spr_faults = 0; } static unsigned long regno; extern char exc_prolog; extern char dec_exc; +static void dump_one_spr(int spr, bool show_unimplemented) +{ + unsigned long val; + + val = 0xdeadbeef; + if (!read_spr(spr, &val)) { + printf("SPR 0x%03x (%4d) Faulted during read\n", spr, spr); + return; + } + + if (val == 0xdeadbeef) { + /* Looks like read was a nop, confirm */ + val = 0x0badcafe; + if (!read_spr(spr, &val)) { + printf("SPR 0x%03x (%4d) Faulted during read\n", spr, spr); + return; + } + + if (val == 0x0badcafe) { + if (show_unimplemented) + printf("SPR 0x%03x (%4d) Unimplemented\n", spr, spr); + return; + } + } + + printf("SPR 0x%03x (%4d) = 0x%lx\n", spr, spr, val); +} + static void super_regs(void) { int cmd; - unsigned long val; + int spr; cmd = skipbl(); - if (cmd == '\n') { + + switch (cmd) { + case '\n': { unsigned long sp, toc; asm("mr %0,1" : "=r" (sp) :); asm("mr %0,2" : "=r" (toc) :); @@ -1730,21 +1738,29 @@ static void super_regs(void) mfspr(SPRN_DEC), mfspr(SPRN_SPRG2)); printf("sp = "REG" sprg3= "REG"\n", sp, mfspr(SPRN_SPRG3)); printf("toc = "REG" dar = "REG"\n", toc, mfspr(SPRN_DAR)); - return; } - - scanhex(®no); - switch (cmd) { - case 'w': - val = read_spr(regno); + case 'w': { + unsigned long val; + scanhex(®no); + val = 0; + read_spr(regno, &val); scanhex(&val); write_spr(regno, val); - /* fall through */ + dump_one_spr(regno, true); + break; + } case 'r': - printf("spr %lx = %lx\n", regno, read_spr(regno)); + scanhex(®no); + dump_one_spr(regno, true); + break; + case 'a': + /* dump ALL SPRs */ + for (spr = 1; spr < 1024; ++spr) + dump_one_spr(spr, false); break; } + scannl(); } -- cgit v0.10.2 From dd21731022faf43c1250050e5d28d11add599149 Mon Sep 17 00:00:00 2001 From: Lennart Sorensen Date: Thu, 5 May 2016 16:44:44 -0400 Subject: powerpc/sstep: Fix sstep.c compile on powerpcspe Commit be96f63375a1 ("powerpc: Split out instruction analysis part of emulate_step()") introduced ldarx and stdcx into the instructions in sstep.c, which are not accepted by the assembler on powerpcspe, but does seem to be accepted by the normal powerpc assembler even in 32 bit mode. Wrap these two instructions in a __powerpc64__ check like it is everywhere else in the file. Fixes: be96f63375a1 ("powerpc: Split out instruction analysis part of emulate_step()") Signed-off-by: Len Sorensen Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index dc885b3..6d34310 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -1818,9 +1818,11 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) case 4: __get_user_asmx(val, op.ea, err, "lwarx"); break; +#ifdef __powerpc64__ case 8: __get_user_asmx(val, op.ea, err, "ldarx"); break; +#endif default: return 0; } @@ -1841,9 +1843,11 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) case 4: __put_user_asmx(op.val, op.ea, err, "stwcx.", cr); break; +#ifdef __powerpc64__ case 8: __put_user_asmx(op.val, op.ea, err, "stdcx.", cr); break; +#endif default: return 0; } -- cgit v0.10.2 From 66707836062db62756b395d1fb16ab02ff162862 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Tue, 16 Feb 2016 17:31:53 +1100 Subject: powerpc/sstep: Fix emulation fall-through There is a switch fallthough in instr_analyze() which can cause an invalid instruction to be emulated as a different, valid, instruction. The rld* (opcode 30) case extracts a sub-opcode from bits 3:1 of the instruction word. However, the only valid values of this field are 001 and 000. These cases are correctly handled, but the others are not which causes execution to fall through into case 31. Breaking out of the switch causes the instruction to be marked as unknown and allows the caller to deal with the invalid instruction in a manner consistent with other invalid instructions. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 6d34310..3362299 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -925,6 +925,7 @@ int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs, } } #endif + break; /* illegal instruction */ case 31: switch ((instr >> 1) & 0x3ff) { -- cgit v0.10.2 From dd0b52c47a2d71b7960c9a5ce7d51f17ed03d9f5 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Thu, 5 May 2016 17:54:08 +1000 Subject: powerpc/mm: define TOP_ZONE as a constant The zone that contains the top of memory will be either ZONE_NORMAL or ZONE_HIGHMEM depending on the kernel config. There are two functions that require this information and both of them use an #ifdef to set a local variable (top_zone). This is a little silly so lets just make it a constant. Signed-off-by: Oliver O'Halloran Cc: linux-mm@kvack.org Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 8967788..879e0bc 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -68,12 +68,15 @@ pte_t *kmap_pte; EXPORT_SYMBOL(kmap_pte); pgprot_t kmap_prot; EXPORT_SYMBOL(kmap_prot); +#define TOP_ZONE ZONE_HIGHMEM static inline pte_t *virt_to_kpte(unsigned long vaddr) { return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), vaddr), vaddr); } +#else +#define TOP_ZONE ZONE_NORMAL #endif int page_is_ram(unsigned long pfn) @@ -267,14 +270,9 @@ void __init limit_zone_pfn(enum zone_type zone, unsigned long pfn_limit) */ int dma_pfn_limit_to_zone(u64 pfn_limit) { - enum zone_type top_zone = ZONE_NORMAL; int i; -#ifdef CONFIG_HIGHMEM - top_zone = ZONE_HIGHMEM; -#endif - - for (i = top_zone; i >= 0; i--) { + for (i = TOP_ZONE; i >= 0; i--) { if (max_zone_pfns[i] <= pfn_limit) return i; } @@ -289,7 +287,6 @@ void __init paging_init(void) { unsigned long long total_ram = memblock_phys_mem_size(); phys_addr_t top_of_ram = memblock_end_of_DRAM(); - enum zone_type top_zone; #ifdef CONFIG_PPC32 unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1); @@ -313,13 +310,9 @@ void __init paging_init(void) (long int)((top_of_ram - total_ram) >> 20)); #ifdef CONFIG_HIGHMEM - top_zone = ZONE_HIGHMEM; limit_zone_pfn(ZONE_NORMAL, lowmem_end_addr >> PAGE_SHIFT); -#else - top_zone = ZONE_NORMAL; #endif - - limit_zone_pfn(top_zone, top_of_ram >> PAGE_SHIFT); + limit_zone_pfn(TOP_ZONE, top_of_ram >> PAGE_SHIFT); zone_limits_final = true; free_area_init_nodes(max_zone_pfns); -- cgit v0.10.2 From 171cb719dacd0f102b39bd5ab5eea0ce11813853 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 26 Feb 2016 11:26:26 +1100 Subject: powerpc/mm: Improve readability of update_mmu_cache() The function is used to update the MMU with software PTE. It can be called by data access exception handler (0x300) or instruction access exception handler (0x400). If the function is called by 0x400 handler, the local variable @access is set to _PAGE_EXEC to indicate the software PTE should have that flag set. When the function is called by 0x300 handler, @access is set to zero. This improves the readability of the function by replacing if statements with switch. No logical changes introduced. Signed-off-by: Gavin Shan Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 879e0bc..2fd57fa 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -491,7 +491,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, * We don't need to worry about _PAGE_PRESENT here because we are * called with either mm->page_table_lock held or ptl lock held */ - unsigned long access = 0, trap; + unsigned long access, trap; + if (radix_enabled()) return; @@ -506,13 +507,19 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, * * We also avoid filling the hash if not coming from a fault */ - if (current->thread.regs == NULL) - return; - trap = TRAP(current->thread.regs); - if (trap == 0x400) - access |= _PAGE_EXEC; - else if (trap != 0x300) + + trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL; + switch (trap) { + case 0x300: + access = 0UL; + break; + case 0x400: + access = _PAGE_EXEC; + break; + default: return; + } + hash_preload(vma->vm_mm, address, access, trap); #endif /* CONFIG_PPC_STD_MMU */ #if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \ -- cgit v0.10.2 From 895a79805c287df73142f1b424b22ea5190734c2 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Wed, 4 May 2016 14:46:30 +1000 Subject: cxl: Handle num_of_processes larger than can fit in the SPA num_of_process is a 16 bit field, theoretically allowing an AFU to support 16K processes, however the scheduled process area currently has a maximum size of 1MB, which limits the maximum number of processes to 7704. Some AFUs may not necessarily care what the limit is and just want to be able to use the maximum by setting the field to 16K. To allow these to work, detect this situation and use the maximum size for the SPA. Downgrade the WARN_ON to a dev_warn. Signed-off-by: Ian Munsie Signed-off-by: Michael Ellerman diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c index 387fcbd..b8b547a 100644 --- a/drivers/misc/cxl/native.c +++ b/drivers/misc/cxl/native.c @@ -185,16 +185,25 @@ static int spa_max_procs(int spa_size) int cxl_alloc_spa(struct cxl_afu *afu) { + unsigned spa_size; + /* Work out how many pages to allocate */ afu->native->spa_order = 0; do { afu->native->spa_order++; - afu->native->spa_size = (1 << afu->native->spa_order) * PAGE_SIZE; + spa_size = (1 << afu->native->spa_order) * PAGE_SIZE; + + if (spa_size > 0x100000) { + dev_warn(&afu->dev, "num_of_processes too large for the SPA, limiting to %i (0x%x)\n", + afu->native->spa_max_procs, afu->native->spa_size); + afu->num_procs = afu->native->spa_max_procs; + break; + } + + afu->native->spa_size = spa_size; afu->native->spa_max_procs = spa_max_procs(afu->native->spa_size); } while (afu->native->spa_max_procs < afu->num_procs); - WARN_ON(afu->native->spa_size > 0x100000); /* Max size supported by the hardware */ - if (!(afu->native->spa = (struct cxl_process_element *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, afu->native->spa_order))) { pr_err("cxl_alloc_spa: Unable to allocate scheduled process area\n"); -- cgit v0.10.2 From 0e5b5ba17ac33a05d9f4a48b5eb8b5e30f2274d7 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Wed, 4 May 2016 14:48:32 +1000 Subject: cxl: Remove duplicate #defines These defines are not used, but other equivalent definitions (CXL_SPA_SW_CMD_*) are used. Remove the unused defines. Signed-off-by: Ian Munsie Reviewed-by: Andrew Donnellan Signed-off-by: Michael Ellerman diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index dfdbfb0..2823fb3 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -178,15 +178,6 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0}; #define CXL_PSL_SR_An_MP (1ull << (63-62)) /* Master Process */ #define CXL_PSL_SR_An_LE (1ull << (63-63)) /* Little Endian */ -/****** CXL_PSL_LLCMD_An ****************************************************/ -#define CXL_LLCMD_TERMINATE 0x0001000000000000ULL -#define CXL_LLCMD_REMOVE 0x0002000000000000ULL -#define CXL_LLCMD_SUSPEND 0x0003000000000000ULL -#define CXL_LLCMD_RESUME 0x0004000000000000ULL -#define CXL_LLCMD_ADD 0x0005000000000000ULL -#define CXL_LLCMD_UPDATE 0x0006000000000000ULL -#define CXL_LLCMD_HANDLE_MASK 0x000000000000ffffULL - /****** CXL_PSL_ID_An ****************************************************/ #define CXL_PSL_ID_An_F (1ull << (63-31)) #define CXL_PSL_ID_An_L (1ull << (63-30)) -- cgit v0.10.2 From 3c206fa77aaaac8cd7d4cfcd840c82495b01b288 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Wed, 4 May 2016 14:52:58 +1000 Subject: cxl: Ensure PSL interrupt is configured for contexts with no AFU IRQs In the cxl kernel API, it is possible to create a context and start it without allocating any interrupts. Since we assign or allocate the PSL interrupt when allocating AFU interrupts this will lead to a situation where we start the context with no means to take any faults. The user API is not affected as it always goes through the cxl interrupt allocation code paths and will have the PSL interrupt allocated or assigned, even if no AFU interrupts were requested. This checks that at least one interrupt is configured at the time of attach, and if not it will assign the multiplexed PSL interrupt for powernv, or allocate a single interrupt for PowerVM. Signed-off-by: Ian Munsie Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c index a83acf9..769971c 100644 --- a/drivers/misc/cxl/guest.c +++ b/drivers/misc/cxl/guest.c @@ -552,6 +552,17 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr) elem->common.sstp0 = cpu_to_be64(ctx->sstp0); elem->common.sstp1 = cpu_to_be64(ctx->sstp1); + + /* + * Ensure we have at least one interrupt allocated to take faults for + * kernel contexts that may not have allocated any AFU IRQs at all: + */ + if (ctx->irqs.range[0] == 0) { + rc = afu_register_irqs(ctx, 0); + if (rc) + goto out_free; + } + for (r = 0; r < CXL_IRQ_RANGES; r++) { for (i = 0; i < ctx->irqs.range[r]; i++) { if (r == 0 && i == 0) { @@ -597,6 +608,7 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr) enable_afu_irqs(ctx); } +out_free: free_page((u64)elem); return rc; } diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c index b8b547a..5d4fb9f 100644 --- a/drivers/misc/cxl/native.c +++ b/drivers/misc/cxl/native.c @@ -534,6 +534,15 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr) ctx->elem->common.sstp0 = cpu_to_be64(ctx->sstp0); ctx->elem->common.sstp1 = cpu_to_be64(ctx->sstp1); + /* + * Ensure we have the multiplexed PSL interrupt set up to take faults + * for kernel contexts that may not have allocated any AFU IRQs at all: + */ + if (ctx->irqs.range[0] == 0) { + ctx->irqs.offset[0] = ctx->afu->native->psl_hwirq; + ctx->irqs.range[0] = 1; + } + for (r = 0; r < CXL_IRQ_RANGES; r++) { ctx->elem->ivte_offsets[r] = cpu_to_be16(ctx->irqs.offset[r]); ctx->elem->ivte_ranges[r] = cpu_to_be16(ctx->irqs.range[r]); -- cgit v0.10.2 From 7a0d85d313c2066712e530e668bc02bb741a685c Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Fri, 6 May 2016 17:46:36 +1000 Subject: cxl: Add kernel API to allow a context to operate with relocate disabled cxl devices typically access memory using an MMU in much the same way as the CPU, and each context includes a state register much like the MSR in the CPU. Like the CPU, the state register includes a bit to enable relocation, which we currently always enable. In some cases, it may be desirable to allow a device to access memory using real addresses instead of effective addresses, so this adds a new API, cxl_set_translation_mode, that can be used to disable relocation on a given kernel context. This can allow for the creation of a special privileged context that the device can use if it needs relocation disabled, and can use regular contexts at times when it needs relocation enabled. This interface is only available to users of the kernel API for obvious reasons, and will never be supported in a virtualised environment. This will be used by the upcoming cxl support in the mlx5 driver. Signed-off-by: Ian Munsie Signed-off-by: Michael Ellerman diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c index 8075823..6d228cc 100644 --- a/drivers/misc/cxl/api.c +++ b/drivers/misc/cxl/api.c @@ -183,6 +183,7 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed, ctx->pid = get_task_pid(task, PIDTYPE_PID); ctx->glpid = get_task_pid(task->group_leader, PIDTYPE_PID); kernel = false; + ctx->real_mode = false; } cxl_ctx_get(); @@ -219,6 +220,24 @@ void cxl_set_master(struct cxl_context *ctx) } EXPORT_SYMBOL_GPL(cxl_set_master); +int cxl_set_translation_mode(struct cxl_context *ctx, bool real_mode) +{ + if (ctx->status == STARTED) { + /* + * We could potentially update the PE and issue an update LLCMD + * to support this, but it doesn't seem to have a good use case + * since it's trivial to just create a second kernel context + * with different translation modes, so until someone convinces + * me otherwise: + */ + return -EBUSY; + } + + ctx->real_mode = real_mode; + return 0; +} +EXPORT_SYMBOL_GPL(cxl_set_translation_mode); + /* wrappers around afu_* file ops which are EXPORTED */ int cxl_fd_open(struct inode *inode, struct file *file) { diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index 2823fb3..9dd3a0e 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -514,6 +514,7 @@ struct cxl_context { bool pe_inserted; bool master; bool kernel; + bool real_mode; bool pending_irq; bool pending_fault; bool pending_afu_err; diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c index 769971c..c2815b9 100644 --- a/drivers/misc/cxl/guest.c +++ b/drivers/misc/cxl/guest.c @@ -617,6 +617,9 @@ static int guest_attach_process(struct cxl_context *ctx, bool kernel, u64 wed, u { pr_devel("in %s\n", __func__); + if (ctx->real_mode) + return -EPERM; + ctx->kernel = kernel; if (ctx->afu->current_mode == CXL_MODE_DIRECTED) return attach_afu_directed(ctx, wed, amr); diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c index 5d4fb9f..98f2cac 100644 --- a/drivers/misc/cxl/native.c +++ b/drivers/misc/cxl/native.c @@ -494,8 +494,9 @@ static u64 calculate_sr(struct cxl_context *ctx) if (mfspr(SPRN_LPCR) & LPCR_TC) sr |= CXL_PSL_SR_An_TC; if (ctx->kernel) { - sr |= CXL_PSL_SR_An_R | (mfmsr() & MSR_SF); - sr |= CXL_PSL_SR_An_HV; + if (!ctx->real_mode) + sr |= CXL_PSL_SR_An_R; + sr |= (mfmsr() & MSR_SF) | CXL_PSL_SR_An_HV; } else { sr |= CXL_PSL_SR_An_PR | CXL_PSL_SR_An_R; sr &= ~(CXL_PSL_SR_An_HV); diff --git a/include/misc/cxl.h b/include/misc/cxl.h index 7d5e261..56560c5 100644 --- a/include/misc/cxl.h +++ b/include/misc/cxl.h @@ -127,6 +127,14 @@ int cxl_afu_reset(struct cxl_context *ctx); void cxl_set_master(struct cxl_context *ctx); /* + * Sets the context to use real mode memory accesses to operate with + * translation disabled. Note that this only makes sense for kernel contexts + * under bare metal, and will not work with virtualisation. May only be + * performed on stopped contexts. + */ +int cxl_set_translation_mode(struct cxl_context *ctx, bool real_mode); + +/* * Map and unmap the AFU Problem Space area. The amount and location mapped * depends on if this context is a master or slave. */ -- cgit v0.10.2 From 266eab8f32cc43b688c2e9aaab63c2565a3998c2 Mon Sep 17 00:00:00 2001 From: Christophe Lombard Date: Fri, 22 Apr 2016 15:39:22 +0200 Subject: cxl: Check periodically the coherent platform function's state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the PowerVM environment, the PHYP CoherentAccel component manages the state of the Coherent Accelerator Processor Interface adapter and virtualizes CAPI resources, handles CAPP, PSL, PSL Slice errors - and interrupts - and provides a new set of hcalls for the OS APIs to utilize Accelerator Function Unit (AFU). During the course of operation, a coherent platform function can encounter errors. Some possible reason for errors are: • Hardware recoverable and unrecoverable errors • Transient and over-threshold correctable errors PHYP implements its own state model for the coherent platform function. The state of the AFU is available through a hcall. The current implementation of the cxl driver, for the PowerVM environment, checks this state of the AFU only when an action is requested - open a device, ioctl command, memory map, attach/detach a process - from an external driver - cxlflash, libcxl. If an error is detected the cxl driver handles the error according the content of the Power Architecture Platform Requirements document. But in case of low-level troubles (or error injection), the PHYP component may reset the card and change the AFU state. The PHYP interface doesn't provide any way to be notified when that happens thus implies that the cxl driver: • cannot handle immediatly the state change of the AFU. • cannot notify other drivers (cxlflash, ...) The purpose of this patch is to wake up the cpu periodically to check the current state of each AFU and to see if we need to enter an error recovery path. Signed-off-by: Christophe Lombard Acked-by: Ian Munsie Signed-off-by: Michael Ellerman diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index 9dd3a0e..d23a3a5 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -366,11 +366,13 @@ struct cxl_afu_native { }; struct cxl_afu_guest { + struct cxl_afu *parent; u64 handle; phys_addr_t p2n_phys; u64 p2n_size; int max_ints; - struct mutex recovery_lock; + bool handle_err; + struct delayed_work work_err; int previous_state; }; diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c index c2815b9..bc8d0b9 100644 --- a/drivers/misc/cxl/guest.c +++ b/drivers/misc/cxl/guest.c @@ -178,6 +178,9 @@ static int afu_read_error_state(struct cxl_afu *afu, int *state_out) u64 state; int rc = 0; + if (!afu) + return -EIO; + rc = cxl_h_read_error_state(afu->guest->handle, &state); if (!rc) { WARN_ON(state != H_STATE_NORMAL && @@ -833,7 +836,6 @@ static int afu_update_state(struct cxl_afu *afu) switch (cur_state) { case H_STATE_NORMAL: afu->guest->previous_state = cur_state; - rc = 1; break; case H_STATE_DISABLE: @@ -849,7 +851,6 @@ static int afu_update_state(struct cxl_afu *afu) pci_error_handlers(afu, CXL_SLOT_RESET_EVENT, pci_channel_io_normal); pci_error_handlers(afu, CXL_RESUME_EVENT, 0); - rc = 1; } afu->guest->previous_state = 0; break; @@ -874,39 +875,30 @@ static int afu_update_state(struct cxl_afu *afu) return rc; } -static int afu_do_recovery(struct cxl_afu *afu) +static void afu_handle_errstate(struct work_struct *work) { - int rc; + struct cxl_afu_guest *afu_guest = + container_of(to_delayed_work(work), struct cxl_afu_guest, work_err); - /* many threads can arrive here, in case of detach_all for example. - * Only one needs to drive the recovery - */ - if (mutex_trylock(&afu->guest->recovery_lock)) { - rc = afu_update_state(afu); - mutex_unlock(&afu->guest->recovery_lock); - return rc; - } - return 0; + if (!afu_update_state(afu_guest->parent) && + afu_guest->previous_state == H_STATE_PERM_UNAVAILABLE) + return; + + if (afu_guest->handle_err == true) + schedule_delayed_work(&afu_guest->work_err, + msecs_to_jiffies(3000)); } static bool guest_link_ok(struct cxl *cxl, struct cxl_afu *afu) { int state; - if (afu) { - if (afu_read_error_state(afu, &state) || - state != H_STATE_NORMAL) { - if (afu_do_recovery(afu) > 0) { - /* check again in case we've just fixed it */ - if (!afu_read_error_state(afu, &state) && - state == H_STATE_NORMAL) - return true; - } - return false; - } + if (afu && (!afu_read_error_state(afu, &state))) { + if (state == H_STATE_NORMAL) + return true; } - return true; + return false; } static int afu_properties_look_ok(struct cxl_afu *afu) @@ -944,8 +936,6 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n return -ENOMEM; } - mutex_init(&afu->guest->recovery_lock); - if ((rc = dev_set_name(&afu->dev, "afu%i.%i", adapter->adapter_num, slice))) @@ -1001,6 +991,15 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n afu->enabled = true; + /* + * wake up the cpu periodically to check the state + * of the AFU using "afu" stored in the guest structure. + */ + afu->guest->parent = afu; + afu->guest->handle_err = true; + INIT_DELAYED_WORK(&afu->guest->work_err, afu_handle_errstate); + schedule_delayed_work(&afu->guest->work_err, msecs_to_jiffies(1000)); + if ((rc = cxl_pci_vphb_add(afu))) dev_info(&afu->dev, "Can't register vPHB\n"); @@ -1029,6 +1028,10 @@ void cxl_guest_remove_afu(struct cxl_afu *afu) if (!afu) return; + /* flush and stop pending job */ + afu->guest->handle_err = false; + flush_delayed_work(&afu->guest->work_err); + cxl_pci_vphb_remove(afu); cxl_sysfs_afu_remove(afu); -- cgit v0.10.2 From 2513767d2255acfec707328948dcaefe887f6044 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Tue, 1 Mar 2016 11:17:46 +0530 Subject: powerpc/powernv: Rename machine_check_pSeries_early() to powernv The routine machine_check_pSeries_early() is only used on powernv, not pseries. Hence rename machine_check_pSeries_early() to machine_check_powernv_early(). Reported-by: Paul Mackerras Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7272e6c..4c94406 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -189,7 +189,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #endif /* CONFIG_PPC_P7_NAP */ EXCEPTION_PROLOG_0(PACA_EXMC) BEGIN_FTR_SECTION - b machine_check_pSeries_early + b machine_check_powernv_early FTR_SECTION_ELSE b machine_check_pSeries_0 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) @@ -433,7 +433,7 @@ denorm_exception_hv: .align 7 /* moved from 0x200 */ -machine_check_pSeries_early: +machine_check_powernv_early: BEGIN_FTR_SECTION EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) /* -- cgit v0.10.2 From 2d59b3b25659463a24f05df367574d90b3cd7145 Mon Sep 17 00:00:00 2001 From: Rashmica Gupta Date: Wed, 23 Dec 2015 16:49:50 +1100 Subject: selftests/powerpc: Make reg.h common to all powerpc selftests Currently there is a reg.h in pmu/ebb that has defines that are useful in other powerpc selftests so move this up into selftests/powerpc folder. Also include in utils.h - as this is often used in self tests. Add in some other useful register defines. Signed-off-by: Rashmica Gupta Signed-off-by: Michael Ellerman diff --git a/tools/testing/selftests/powerpc/pmu/ebb/ebb.c b/tools/testing/selftests/powerpc/pmu/ebb/ebb.c index e67452f..46681fe 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/ebb.c +++ b/tools/testing/selftests/powerpc/pmu/ebb/ebb.c @@ -15,7 +15,6 @@ #include #include "trace.h" -#include "reg.h" #include "ebb.h" diff --git a/tools/testing/selftests/powerpc/pmu/ebb/reg.h b/tools/testing/selftests/powerpc/pmu/ebb/reg.h deleted file mode 100644 index 5921b0d..0000000 --- a/tools/testing/selftests/powerpc/pmu/ebb/reg.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright 2014, Michael Ellerman, IBM Corp. - * Licensed under GPLv2. - */ - -#ifndef _SELFTESTS_POWERPC_REG_H -#define _SELFTESTS_POWERPC_REG_H - -#define __stringify_1(x) #x -#define __stringify(x) __stringify_1(x) - -#define mfspr(rn) ({unsigned long rval; \ - asm volatile("mfspr %0," __stringify(rn) \ - : "=r" (rval)); rval; }) -#define mtspr(rn, v) asm volatile("mtspr " __stringify(rn) ",%0" : \ - : "r" ((unsigned long)(v)) \ - : "memory") - -#define mb() asm volatile("sync" : : : "memory"); - -#define SPRN_MMCR2 769 -#define SPRN_MMCRA 770 -#define SPRN_MMCR0 779 -#define MMCR0_PMAO 0x00000080 -#define MMCR0_PMAE 0x04000000 -#define MMCR0_FC 0x80000000 -#define SPRN_EBBHR 804 -#define SPRN_EBBRR 805 -#define SPRN_BESCR 806 /* Branch event status & control register */ -#define SPRN_BESCRS 800 /* Branch event status & control set (1 bits set to 1) */ -#define SPRN_BESCRSU 801 /* Branch event status & control set upper */ -#define SPRN_BESCRR 802 /* Branch event status & control REset (1 bits set to 0) */ -#define SPRN_BESCRRU 803 /* Branch event status & control REset upper */ - -#define BESCR_PMEO 0x1 /* PMU Event-based exception Occurred */ -#define BESCR_PME (0x1ul << 32) /* PMU Event-based exception Enable */ - -#define SPRN_PMC1 771 -#define SPRN_PMC2 772 -#define SPRN_PMC3 773 -#define SPRN_PMC4 774 -#define SPRN_PMC5 775 -#define SPRN_PMC6 776 - -#define SPRN_SIAR 780 -#define SPRN_SDAR 781 -#define SPRN_SIER 768 - -#endif /* _SELFTESTS_POWERPC_REG_H */ diff --git a/tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c b/tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c index 5b1188f..f923228 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c +++ b/tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c @@ -7,7 +7,6 @@ #include #include "ebb.h" -#include "reg.h" /* diff --git a/tools/testing/selftests/powerpc/reg.h b/tools/testing/selftests/powerpc/reg.h new file mode 100644 index 0000000..65bfdee --- /dev/null +++ b/tools/testing/selftests/powerpc/reg.h @@ -0,0 +1,55 @@ +/* + * Copyright 2014, Michael Ellerman, IBM Corp. + * Licensed under GPLv2. + */ + +#ifndef _SELFTESTS_POWERPC_REG_H +#define _SELFTESTS_POWERPC_REG_H + +#define __stringify_1(x) #x +#define __stringify(x) __stringify_1(x) + +#define mfspr(rn) ({unsigned long rval; \ + asm volatile("mfspr %0," _str(rn) \ + : "=r" (rval)); rval; }) +#define mtspr(rn, v) asm volatile("mtspr " _str(rn) ",%0" : \ + : "r" ((unsigned long)(v)) \ + : "memory") + +#define mb() asm volatile("sync" : : : "memory"); + +#define SPRN_MMCR2 769 +#define SPRN_MMCRA 770 +#define SPRN_MMCR0 779 +#define MMCR0_PMAO 0x00000080 +#define MMCR0_PMAE 0x04000000 +#define MMCR0_FC 0x80000000 +#define SPRN_EBBHR 804 +#define SPRN_EBBRR 805 +#define SPRN_BESCR 806 /* Branch event status & control register */ +#define SPRN_BESCRS 800 /* Branch event status & control set (1 bits set to 1) */ +#define SPRN_BESCRSU 801 /* Branch event status & control set upper */ +#define SPRN_BESCRR 802 /* Branch event status & control REset (1 bits set to 0) */ +#define SPRN_BESCRRU 803 /* Branch event status & control REset upper */ + +#define BESCR_PMEO 0x1 /* PMU Event-based exception Occurred */ +#define BESCR_PME (0x1ul << 32) /* PMU Event-based exception Enable */ + +#define SPRN_PMC1 771 +#define SPRN_PMC2 772 +#define SPRN_PMC3 773 +#define SPRN_PMC4 774 +#define SPRN_PMC5 775 +#define SPRN_PMC6 776 + +#define SPRN_SIAR 780 +#define SPRN_SDAR 781 +#define SPRN_SIER 768 + +#define SPRN_TEXASR 0x82 +#define SPRN_TFIAR 0x81 /* Transaction Failure Inst Addr */ +#define SPRN_TFHAR 0x80 /* Transaction Failure Handler Addr */ +#define TEXASR_FS 0x08000000 +#define SPRN_TAR 0x32f + +#endif /* _SELFTESTS_POWERPC_REG_H */ diff --git a/tools/testing/selftests/powerpc/utils.h b/tools/testing/selftests/powerpc/utils.h index 3b370de..a985cfa 100644 --- a/tools/testing/selftests/powerpc/utils.h +++ b/tools/testing/selftests/powerpc/utils.h @@ -11,6 +11,7 @@ #include #include #include +#include "reg.h" /* Avoid headaches with PRI?64 - just use %ll? always */ typedef unsigned long long u64; -- cgit v0.10.2 From da3ddc3b5fea695f7b2fa89c4ca17dfd529293d2 Mon Sep 17 00:00:00 2001 From: Rashmica Gupta Date: Wed, 23 Dec 2015 16:49:51 +1100 Subject: selftests/powerpc: Standardise TM calls Currently tbegin, tend etc are written as opcodes or asm instructions. So standardise these to asm instructions. Signed-off-by: Rashmica Gupta Signed-off-by: Michael Ellerman diff --git a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c index 8fde93d..d9c49f4 100644 --- a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c +++ b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c @@ -31,12 +31,6 @@ #include "utils.h" #include "tm.h" -#define TBEGIN ".long 0x7C00051D ;" -#define TEND ".long 0x7C00055D ;" -#define TCHECK ".long 0x7C00059C ;" -#define TSUSPEND ".long 0x7C0005DD ;" -#define TRESUME ".long 0x7C2005DD ;" -#define SPRN_TEXASR 0x82 #define SPRN_DSCR 0x03 int test_body(void) @@ -55,13 +49,13 @@ int test_body(void) "mtspr %[sprn_dscr], 3;" /* start and suspend a transaction */ - TBEGIN + "tbegin.;" "beq 1f;" - TSUSPEND + "tsuspend.;" /* hard loop until the transaction becomes doomed */ "2: ;" - TCHECK + "tcheck 0;" "bc 4, 0, 2b;" /* record DSCR and TEXASR */ @@ -70,8 +64,8 @@ int test_body(void) "mfspr 3, %[sprn_texasr];" "std 3, %[texasr];" - TRESUME - TEND + "tresume.;" + "tend.;" "li %[rv], 0;" "1: ;" : [rv]"=r"(rv), [dscr2]"=m"(dscr2), [texasr]"=m"(texasr) diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-stack.c b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c index e44a238..1f0eb56 100644 --- a/tools/testing/selftests/powerpc/tm/tm-signal-stack.c +++ b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c @@ -60,9 +60,9 @@ int tm_signal_stack() exit(1); asm volatile("li 1, 0 ;" /* stack ptr == NULL */ "1:" - ".long 0x7C00051D ;" /* tbegin */ + "tbegin.;" "beq 1b ;" /* retry forever */ - ".long 0x7C0005DD ; ;" /* tsuspend */ + "tsuspend.;" "ld 2, 0(1) ;" /* trigger segv" */ : : : "memory"); -- cgit v0.10.2 From d95be4ca3e457044be55c8b1c1b5ac64f17d8a92 Mon Sep 17 00:00:00 2001 From: Rashmica Gupta Date: Wed, 23 Dec 2015 16:49:52 +1100 Subject: selftests/powerpc: Add test for forking inside transaction This test does a fork syscall inside a transaction. Basic sniff test to see if we can enter the kernel during a transaction. Signed-off-by: Rashmica Gupta Signed-off-by: Michael Ellerman diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore index 7d0f14b..d216edd 100644 --- a/tools/testing/selftests/powerpc/tm/.gitignore +++ b/tools/testing/selftests/powerpc/tm/.gitignore @@ -3,3 +3,4 @@ tm-syscall tm-signal-msr-resv tm-signal-stack tm-vmxcopy +tm-fork diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile index 737f72c..2db475b 100644 --- a/tools/testing/selftests/powerpc/tm/Makefile +++ b/tools/testing/selftests/powerpc/tm/Makefile @@ -1,4 +1,4 @@ -TEST_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack tm-vmxcopy +TEST_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack tm-vmxcopy tm-fork all: $(TEST_PROGS) diff --git a/tools/testing/selftests/powerpc/tm/tm-fork.c b/tools/testing/selftests/powerpc/tm/tm-fork.c new file mode 100644 index 0000000..8d48579 --- /dev/null +++ b/tools/testing/selftests/powerpc/tm/tm-fork.c @@ -0,0 +1,42 @@ +/* + * Copyright 2015, Michael Neuling, IBM Corp. + * Licensed under GPLv2. + * + * Edited: Rashmica Gupta, Nov 2015 + * + * This test does a fork syscall inside a transaction. Basic sniff test + * to see if we can enter the kernel during a transaction. + */ + +#include +#include +#include +#include +#include +#include + +#include "utils.h" +#include "tm.h" + +int test_fork(void) +{ + SKIP_IF(!have_htm()); + + asm __volatile__( + "tbegin.;" + "blt 1f; " + "li 0, 2;" /* fork syscall */ + "sc ;" + "tend.;" + "1: ;" + : : : "memory", "r0"); + /* If we reach here, we've passed. Otherwise we've probably crashed + * the kernel */ + + return 0; +} + +int main(int argc, char *argv[]) +{ + return test_harness(test_fork, "tm_fork"); +} -- cgit v0.10.2 From dbccb4940c9c9c8c93fcf733445e96dd2d6890d6 Mon Sep 17 00:00:00 2001 From: Rashmica Gupta Date: Wed, 23 Dec 2015 16:49:53 +1100 Subject: selftests/powerpc: Add TM test to check if TAR is corrupted If the transaction is aborted, the TAR should be rolled back to the checkpointed value before the transaction began. The value written to the TAR when the transaction is suspended should only remain there if the transaction completes successfully. Signed-off-by: Rashmica Gupta Signed-off-by: Michael Ellerman diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore index d216edd..2ac3764 100644 --- a/tools/testing/selftests/powerpc/tm/.gitignore +++ b/tools/testing/selftests/powerpc/tm/.gitignore @@ -4,3 +4,4 @@ tm-signal-msr-resv tm-signal-stack tm-vmxcopy tm-fork +tm-tar diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile index 2db475b..cb4b3bf 100644 --- a/tools/testing/selftests/powerpc/tm/Makefile +++ b/tools/testing/selftests/powerpc/tm/Makefile @@ -1,4 +1,4 @@ -TEST_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack tm-vmxcopy tm-fork +TEST_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack tm-vmxcopy tm-fork tm-tar all: $(TEST_PROGS) diff --git a/tools/testing/selftests/powerpc/tm/tm-tar.c b/tools/testing/selftests/powerpc/tm/tm-tar.c new file mode 100644 index 0000000..2d2fcc2b --- /dev/null +++ b/tools/testing/selftests/powerpc/tm/tm-tar.c @@ -0,0 +1,90 @@ +/* + * Copyright 2015, Michael Neuling, IBM Corp. + * Licensed under GPLv2. + * Original: Michael Neuling 19/7/2013 + * Edited: Rashmica Gupta 01/12/2015 + * + * Do some transactions, see if the tar is corrupted. + * If the transaction is aborted, the TAR should be rolled back to the + * checkpointed value before the transaction began. The value written to + * TAR in suspended mode should only remain in TAR if the transaction + * completes. + */ + +#include +#include +#include +#include + +#include "tm.h" +#include "utils.h" + +int num_loops = 10000; + +int test_tar(void) +{ + int i; + + SKIP_IF(!have_htm()); + + for (i = 0; i < num_loops; i++) + { + uint64_t result = 0; + asm __volatile__( + "li 7, 1;" + "mtspr %[tar], 7;" /* tar = 1 */ + "tbegin.;" + "beq 3f;" + "li 4, 0x7000;" /* Loop lots, to use time */ + "2:;" /* Start loop */ + "li 7, 2;" + "mtspr %[tar], 7;" /* tar = 2 */ + "tsuspend.;" + "li 7, 3;" + "mtspr %[tar], 7;" /* tar = 3 */ + "tresume.;" + "subi 4, 4, 1;" + "cmpdi 4, 0;" + "bne 2b;" + "tend.;" + + /* Transaction sucess! TAR should be 3 */ + "mfspr 7, %[tar];" + "ori %[res], 7, 4;" // res = 3|4 = 7 + "b 4f;" + + /* Abort handler. TAR should be rolled back to 1 */ + "3:;" + "mfspr 7, %[tar];" + "ori %[res], 7, 8;" // res = 1|8 = 9 + "4:;" + + : [res]"=r"(result) + : [tar]"i"(SPRN_TAR) + : "memory", "r0", "r4", "r7"); + + /* If result is anything else other than 7 or 9, the tar + * value must have been corrupted. */ + if ((result != 7) && (result != 9)) + return 1; + } + return 0; +} + +int main(int argc, char *argv[]) +{ + /* A low number of iterations (eg 100) can cause a false pass */ + if (argc > 1) { + if (strcmp(argv[1], "-h") == 0) { + printf("Syntax:\n\t%s []\n", + argv[0]); + return 1; + } else { + num_loops = atoi(argv[1]); + } + } + + printf("Starting, %d loops\n", num_loops); + + return test_harness(test_tar, "tm_tar"); +} -- cgit v0.10.2 From 16aab321872400a4ce35b90ba40484fcb5d636ba Mon Sep 17 00:00:00 2001 From: Rashmica Gupta Date: Wed, 23 Dec 2015 16:49:54 +1100 Subject: selftests/powerpc: Add test to check if TM SPRs are corrupted Testing that the TM SPRs are behaving the way they should. Uses more threads than cpus to see if the following register values persist with context switching: - the FS (failure summary) flag in TEXASR - TFIAR and TFHAR Signed-off-by: Rashmica Gupta Signed-off-by: Michael Ellerman diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore index 2ac3764..bb942db 100644 --- a/tools/testing/selftests/powerpc/tm/.gitignore +++ b/tools/testing/selftests/powerpc/tm/.gitignore @@ -5,3 +5,4 @@ tm-signal-stack tm-vmxcopy tm-fork tm-tar +tm-tmspr diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile index cb4b3bf..d0505db 100644 --- a/tools/testing/selftests/powerpc/tm/Makefile +++ b/tools/testing/selftests/powerpc/tm/Makefile @@ -1,4 +1,4 @@ -TEST_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack tm-vmxcopy tm-fork tm-tar +TEST_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack tm-vmxcopy tm-fork tm-tar tm-tmspr all: $(TEST_PROGS) @@ -6,6 +6,7 @@ $(TEST_PROGS): ../harness.c ../utils.c tm-syscall: tm-syscall-asm.S tm-syscall: CFLAGS += -mhtm -I../../../../../usr/include +tm-tmspr: CFLAGS += -pthread include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/tm/tm-tmspr.c b/tools/testing/selftests/powerpc/tm/tm-tmspr.c new file mode 100644 index 0000000..2bda81c --- /dev/null +++ b/tools/testing/selftests/powerpc/tm/tm-tmspr.c @@ -0,0 +1,143 @@ +/* + * Copyright 2015, Michael Neuling, IBM Corp. + * Licensed under GPLv2. + * + * Original: Michael Neuling 3/4/2014 + * Modified: Rashmica Gupta 8/12/2015 + * + * Check if any of the Transaction Memory SPRs get corrupted. + * - TFIAR - stores address of location of transaction failure + * - TFHAR - stores address of software failure handler (if transaction + * fails) + * - TEXASR - lots of info about the transacion(s) + * + * (1) create more threads than cpus + * (2) in each thread: + * (a) set TFIAR and TFHAR a unique value + * (b) loop for awhile, continually checking to see if + * either register has been corrupted. + * + * (3) Loop: + * (a) begin transaction + * (b) abort transaction + * (c) check TEXASR to see if FS has been corrupted + * + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include + +#include "utils.h" +#include "tm.h" + +int num_loops = 10000; +int passed = 1; + +void tfiar_tfhar(void *in) +{ + int i, cpu; + unsigned long tfhar, tfhar_rd, tfiar, tfiar_rd; + cpu_set_t cpuset; + + CPU_ZERO(&cpuset); + cpu = (unsigned long)in >> 1; + CPU_SET(cpu, &cpuset); + sched_setaffinity(0, sizeof(cpuset), &cpuset); + + /* TFIAR: Last bit has to be high so userspace can read register */ + tfiar = ((unsigned long)in) + 1; + tfiar += 2; + mtspr(SPRN_TFIAR, tfiar); + + /* TFHAR: Last two bits are reserved */ + tfhar = ((unsigned long)in); + tfhar &= ~0x3UL; + tfhar += 4; + mtspr(SPRN_TFHAR, tfhar); + + for (i = 0; i < num_loops; i++) { + tfhar_rd = mfspr(SPRN_TFHAR); + tfiar_rd = mfspr(SPRN_TFIAR); + if ( (tfhar != tfhar_rd) || (tfiar != tfiar_rd) ) { + passed = 0; + return; + } + } + return; +} + +void texasr(void *in) +{ + unsigned long i; + uint64_t result = 0; + + for (i = 0; i < num_loops; i++) { + asm __volatile__( + "tbegin.;" + "beq 3f ;" + "tabort. 0 ;" + "tend.;" + + /* Abort handler */ + "3: ;" + ::: "memory"); + + /* Check the TEXASR */ + result = mfspr(SPRN_TEXASR); + if ((result & TEXASR_FS) == 0) { + passed = 0; + return; + } + } + return; +} + +int test_tmspr() +{ + pthread_t thread; + int thread_num; + unsigned long i; + + SKIP_IF(!have_htm()); + + /* To cause some context switching */ + thread_num = 10 * sysconf(_SC_NPROCESSORS_ONLN); + + /* Test TFIAR and TFHAR */ + for (i = 0 ; i < thread_num ; i += 2){ + if (pthread_create(&thread, NULL, (void*)tfiar_tfhar, (void *)i)) + return EXIT_FAILURE; + } + if (pthread_join(thread, NULL) != 0) + return EXIT_FAILURE; + + /* Test TEXASR */ + for (i = 0 ; i < thread_num ; i++){ + if (pthread_create(&thread, NULL, (void*)texasr, (void *)i)) + return EXIT_FAILURE; + } + if (pthread_join(thread, NULL) != 0) + return EXIT_FAILURE; + + if (passed) + return 0; + else + return 1; +} + +int main(int argc, char *argv[]) +{ + if (argc > 1) { + if (strcmp(argv[1], "-h") == 0) { + printf("Syntax:\t []\n"); + return 0; + } else { + num_loops = atoi(argv[1]); + } + } + return test_harness(test_tmspr, "tm_tmspr"); +} -- cgit v0.10.2 From 062b26ba3e6c5bc80f01056f5aa8546d18a2eb85 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:20 +1000 Subject: powerpc/pci: Cleanup on struct pci_controller_ops Each PHB has one instance of "struct pci_controller_ops" that includes various callbacks called by PCI subsystem. In the definition of this struct, some callbacks have explicit names for its arguments, but the left don't have. This adds all explicit names of the arguments to the callbacks in "struct pci_controller_ops" so that the code looks consistent. Also, argument name @dev is replaced by @pdev as the later one is the preferred name for PCI device. Signed-off-by: Gavin Shan Reviewed-by: Daniel Axtens Reviewed-by: Andrew Donnellan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index f5056e3..023c8c8 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -17,33 +17,34 @@ struct device_node; * PCI controller operations */ struct pci_controller_ops { - void (*dma_dev_setup)(struct pci_dev *dev); + void (*dma_dev_setup)(struct pci_dev *pdev); void (*dma_bus_setup)(struct pci_bus *bus); - int (*probe_mode)(struct pci_bus *); + int (*probe_mode)(struct pci_bus *bus); /* Called when pci_enable_device() is called. Returns true to * allow assignment/enabling of the device. */ - bool (*enable_device_hook)(struct pci_dev *); + bool (*enable_device_hook)(struct pci_dev *pdev); - void (*disable_device)(struct pci_dev *); + void (*disable_device)(struct pci_dev *pdev); - void (*release_device)(struct pci_dev *); + void (*release_device)(struct pci_dev *pdev); /* Called during PCI resource reassignment */ - resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type); - void (*reset_secondary_bus)(struct pci_dev *dev); + resource_size_t (*window_alignment)(struct pci_bus *bus, + unsigned long type); + void (*reset_secondary_bus)(struct pci_dev *pdev); #ifdef CONFIG_PCI_MSI - int (*setup_msi_irqs)(struct pci_dev *dev, + int (*setup_msi_irqs)(struct pci_dev *pdev, int nvec, int type); - void (*teardown_msi_irqs)(struct pci_dev *dev); + void (*teardown_msi_irqs)(struct pci_dev *pdev); #endif - int (*dma_set_mask)(struct pci_dev *dev, u64 dma_mask); - u64 (*dma_get_required_mask)(struct pci_dev *dev); + int (*dma_set_mask)(struct pci_dev *pdev, u64 dma_mask); + u64 (*dma_get_required_mask)(struct pci_dev *pdev); - void (*shutdown)(struct pci_controller *); + void (*shutdown)(struct pci_controller *hose); }; /* -- cgit v0.10.2 From cb4224c501802940c41e2f46ae57de08fd0ca119 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:21 +1000 Subject: powerpc/powernv: Cleanup on pci_controller_ops instances This cleans up on below data struct instances to use tab instead of space indent of statement to avoid complains from scripts/checkpatch.pl. No logical changes introduced. @pnv_pci_ioda_controller_ops @pnv_npu_ioda_controller_ops Signed-off-by: Gavin Shan Reviewed-by: Daniel Axtens Reviewed-by: Andrew Donnellan Acked-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index c5baaf3..524c9c7 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3210,31 +3210,31 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose) } static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { - .dma_dev_setup = pnv_pci_dma_dev_setup, - .dma_bus_setup = pnv_pci_dma_bus_setup, + .dma_dev_setup = pnv_pci_dma_dev_setup, + .dma_bus_setup = pnv_pci_dma_bus_setup, #ifdef CONFIG_PCI_MSI - .setup_msi_irqs = pnv_setup_msi_irqs, - .teardown_msi_irqs = pnv_teardown_msi_irqs, + .setup_msi_irqs = pnv_setup_msi_irqs, + .teardown_msi_irqs = pnv_teardown_msi_irqs, #endif - .enable_device_hook = pnv_pci_enable_device_hook, - .window_alignment = pnv_pci_window_alignment, - .reset_secondary_bus = pnv_pci_reset_secondary_bus, - .dma_set_mask = pnv_pci_ioda_dma_set_mask, - .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask, - .shutdown = pnv_pci_ioda_shutdown, + .enable_device_hook = pnv_pci_enable_device_hook, + .window_alignment = pnv_pci_window_alignment, + .reset_secondary_bus = pnv_pci_reset_secondary_bus, + .dma_set_mask = pnv_pci_ioda_dma_set_mask, + .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask, + .shutdown = pnv_pci_ioda_shutdown, }; static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { - .dma_dev_setup = pnv_pci_dma_dev_setup, + .dma_dev_setup = pnv_pci_dma_dev_setup, #ifdef CONFIG_PCI_MSI - .setup_msi_irqs = pnv_setup_msi_irqs, - .teardown_msi_irqs = pnv_teardown_msi_irqs, + .setup_msi_irqs = pnv_setup_msi_irqs, + .teardown_msi_irqs = pnv_teardown_msi_irqs, #endif - .enable_device_hook = pnv_pci_enable_device_hook, - .window_alignment = pnv_pci_window_alignment, - .reset_secondary_bus = pnv_pci_reset_secondary_bus, - .dma_set_mask = pnv_npu_dma_set_mask, - .shutdown = pnv_pci_ioda_shutdown, + .enable_device_hook = pnv_pci_enable_device_hook, + .window_alignment = pnv_pci_window_alignment, + .reset_secondary_bus = pnv_pci_reset_secondary_bus, + .dma_set_mask = pnv_npu_dma_set_mask, + .shutdown = pnv_pci_ioda_shutdown, }; static void __init pnv_pci_init_ioda_phb(struct device_node *np, -- cgit v0.10.2 From 475d92c27f3b32ea31f5f7c4ed7009a7b895b9e7 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:22 +1000 Subject: powerpc/powernv: Drop phb->bdfn_to_pe() The last usage of pnv_phb::bdfn_to_pe() was removed in ff57b454ddb9 ("powerpc/eeh: Do probe on pci_dn"), so drop it. Signed-off-by: Gavin Shan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 524c9c7..10ecd97 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3195,12 +3195,6 @@ static bool pnv_pci_enable_device_hook(struct pci_dev *dev) return true; } -static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus, - u32 devfn) -{ - return phb->ioda.pe_rmap[(bus->number << 8) | devfn]; -} - static void pnv_pci_ioda_shutdown(struct pci_controller *hose) { struct pnv_phb *phb = hose->private_data; @@ -3377,9 +3371,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->freeze_pe = pnv_ioda_freeze_pe; phb->unfreeze_pe = pnv_ioda_unfreeze_pe; - /* Setup RID -> PE mapping function */ - phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe; - /* Setup TCEs */ phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 3f814f3..78f035e 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -110,7 +110,6 @@ struct pnv_phb { unsigned int is_64, struct msi_msg *msg); void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev); void (*fixup_phb)(struct pci_controller *hose); - u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn); int (*init_m64)(struct pnv_phb *phb); void (*reserve_m64_pe)(struct pci_bus *bus, unsigned long *pe_bitmap, bool all); -- cgit v0.10.2 From 13ce7598b6c0d9b3734fb070242de21b05aedd0d Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:23 +1000 Subject: powerpc/powernv: Reorder fields in struct pnv_phb This moves those fields in struct pnv_phb that are related to PE allocation around. No logical change. Signed-off-by: Gavin Shan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 78f035e..f2a1452 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -140,15 +140,14 @@ struct pnv_phb { unsigned int io_segsize; unsigned int io_pci_base; - /* PE allocation bitmap */ - unsigned long *pe_alloc; - /* PE allocation mutex */ + /* PE allocation */ struct mutex pe_alloc_mutex; + unsigned long *pe_alloc; + struct pnv_ioda_pe *pe_array; /* M32 & IO segment maps */ unsigned int *m32_segmap; unsigned int *io_segmap; - struct pnv_ioda_pe *pe_array; /* IRQ chip */ int irq_chip_init; -- cgit v0.10.2 From 92b8f137b3620b0a07c71b1bd191f9a22d0cf0ab Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:24 +1000 Subject: powerpc/powernv: Rename PE# fields in struct pnv_phb This renames the fields related to PE number in "struct pnv_phb" for better reflecting of their usages as Alexey suggested. No logical changes introduced. Signed-off-by: Gavin Shan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 950b3e5..69e41ce 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -75,7 +75,7 @@ static int pnv_eeh_init(void) * and P7IOC separately. So we should regard * PE#0 as valid for PHB3 and P7IOC. */ - if (phb->ioda.reserved_pe != 0) + if (phb->ioda.reserved_pe_idx != 0) eeh_add_flag(EEH_VALID_PE_ZERO); break; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 10ecd97..1d2514f 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -124,7 +124,7 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags) static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no) { - if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) { + if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe_num)) { pr_warn("%s: Invalid PE %d on PHB#%x\n", __func__, pe_no, phb->hose->global_number); return; @@ -144,8 +144,8 @@ static int pnv_ioda_alloc_pe(struct pnv_phb *phb) do { pe = find_next_zero_bit(phb->ioda.pe_alloc, - phb->ioda.total_pe, 0); - if (pe >= phb->ioda.total_pe) + phb->ioda.total_pe_num, 0); + if (pe >= phb->ioda.total_pe_num) return IODA_INVALID_PE; } while(test_and_set_bit(pe, phb->ioda.pe_alloc)); @@ -199,13 +199,13 @@ static int pnv_ioda2_init_m64(struct pnv_phb *phb) * expected to be 0 or last one of PE capabicity. */ r = &phb->hose->mem_resources[1]; - if (phb->ioda.reserved_pe == 0) + if (phb->ioda.reserved_pe_idx == 0) r->start += phb->ioda.m64_segsize; - else if (phb->ioda.reserved_pe == (phb->ioda.total_pe - 1)) + else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) r->end -= phb->ioda.m64_segsize; else pr_warn(" Cannot strip M64 segment for reserved PE#%d\n", - phb->ioda.reserved_pe); + phb->ioda.reserved_pe_idx); return 0; @@ -274,7 +274,7 @@ static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all) return IODA_INVALID_PE; /* Allocate bitmap */ - size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); + size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long)); pe_alloc = kzalloc(size, GFP_KERNEL); if (!pe_alloc) { pr_warn("%s: Out of memory !\n", @@ -290,7 +290,7 @@ static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all) * contributed by its child buses. For the case, we needn't * pick M64 dependent PE#. */ - if (bitmap_empty(pe_alloc, phb->ioda.total_pe)) { + if (bitmap_empty(pe_alloc, phb->ioda.total_pe_num)) { kfree(pe_alloc); return IODA_INVALID_PE; } @@ -301,8 +301,8 @@ static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all) */ master_pe = NULL; i = -1; - while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe, i + 1)) < - phb->ioda.total_pe) { + while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe_num, i + 1)) < + phb->ioda.total_pe_num) { pe = &phb->ioda.pe_array[i]; if (!master_pe) { @@ -355,7 +355,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) hose->mem_offset[1] = res->start - pci_addr; phb->ioda.m64_size = resource_size(res); - phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe; + phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe_num; phb->ioda.m64_base = pci_addr; pr_info(" MEM64 0x%016llx..0x%016llx -> 0x%016llx\n", @@ -456,7 +456,7 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no) s64 rc; /* Sanity check on PE number */ - if (pe_no < 0 || pe_no >= phb->ioda.total_pe) + if (pe_no < 0 || pe_no >= phb->ioda.total_pe_num) return OPAL_EEH_STOPPED_PERM_UNAVAIL; /* @@ -1088,7 +1088,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) * same GPU get assigned the same PE. */ gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev); - for (pe_num = 0; pe_num < phb->ioda.total_pe; pe_num++) { + for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) { pe = &phb->ioda.pe_array[pe_num]; if (!pe->pdev) continue; @@ -1537,9 +1537,9 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) } else { mutex_lock(&phb->ioda.pe_alloc_mutex); *pdn->pe_num_map = bitmap_find_next_zero_area( - phb->ioda.pe_alloc, phb->ioda.total_pe, + phb->ioda.pe_alloc, phb->ioda.total_pe_num, 0, num_vfs, 0); - if (*pdn->pe_num_map >= phb->ioda.total_pe) { + if (*pdn->pe_num_map >= phb->ioda.total_pe_num) { mutex_unlock(&phb->ioda.pe_alloc_mutex); dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs); kfree(pdn->pe_num_map); @@ -2858,7 +2858,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) pdn->m64_single_mode = false; total_vfs = pci_sriov_get_totalvfs(pdev); - mul = phb->ioda.total_pe; + mul = phb->ioda.total_pe_num; total_vf_bar_sz = 0; for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { @@ -2960,7 +2960,7 @@ static void pnv_ioda_setup_pe_seg(struct pci_controller *hose, region.end = res->end - phb->ioda.io_pci_base; index = region.start / phb->ioda.io_segsize; - while (index < phb->ioda.total_pe && + while (index < phb->ioda.total_pe_num && region.start <= region.end) { phb->ioda.io_segmap[index] = pe->pe_number; rc = opal_pci_map_pe_mmio_window(phb->opal_id, @@ -2985,7 +2985,7 @@ static void pnv_ioda_setup_pe_seg(struct pci_controller *hose, phb->ioda.m32_pci_base; index = region.start / phb->ioda.m32_segsize; - while (index < phb->ioda.total_pe && + while (index < phb->ioda.total_pe_num && region.start <= region.end) { phb->ioda.m32_segmap[index] = pe->pe_number; rc = opal_pci_map_pe_mmio_window(phb->opal_id, @@ -3300,13 +3300,13 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, pr_err(" Failed to map registers !\n"); /* Initialize more IODA stuff */ - phb->ioda.total_pe = 1; + phb->ioda.total_pe_num = 1; prop32 = of_get_property(np, "ibm,opal-num-pes", NULL); if (prop32) - phb->ioda.total_pe = be32_to_cpup(prop32); + phb->ioda.total_pe_num = be32_to_cpup(prop32); prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL); if (prop32) - phb->ioda.reserved_pe = be32_to_cpup(prop32); + phb->ioda.reserved_pe_idx = be32_to_cpup(prop32); /* Parse 64-bit MMIO range */ pnv_ioda_parse_m64_window(phb); @@ -3315,29 +3315,29 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, /* FW Has already off top 64k of M32 space (MSI space) */ phb->ioda.m32_size += 0x10000; - phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe; + phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe_num; phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0]; phb->ioda.io_size = hose->pci_io_size; - phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe; + phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num; phb->ioda.io_pci_base = 0; /* XXX calculate this ? */ /* Allocate aux data & arrays. We don't have IO ports on PHB3 */ - size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); + size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long)); m32map_off = size; - size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]); + size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]); if (phb->type == PNV_PHB_IODA1) { iomap_off = size; - size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]); + size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]); } pemap_off = size; - size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe); + size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe); aux = memblock_virt_alloc(size, 0); phb->ioda.pe_alloc = aux; phb->ioda.m32_segmap = aux + m32map_off; if (phb->type == PNV_PHB_IODA1) phb->ioda.io_segmap = aux + iomap_off; phb->ioda.pe_array = aux + pemap_off; - set_bit(phb->ioda.reserved_pe, phb->ioda.pe_alloc); + set_bit(phb->ioda.reserved_pe_idx, phb->ioda.pe_alloc); INIT_LIST_HEAD(&phb->ioda.pe_dma_list); INIT_LIST_HEAD(&phb->ioda.pe_list); @@ -3356,7 +3356,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, #endif pr_info(" %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n", - phb->ioda.total_pe, phb->ioda.reserved_pe, + phb->ioda.total_pe_num, phb->ioda.reserved_pe_idx, phb->ioda.m32_size, phb->ioda.m32_segsize); if (phb->ioda.m64_size) pr_info(" M64: 0x%lx [segment=0x%lx]\n", diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 0db20ae..88255eb 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -377,7 +377,7 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn) */ pe_no = pdn->pe_number; if (pe_no == IODA_INVALID_PE) { - pe_no = phb->ioda.reserved_pe; + pe_no = phb->ioda.reserved_pe_idx; } /* diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index f2a1452..784882a 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -120,8 +120,8 @@ struct pnv_phb { struct { /* Global bridge info */ - unsigned int total_pe; - unsigned int reserved_pe; + unsigned int total_pe_num; + unsigned int reserved_pe_idx; /* 32-bit MMIO window */ unsigned int m32_size; -- cgit v0.10.2 From 689ee8c95f399998f52eda827849b7771629a53c Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:25 +1000 Subject: powerpc/powernv: Data type unsigned int for PE number This changes the data type of PE number from "int" to "unsigned int" in order to match the fact PE number is never negative: * The number of PE to which the specified PCI device is attached. * The PE number map for SRIOV VFs. * The returned PE number from pnv_ioda_alloc_pe(). * The returned PE number from pnv_ioda2_pick_m64_pe(). Suggested-by: Alexey Kardashevskiy Signed-off-by: Gavin Shan Reviewed-By: Alistair Popple Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 023c8c8..220129f 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -209,14 +209,14 @@ struct pci_dn { #ifdef CONFIG_EEH struct eeh_dev *edev; /* eeh device */ #endif -#define IODA_INVALID_PE (-1) +#define IODA_INVALID_PE 0xFFFFFFFF #ifdef CONFIG_PPC_POWERNV - int pe_number; + unsigned int pe_number; int vf_index; /* VF index in the PF */ #ifdef CONFIG_PCI_IOV u16 vfs_expanded; /* number of VFs IOV BAR expanded */ u16 num_vfs; /* number of VFs enabled*/ - int *pe_num_map; /* PE# for the first VF PE or array */ + unsigned int *pe_num_map; /* PE# for the first VF PE or array */ bool m64_single_mode; /* Use M64 BAR in Single Mode */ #define IODA_INVALID_M64 (-1) int (*m64_map)[PCI_SRIOV_NUM_BARS]; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 1d2514f..4aa6cdf 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -138,7 +138,7 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no) phb->ioda.pe_array[pe_no].pe_number = pe_no; } -static int pnv_ioda_alloc_pe(struct pnv_phb *phb) +static unsigned int pnv_ioda_alloc_pe(struct pnv_phb *phb) { unsigned long pe; @@ -261,7 +261,7 @@ static void pnv_ioda2_reserve_m64_pe(struct pci_bus *bus, } } -static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all) +static unsigned int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all) { struct pci_controller *hose = pci_bus_to_host(bus); struct pnv_phb *phb = hose->private_data; @@ -919,7 +919,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) struct pnv_phb *phb = hose->private_data; struct pci_dn *pdn = pci_get_pdn(dev); struct pnv_ioda_pe *pe; - int pe_num; + unsigned int pe_num; if (!pdn) { pr_err("%s: Device tree node not associated properly\n", @@ -1010,7 +1010,7 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) struct pci_controller *hose = pci_bus_to_host(bus); struct pnv_phb *phb = hose->private_data; struct pnv_ioda_pe *pe; - int pe_num = IODA_INVALID_PE; + unsigned int pe_num = IODA_INVALID_PE; /* Check if PE is determined by M64 */ if (phb->pick_m64_pe) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 88255eb..1d92bd9 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -367,7 +367,7 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn) struct pnv_phb *phb = pdn->phb->private_data; u8 fstate; __be16 pcierr; - int pe_no; + unsigned int pe_no; s64 rc; /* diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 784882a..66f2569 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -113,7 +113,7 @@ struct pnv_phb { int (*init_m64)(struct pnv_phb *phb); void (*reserve_m64_pe)(struct pci_bus *bus, unsigned long *pe_bitmap, bool all); - int (*pick_m64_pe)(struct pci_bus *bus, bool all); + unsigned int (*pick_m64_pe)(struct pci_bus *bus, bool all); int (*get_pe_state)(struct pnv_phb *phb, int pe_no); void (*freeze_pe)(struct pnv_phb *phb, int pe_no); int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt); -- cgit v0.10.2 From 3fa23ff8ff3f0fa4f187bc46f81c69b1fdeecd9a Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:26 +1000 Subject: powerpc/powernv: Fix initial IO and M32 segmap There are two arrays for IO and M32 segment maps on every PHB. The index of the arrays are segment number and the value stored in the corresponding element is PE number, indicating the segment is assigned to the PE. Initially, all elements in those two arrays are zeroes, meaning all segments are assigned to PE#0. It's wrong. This fixes the initial values in the elements of those two arrays to IODA_INVALID_PE, meaning all segments aren't assigned to any PE. Signed-off-by: Gavin Shan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 4aa6cdf..59b20e5 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3240,6 +3240,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, const __be64 *prop64; const __be32 *prop32; int len; + unsigned int segno; u64 phb_id; void *aux; long rc; @@ -3334,8 +3335,13 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, aux = memblock_virt_alloc(size, 0); phb->ioda.pe_alloc = aux; phb->ioda.m32_segmap = aux + m32map_off; - if (phb->type == PNV_PHB_IODA1) + for (segno = 0; segno < phb->ioda.total_pe_num; segno++) + phb->ioda.m32_segmap[segno] = IODA_INVALID_PE; + if (phb->type == PNV_PHB_IODA1) { phb->ioda.io_segmap = aux + iomap_off; + for (segno = 0; segno < phb->ioda.total_pe_num; segno++) + phb->ioda.io_segmap[segno] = IODA_INVALID_PE; + } phb->ioda.pe_array = aux + pemap_off; set_bit(phb->ioda.reserved_pe_idx, phb->ioda.pe_alloc); -- cgit v0.10.2 From 23e79425fe7caaa93ddcee18ac4c57069f0fb4b6 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:27 +1000 Subject: powerpc/powernv: Simplify pnv_ioda_setup_pe_seg() pnv_ioda_setup_pe_seg() associates the IO and M32 segments with the owner PE. The code mapping segments should be fixed and immune from logic changes introduced to pnv_ioda_setup_pe_seg(). This moves the code mapping segments to helper pnv_ioda_setup_pe_res(). The data type for @rc is changed to "int64_t". Also, argument @hose is removed from pnv_ioda_setup_pe() as it can be got from @pe. No functional changes introduced. Signed-off-by: Gavin Shan Reviewed-By: Alistair Popple Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 59b20e5..b954fbc 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2929,19 +2929,72 @@ truncate_iov: } #endif /* CONFIG_PCI_IOV */ +static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, + struct resource *res) +{ + struct pnv_phb *phb = pe->phb; + struct pci_bus_region region; + int index; + int64_t rc; + + if (!res || !res->flags || res->start > res->end) + return; + + if (res->flags & IORESOURCE_IO) { + region.start = res->start - phb->ioda.io_pci_base; + region.end = res->end - phb->ioda.io_pci_base; + index = region.start / phb->ioda.io_segsize; + + while (index < phb->ioda.total_pe_num && + region.start <= region.end) { + phb->ioda.io_segmap[index] = pe->pe_number; + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index); + if (rc != OPAL_SUCCESS) { + pr_err("%s: Error %lld mapping IO segment#%d to PE#%d\n", + __func__, rc, index, pe->pe_number); + break; + } + + region.start += phb->ioda.io_segsize; + index++; + } + } else if ((res->flags & IORESOURCE_MEM) && + !pnv_pci_is_mem_pref_64(res->flags)) { + region.start = res->start - + phb->hose->mem_offset[0] - + phb->ioda.m32_pci_base; + region.end = res->end - + phb->hose->mem_offset[0] - + phb->ioda.m32_pci_base; + index = region.start / phb->ioda.m32_segsize; + + while (index < phb->ioda.total_pe_num && + region.start <= region.end) { + phb->ioda.m32_segmap[index] = pe->pe_number; + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index); + if (rc != OPAL_SUCCESS) { + pr_err("%s: Error %lld mapping M32 segment#%d to PE#%d", + __func__, rc, index, pe->pe_number); + break; + } + + region.start += phb->ioda.m32_segsize; + index++; + } + } +} + /* * This function is supposed to be called on basis of PE from top * to bottom style. So the the I/O or MMIO segment assigned to * parent PE could be overrided by its child PEs if necessary. */ -static void pnv_ioda_setup_pe_seg(struct pci_controller *hose, - struct pnv_ioda_pe *pe) +static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe) { - struct pnv_phb *phb = hose->private_data; - struct pci_bus_region region; struct resource *res; - int i, index; - int rc; + int i; /* * NOTE: We only care PCI bus based PE for now. For PCI @@ -2950,58 +3003,8 @@ static void pnv_ioda_setup_pe_seg(struct pci_controller *hose, */ BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))); - pci_bus_for_each_resource(pe->pbus, res, i) { - if (!res || !res->flags || - res->start > res->end) - continue; - - if (res->flags & IORESOURCE_IO) { - region.start = res->start - phb->ioda.io_pci_base; - region.end = res->end - phb->ioda.io_pci_base; - index = region.start / phb->ioda.io_segsize; - - while (index < phb->ioda.total_pe_num && - region.start <= region.end) { - phb->ioda.io_segmap[index] = pe->pe_number; - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index); - if (rc != OPAL_SUCCESS) { - pr_err("%s: OPAL error %d when mapping IO " - "segment #%d to PE#%d\n", - __func__, rc, index, pe->pe_number); - break; - } - - region.start += phb->ioda.io_segsize; - index++; - } - } else if ((res->flags & IORESOURCE_MEM) && - !pnv_pci_is_mem_pref_64(res->flags)) { - region.start = res->start - - hose->mem_offset[0] - - phb->ioda.m32_pci_base; - region.end = res->end - - hose->mem_offset[0] - - phb->ioda.m32_pci_base; - index = region.start / phb->ioda.m32_segsize; - - while (index < phb->ioda.total_pe_num && - region.start <= region.end) { - phb->ioda.m32_segmap[index] = pe->pe_number; - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index); - if (rc != OPAL_SUCCESS) { - pr_err("%s: OPAL error %d when mapping M32 " - "segment#%d to PE#%d", - __func__, rc, index, pe->pe_number); - break; - } - - region.start += phb->ioda.m32_segsize; - index++; - } - } - } + pci_bus_for_each_resource(pe->pbus, res, i) + pnv_ioda_setup_pe_res(pe, res); } static void pnv_pci_ioda_setup_seg(void) @@ -3018,7 +3021,7 @@ static void pnv_pci_ioda_setup_seg(void) continue; list_for_each_entry(pe, &phb->ioda.pe_list, list) { - pnv_ioda_setup_pe_seg(hose, pe); + pnv_ioda_setup_pe_seg(pe); } } } -- cgit v0.10.2 From 69d733e72ddae1680ffd3b5c648ff31cb46d9cc9 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:28 +1000 Subject: powerpc/powernv: IO and M32 mapping based on PCI device resources Currently, the IO and M32 segments are mapped to the corresponding PE based on the windows of the parent bridge of PE's primary bus. It's not going to work when the windows of root port or upstream port of the PCIe switch behind root port are extended to PHB's apertures in order to support hotplug in subsequent patch. This fixes the issue by mapping IO and M32 segments based on the resources of the PCI devices included in the PE, instead of the windows of the parent bridge of the PE's primary bus. Signed-off-by: Gavin Shan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index b954fbc..904790b 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2993,7 +2993,7 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, */ static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe) { - struct resource *res; + struct pci_dev *pdev; int i; /* @@ -3003,8 +3003,21 @@ static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe) */ BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))); - pci_bus_for_each_resource(pe->pbus, res, i) - pnv_ioda_setup_pe_res(pe, res); + list_for_each_entry(pdev, &pe->pbus->devices, bus_list) { + for (i = 0; i <= PCI_ROM_RESOURCE; i++) + pnv_ioda_setup_pe_res(pe, &pdev->resource[i]); + + /* + * If the PE contains all subordinate PCI buses, the + * windows of the child bridges should be mapped to + * the PE as well. + */ + if (!(pe->flags & PNV_IODA_PE_BUS_ALL) || !pci_is_bridge(pdev)) + continue; + for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) + pnv_ioda_setup_pe_res(pe, + &pdev->resource[PCI_BRIDGE_RESOURCES + i]); + } } static void pnv_pci_ioda_setup_seg(void) -- cgit v0.10.2 From 93289d8c08dc038a3efccf6b7fa563e1683cd868 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:29 +1000 Subject: powerpc/powernv: Track M64 segment consumption When unplugging PCI devices, their parent PEs might be offline. The consumed M64 resource by the PEs should be released at that time. As we track M32 segment consumption, this introduces an array to the PHB to track the mapping between M64 segment and PE number. Note: M64 mapping isn't covered by pnv_ioda_setup_pe_seg() as IODA2 doesn't support the mapping explicitly while it's supported on IODA1. Until now, no M64 is supported on IODA1 in software. Signed-off-by: Gavin Shan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 904790b..832b430 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -305,6 +305,7 @@ static unsigned int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all) phb->ioda.total_pe_num) { pe = &phb->ioda.pe_array[i]; + phb->ioda.m64_segmap[pe->pe_number] = pe->pe_number; if (!master_pe) { pe->flags |= PNV_IODA_PE_MASTER; INIT_LIST_HEAD(&pe->slaves); @@ -3252,7 +3253,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, { struct pci_controller *hose; struct pnv_phb *phb; - unsigned long size, m32map_off, pemap_off, iomap_off = 0; + unsigned long size, m64map_off, m32map_off, pemap_off, iomap_off = 0; const __be64 *prop64; const __be32 *prop32; int len; @@ -3340,6 +3341,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, /* Allocate aux data & arrays. We don't have IO ports on PHB3 */ size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long)); + m64map_off = size; + size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]); m32map_off = size; size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]); if (phb->type == PNV_PHB_IODA1) { @@ -3350,9 +3353,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe); aux = memblock_virt_alloc(size, 0); phb->ioda.pe_alloc = aux; + phb->ioda.m64_segmap = aux + m64map_off; phb->ioda.m32_segmap = aux + m32map_off; - for (segno = 0; segno < phb->ioda.total_pe_num; segno++) + for (segno = 0; segno < phb->ioda.total_pe_num; segno++) { + phb->ioda.m64_segmap[segno] = IODA_INVALID_PE; phb->ioda.m32_segmap[segno] = IODA_INVALID_PE; + } if (phb->type == PNV_PHB_IODA1) { phb->ioda.io_segmap = aux + iomap_off; for (segno = 0; segno < phb->ioda.total_pe_num; segno++) diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 66f2569..a409165 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -146,6 +146,7 @@ struct pnv_phb { struct pnv_ioda_pe *pe_array; /* M32 & IO segment maps */ + unsigned int *m64_segmap; unsigned int *m32_segmap; unsigned int *io_segmap; -- cgit v0.10.2 From c430670ad11aa8c5a9f9706bf6f7376310af7f4b Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:30 +1000 Subject: powerpc/powernv: Rename M64 related functions This renames those functions picking PE number based on consumed M64 segments, mapping M64 segments to PEs as those functions are going to be shared by IODA1/IODA2 in next patch. No logical changes introduced. Signed-off-by: Gavin Shan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 832b430..37f22b0 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -219,7 +219,7 @@ fail: return -EIO; } -static void pnv_ioda2_reserve_dev_m64_pe(struct pci_dev *pdev, +static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev, unsigned long *pe_bitmap) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); @@ -246,22 +246,22 @@ static void pnv_ioda2_reserve_dev_m64_pe(struct pci_dev *pdev, } } -static void pnv_ioda2_reserve_m64_pe(struct pci_bus *bus, - unsigned long *pe_bitmap, - bool all) +static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus, + unsigned long *pe_bitmap, + bool all) { struct pci_dev *pdev; list_for_each_entry(pdev, &bus->devices, bus_list) { - pnv_ioda2_reserve_dev_m64_pe(pdev, pe_bitmap); + pnv_ioda_reserve_dev_m64_pe(pdev, pe_bitmap); if (all && pdev->subordinate) - pnv_ioda2_reserve_m64_pe(pdev->subordinate, - pe_bitmap, all); + pnv_ioda_reserve_m64_pe(pdev->subordinate, + pe_bitmap, all); } } -static unsigned int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all) +static unsigned int pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) { struct pci_controller *hose = pci_bus_to_host(bus); struct pnv_phb *phb = hose->private_data; @@ -283,7 +283,7 @@ static unsigned int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all) } /* Figure out reserved PE numbers by the PE */ - pnv_ioda2_reserve_m64_pe(bus, pe_alloc, all); + pnv_ioda_reserve_m64_pe(bus, pe_alloc, all); /* * the current bus might not own M64 window and that's all @@ -365,8 +365,8 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) /* Use last M64 BAR to cover M64 window */ phb->ioda.m64_bar_idx = 15; phb->init_m64 = pnv_ioda2_init_m64; - phb->reserve_m64_pe = pnv_ioda2_reserve_m64_pe; - phb->pick_m64_pe = pnv_ioda2_pick_m64_pe; + phb->reserve_m64_pe = pnv_ioda_reserve_m64_pe; + phb->pick_m64_pe = pnv_ioda_pick_m64_pe; } static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no) -- cgit v0.10.2 From 9945155143926c1191bc0a1178ac2bb290467370 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 5 May 2016 12:02:13 +1000 Subject: powerpc/powernv/ioda1: M64 support on P7IOC This enables M64 window on P7IOC, which has been enabled on PHB3. Different from PHB3 where 16 M64 BARs are supported and each of them can be owned by one particular PE# exclusively or divided evenly to 256 segments, every P7IOC PHB has 16 M64 BARs and each of them are divided to 8 segments. So every P7IOC PHB supports 128 M64 segments in total. P7IOC has M64DT, which helps mapping one particular M64 segment# to arbitrary PE#. PHB3 doesn't have M64DT, indicating that one M64 segment can only be pinned to the fixed PE#. In order to unified M64 support M64 on P7IOC and PHB3, we just provide 128 M64 segments on every P7IOC PHB and each of them is pinned to the fixed PE# by bypassing the function of M64DT. In turn, we just need different phb->init_m64() for P7IOC and PHB3 and maps M64 segment in pnv_ioda_reserve_m64_pe() for P7IOC, most of the code are shared by them. Signed-off-by: Gavin Shan Reviewed-by: Alistair Popple Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 37f22b0..30a6d79 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -48,6 +48,9 @@ #include "powernv.h" #include "pci.h" +#define PNV_IODA1_M64_NUM 16 /* Number of M64 BARs */ +#define PNV_IODA1_M64_SEGS 8 /* Segments per M64 BAR */ + /* 256M DMA window, 4K TCE pages, 8 bytes TCE */ #define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) @@ -246,6 +249,64 @@ static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev, } } +static int pnv_ioda1_init_m64(struct pnv_phb *phb) +{ + struct resource *r; + int index; + + /* + * There are 16 M64 BARs, each of which has 8 segments. So + * there are as many M64 segments as the maximum number of + * PEs, which is 128. + */ + for (index = 0; index < PNV_IODA1_M64_NUM; index++) { + unsigned long base, segsz = phb->ioda.m64_segsize; + int64_t rc; + + base = phb->ioda.m64_base + + index * PNV_IODA1_M64_SEGS * segsz; + rc = opal_pci_set_phb_mem_window(phb->opal_id, + OPAL_M64_WINDOW_TYPE, index, base, 0, + PNV_IODA1_M64_SEGS * segsz); + if (rc != OPAL_SUCCESS) { + pr_warn(" Error %lld setting M64 PHB#%d-BAR#%d\n", + rc, phb->hose->global_number, index); + goto fail; + } + + rc = opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, index, + OPAL_ENABLE_M64_SPLIT); + if (rc != OPAL_SUCCESS) { + pr_warn(" Error %lld enabling M64 PHB#%d-BAR#%d\n", + rc, phb->hose->global_number, index); + goto fail; + } + } + + /* + * Exclude the segment used by the reserved PE, which + * is expected to be 0 or last supported PE#. + */ + r = &phb->hose->mem_resources[1]; + if (phb->ioda.reserved_pe_idx == 0) + r->start += phb->ioda.m64_segsize; + else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) + r->end -= phb->ioda.m64_segsize; + else + WARN(1, "Wrong reserved PE#%d on PHB#%d\n", + phb->ioda.reserved_pe_idx, phb->hose->global_number); + + return 0; + +fail: + for ( ; index >= 0; index--) + opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64); + + return -EIO; +} + static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus, unsigned long *pe_bitmap, bool all) @@ -315,6 +376,26 @@ static unsigned int pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) pe->master = master_pe; list_add_tail(&pe->list, &master_pe->slaves); } + + /* + * P7IOC supports M64DT, which helps mapping M64 segment + * to one particular PE#. However, PHB3 has fixed mapping + * between M64 segment and PE#. In order to have same logic + * for P7IOC and PHB3, we enforce fixed mapping between M64 + * segment and PE# on P7IOC. + */ + if (phb->type == PNV_PHB_IODA1) { + int64_t rc; + + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + pe->pe_number, OPAL_M64_WINDOW_TYPE, + pe->pe_number / PNV_IODA1_M64_SEGS, + pe->pe_number % PNV_IODA1_M64_SEGS); + if (rc != OPAL_SUCCESS) + pr_warn("%s: Error %lld mapping M64 for PHB#%d-PE#%d\n", + __func__, rc, phb->hose->global_number, + pe->pe_number); + } } kfree(pe_alloc); @@ -329,8 +410,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) const u32 *r; u64 pci_addr; - /* FIXME: Support M64 for P7IOC */ - if (phb->type != PNV_PHB_IODA2) { + if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) { pr_info(" Not support M64 window\n"); return; } @@ -364,7 +444,10 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) /* Use last M64 BAR to cover M64 window */ phb->ioda.m64_bar_idx = 15; - phb->init_m64 = pnv_ioda2_init_m64; + if (phb->type == PNV_PHB_IODA1) + phb->init_m64 = pnv_ioda1_init_m64; + else + phb->init_m64 = pnv_ioda2_init_m64; phb->reserve_m64_pe = pnv_ioda_reserve_m64_pe; phb->pick_m64_pe = pnv_ioda_pick_m64_pe; } -- cgit v0.10.2 From b30d936f6f3ee357e48b7f50754b0855d87f1cce Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:32 +1000 Subject: powerpc/powernv/ioda1: Rename pnv_pci_ioda_setup_dma_pe() This renames pnv_pci_ioda_setup_dma_pe() to pnv_pci_ioda1_setup_dma_pe() as it's the counter-part of IODA2's pnv_pci_ioda2_setup_dma_pe(). No logical changes introduced. Signed-off-by: Gavin Shan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 30a6d79..faee623 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2029,9 +2029,10 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = { .free = pnv_ioda2_table_free, }; -static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, - struct pnv_ioda_pe *pe, unsigned int base, - unsigned int segs) +static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe, + unsigned int base, + unsigned int segs) { struct page *tce_mem = NULL; @@ -2619,7 +2620,7 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb) if (phb->type == PNV_PHB_IODA1) { pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n", pe->dma_weight, segs); - pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs); + pnv_pci_ioda1_setup_dma_pe(phb, pe, base, segs); } else if (phb->type == PNV_PHB_IODA2) { pe_info(pe, "Assign DMA32 space\n"); segs = 0; -- cgit v0.10.2 From acce971c0e72c63cc2247e2412375e13a0be1ee1 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:33 +1000 Subject: powerpc/powernv/ioda1: Introduce PNV_IODA1_DMA32_SEGSIZE Currently, there is one macro (TCE32_TABLE_SIZE) representing the TCE table size for one DMA32 segment. The constant representing the DMA32 segment size (1 << 28) is still used in the code. This defines PNV_IODA1_DMA32_SEGSIZE representing one DMA32 segment size. the TCE table size can be calcualted when the page has fixed 4KB size. So all the related calculation depends on one macro (PNV_IODA1_DMA32_SEGSIZE). No logical changes introduced. Signed-off-by: Gavin Shan Reviewed-By: Alistair Popple Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index faee623..4b08cef 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -50,9 +50,7 @@ #define PNV_IODA1_M64_NUM 16 /* Number of M64 BARs */ #define PNV_IODA1_M64_SEGS 8 /* Segments per M64 BAR */ - -/* 256M DMA window, 4K TCE pages, 8 bytes TCE */ -#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) +#define PNV_IODA1_DMA32_SEGSIZE 0x10000000 #define POWERNV_IOMMU_DEFAULT_LEVELS 1 #define POWERNV_IOMMU_MAX_LEVELS 5 @@ -2037,7 +2035,7 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, struct page *tce_mem = NULL; struct iommu_table *tbl; - unsigned int i; + unsigned int tce32_segsz, i; int64_t rc; void *addr; @@ -2057,29 +2055,34 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, /* Grab a 32-bit TCE table */ pe->tce32_seg = base; pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n", - (base << 28), ((base + segs) << 28) - 1); + base * PNV_IODA1_DMA32_SEGSIZE, + (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1); /* XXX Currently, we allocate one big contiguous table for the * TCEs. We only really need one chunk per 256M of TCE space * (ie per segment) but that's an optimization for later, it * requires some added smarts with our get/put_tce implementation + * + * Each TCE page is 4KB in size and each TCE entry occupies 8 + * bytes */ + tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3); tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, - get_order(TCE32_TABLE_SIZE * segs)); + get_order(tce32_segsz * segs)); if (!tce_mem) { pe_err(pe, " Failed to allocate a 32-bit TCE memory\n"); goto fail; } addr = page_address(tce_mem); - memset(addr, 0, TCE32_TABLE_SIZE * segs); + memset(addr, 0, tce32_segsz * segs); /* Configure HW */ for (i = 0; i < segs; i++) { rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, base + i, 1, - __pa(addr) + TCE32_TABLE_SIZE * i, - TCE32_TABLE_SIZE, 0x1000); + __pa(addr) + tce32_segsz * i, + tce32_segsz, IOMMU_PAGE_SIZE_4K); if (rc) { pe_err(pe, " Failed to configure 32-bit TCE table," " err %ld\n", rc); @@ -2088,8 +2091,9 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, } /* Setup linux iommu table */ - pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, - base << 28, IOMMU_PAGE_SHIFT_4K); + pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs, + base * PNV_IODA1_DMA32_SEGSIZE, + IOMMU_PAGE_SHIFT_4K); /* OPAL variant of P7IOC SW invalidated TCEs */ if (phb->ioda.tce_inval_reg) @@ -2119,7 +2123,7 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, if (pe->tce32_seg >= 0) pe->tce32_seg = -1; if (tce_mem) - __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); + __free_pages(tce_mem, get_order(tce32_segsz * segs)); if (tbl) { pnv_pci_unlink_table_and_group(tbl, &pe->table_group); iommu_free_table(tbl, "pnv"); @@ -3456,7 +3460,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, mutex_init(&phb->ioda.pe_list_mutex); /* Calculate how many 32-bit TCE segments we have */ - phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28; + phb->ioda.tce32_count = phb->ioda.m32_pci_base / + PNV_IODA1_DMA32_SEGSIZE; #if 0 /* We should really do that ... */ rc = opal_pci_set_phb_mem_window(opal->phb_id, -- cgit v0.10.2 From 801846d1deff943114f6f468b21386247b008321 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:34 +1000 Subject: powerpc/powernv: Remove DMA32 PE list PEs are put into PHB DMA32 list (phb->ioda.pe_dma_list) according to their DMA32 weight. The PEs on the list are iterated to setup their TCE32 tables at system booting time. The list is used for once at boot time and no need to keep it. This moves the logic calculating DMA32 weight of PHB and PE to pnv_ioda_setup_dma() to drop PHB's DMA32 list. Also, every PE traces the consumed DMA32 segment by @tce32_seg and @tce32_segcount are useless and they're removed. Signed-off-by: Gavin Shan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 4b08cef..30c825a 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -890,44 +890,6 @@ out: return 0; } -static void pnv_ioda_link_pe_by_weight(struct pnv_phb *phb, - struct pnv_ioda_pe *pe) -{ - struct pnv_ioda_pe *lpe; - - list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) { - if (lpe->dma_weight < pe->dma_weight) { - list_add_tail(&pe->dma_link, &lpe->dma_link); - return; - } - } - list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list); -} - -static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev) -{ - /* This is quite simplistic. The "base" weight of a device - * is 10. 0 means no DMA is to be accounted for it. - */ - - /* If it's a bridge, no DMA */ - if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) - return 0; - - /* Reduce the weight of slow USB controllers */ - if (dev->class == PCI_CLASS_SERIAL_USB_UHCI || - dev->class == PCI_CLASS_SERIAL_USB_OHCI || - dev->class == PCI_CLASS_SERIAL_USB_EHCI) - return 3; - - /* Increase the weight of RAID (includes Obsidian) */ - if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) - return 15; - - /* Default */ - return 10; -} - #ifdef CONFIG_PCI_IOV static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) { @@ -1032,7 +994,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) pe->flags = PNV_IODA_PE_DEV; pe->pdev = dev; pe->pbus = NULL; - pe->tce32_seg = -1; pe->mve_number = -1; pe->rid = dev->bus->number << 8 | pdn->devfn; @@ -1048,16 +1009,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) return NULL; } - /* Assign a DMA weight to the device */ - pe->dma_weight = pnv_ioda_dma_weight(dev); - if (pe->dma_weight != 0) { - phb->ioda.dma_weight += pe->dma_weight; - phb->ioda.dma_pe_count++; - } - - /* Link the PE */ - pnv_ioda_link_pe_by_weight(phb, pe); - return pe; } @@ -1075,7 +1026,6 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) } pdn->pcidev = dev; pdn->pe_number = pe->pe_number; - pe->dma_weight += pnv_ioda_dma_weight(dev); if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) pnv_ioda_setup_same_PE(dev->subordinate, pe); } @@ -1112,10 +1062,8 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS); pe->pbus = bus; pe->pdev = NULL; - pe->tce32_seg = -1; pe->mve_number = -1; pe->rid = bus->busn_res.start << 8; - pe->dma_weight = 0; if (all) pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n", @@ -1137,17 +1085,6 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) /* Put PE to the list */ list_add_tail(&pe->list, &phb->ioda.pe_list); - - /* Account for one DMA PE if at least one DMA capable device exist - * below the bridge - */ - if (pe->dma_weight != 0) { - phb->ioda.dma_weight += pe->dma_weight; - phb->ioda.dma_pe_count++; - } - - /* Link the PE */ - pnv_ioda_link_pe_by_weight(phb, pe); } static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) @@ -1188,7 +1125,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) rid = npu_pdev->bus->number << 8 | npu_pdn->devfn; npu_pdn->pcidev = npu_pdev; npu_pdn->pe_number = pe_num; - pe->dma_weight += pnv_ioda_dma_weight(npu_pdev); phb->ioda.pe_rmap[rid] = pe->pe_number; /* Map the PE to this link */ @@ -1536,7 +1472,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) pe->flags = PNV_IODA_PE_VF; pe->pbus = NULL; pe->parent_dev = pdev; - pe->tce32_seg = -1; pe->mve_number = -1; pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) | pci_iov_virtfn_devfn(pdev, vf_index); @@ -2027,6 +1962,54 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = { .free = pnv_ioda2_table_free, }; +static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data) +{ + unsigned int *weight = (unsigned int *)data; + + /* This is quite simplistic. The "base" weight of a device + * is 10. 0 means no DMA is to be accounted for it. + */ + if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) + return 0; + + if (dev->class == PCI_CLASS_SERIAL_USB_UHCI || + dev->class == PCI_CLASS_SERIAL_USB_OHCI || + dev->class == PCI_CLASS_SERIAL_USB_EHCI) + *weight += 3; + else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) + *weight += 15; + else + *weight += 10; + + return 0; +} + +static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe) +{ + unsigned int weight = 0; + + /* SRIOV VF has same DMA32 weight as its PF */ +#ifdef CONFIG_PCI_IOV + if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) { + pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight); + return weight; + } +#endif + + if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) { + pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight); + } else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) { + struct pci_dev *pdev; + + list_for_each_entry(pdev, &pe->pbus->devices, bus_list) + pnv_pci_ioda_dev_dma_weight(pdev, &weight); + } else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) { + pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight); + } + + return weight; +} + static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe, unsigned int base, @@ -2043,17 +2026,12 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */ /* XXX FIXME: Allocate multi-level tables on PHB3 */ - /* We shouldn't already have a 32-bit DMA associated */ - if (WARN_ON(pe->tce32_seg >= 0)) - return; - tbl = pnv_pci_table_alloc(phb->hose->node); iommu_register_group(&pe->table_group, phb->hose->global_number, pe->pe_number); pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); /* Grab a 32-bit TCE table */ - pe->tce32_seg = base; pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n", base * PNV_IODA1_DMA32_SEGSIZE, (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1); @@ -2120,8 +2098,6 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, return; fail: /* XXX Failure: Try to fallback to 64-bit only ? */ - if (pe->tce32_seg >= 0) - pe->tce32_seg = -1; if (tce_mem) __free_pages(tce_mem, get_order(tce32_segsz * segs)); if (tbl) { @@ -2532,10 +2508,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, { int64_t rc; - /* We shouldn't already have a 32-bit DMA associated */ - if (WARN_ON(pe->tce32_seg >= 0)) - return; - /* TVE #1 is selected by PCI address bit 59 */ pe->tce_bypass_base = 1ull << 59; @@ -2543,7 +2515,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, pe->pe_number); /* The PE will reserve all possible 32-bits space */ - pe->tce32_seg = 0; pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n", phb->ioda.m32_pci_base); @@ -2559,11 +2530,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, #endif rc = pnv_pci_ioda2_setup_default_config(pe); - if (rc) { - if (pe->tce32_seg >= 0) - pe->tce32_seg = -1; + if (rc) return; - } if (pe->flags & PNV_IODA_PE_DEV) iommu_add_device(&pe->pdev->dev); @@ -2574,24 +2542,35 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, static void pnv_ioda_setup_dma(struct pnv_phb *phb) { struct pci_controller *hose = phb->hose; - unsigned int residual, remaining, segs, tw, base; + unsigned int weight, total_weight, dma_pe_count; + unsigned int residual, remaining, segs, base; struct pnv_ioda_pe *pe; + total_weight = 0; + pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight, + &total_weight); + + dma_pe_count = 0; + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + weight = pnv_pci_ioda_pe_dma_weight(pe); + if (weight > 0) + dma_pe_count++; + } + /* If we have more PE# than segments available, hand out one * per PE until we run out and let the rest fail. If not, * then we assign at least one segment per PE, plus more based * on the amount of devices under that PE */ - if (phb->ioda.dma_pe_count > phb->ioda.tce32_count) + if (dma_pe_count > phb->ioda.tce32_count) residual = 0; else - residual = phb->ioda.tce32_count - - phb->ioda.dma_pe_count; + residual = phb->ioda.tce32_count - dma_pe_count; pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n", hose->global_number, phb->ioda.tce32_count); pr_info("PCI: %d PE# for a total weight of %d\n", - phb->ioda.dma_pe_count, phb->ioda.dma_weight); + dma_pe_count, total_weight); pnv_pci_ioda_setup_opal_tce_kill(phb); @@ -2600,18 +2579,20 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb) * weight */ remaining = phb->ioda.tce32_count; - tw = phb->ioda.dma_weight; base = 0; - list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) { - if (!pe->dma_weight) + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + weight = pnv_pci_ioda_pe_dma_weight(pe); + if (!weight) continue; + if (!remaining) { pe_warn(pe, "No DMA32 resources available\n"); continue; } segs = 1; if (residual) { - segs += ((pe->dma_weight * residual) + (tw / 2)) / tw; + segs += ((weight * residual) + (total_weight / 2)) / + total_weight; if (segs > remaining) segs = remaining; } @@ -2623,7 +2604,7 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb) */ if (phb->type == PNV_PHB_IODA1) { pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n", - pe->dma_weight, segs); + weight, segs); pnv_pci_ioda1_setup_dma_pe(phb, pe, base, segs); } else if (phb->type == PNV_PHB_IODA2) { pe_info(pe, "Assign DMA32 space\n"); @@ -3167,13 +3148,18 @@ static void pnv_npu_ioda_fixup(void) struct pci_controller *hose, *tmp; struct pnv_phb *phb; struct pnv_ioda_pe *pe; + unsigned int weight; list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { phb = hose->private_data; if (phb->type != PNV_PHB_NPU) continue; - list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) { + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + weight = pnv_pci_ioda_pe_dma_weight(pe); + if (WARN_ON(!weight)) + continue; + enable_bypass = dma_get_mask(&pe->pdev->dev) == DMA_BIT_MASK(64); pnv_npu_init_dma_pe(pe); @@ -3455,7 +3441,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->ioda.pe_array = aux + pemap_off; set_bit(phb->ioda.reserved_pe_idx, phb->ioda.pe_alloc); - INIT_LIST_HEAD(&phb->ioda.pe_dma_list); INIT_LIST_HEAD(&phb->ioda.pe_list); mutex_init(&phb->ioda.pe_list_mutex); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index a409165..117cfcd 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -53,14 +53,7 @@ struct pnv_ioda_pe { /* PE number */ unsigned int pe_number; - /* "Weight" assigned to the PE for the sake of DMA resource - * allocations - */ - unsigned int dma_weight; - /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ - int tce32_seg; - int tce32_segcount; struct iommu_table_group table_group; /* 64-bit TCE bypass region */ @@ -78,7 +71,6 @@ struct pnv_ioda_pe { struct list_head slaves; /* Link in list of PE#s */ - struct list_head dma_link; struct list_head list; }; @@ -169,17 +161,6 @@ struct pnv_phb { /* 32-bit TCE tables allocation */ unsigned long tce32_count; - /* Total "weight" for the sake of DMA resources - * allocation - */ - unsigned int dma_weight; - unsigned int dma_pe_count; - - /* Sorted list of used PE's, sorted at - * boot for resource allocation purposes - */ - struct list_head pe_dma_list; - /* TCE cache invalidate registers (physical and * remapped) */ -- cgit v0.10.2 From 2b923ed1bd96cc471bb0f0eddfed6ae4753d717b Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 5 May 2016 12:04:16 +1000 Subject: powerpc/powernv/ioda1: Improve DMA32 segment track In current implementation, the DMA32 segments required by one specific PE isn't calculated with the information hold in the PE independently. It conflicts with the PCI hotplug design: PE centralized, meaning the PE's DMA32 segments should be calculated from the information hold in the PE independently. This introduces an array (@dma32_segmap) for every PHB to track the DMA32 segmeng usage. Besides, this moves the logic calculating PE's consumed DMA32 segments to pnv_pci_ioda1_setup_dma_pe() so that PE's DMA32 segments are calculated/allocated from the information hold in the PE (DMA32 weight). Also the logic is improved: we try to allocate as much DMA32 segments as we can. It's acceptable that number of DMA32 segments less than the expected number are allocated. Signed-off-by: Gavin Shan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 30c825a..98879a0 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2011,27 +2011,62 @@ static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe) } static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, - struct pnv_ioda_pe *pe, - unsigned int base, - unsigned int segs) + struct pnv_ioda_pe *pe) { struct page *tce_mem = NULL; struct iommu_table *tbl; - unsigned int tce32_segsz, i; + unsigned int weight, total_weight = 0; + unsigned int tce32_segsz, base, segs, avail, i; int64_t rc; void *addr; /* XXX FIXME: Handle 64-bit only DMA devices */ /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */ /* XXX FIXME: Allocate multi-level tables on PHB3 */ + weight = pnv_pci_ioda_pe_dma_weight(pe); + if (!weight) + return; + + pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight, + &total_weight); + segs = (weight * phb->ioda.dma32_count) / total_weight; + if (!segs) + segs = 1; + /* + * Allocate contiguous DMA32 segments. We begin with the expected + * number of segments. With one more attempt, the number of DMA32 + * segments to be allocated is decreased by one until one segment + * is allocated successfully. + */ + do { + for (base = 0; base <= phb->ioda.dma32_count - segs; base++) { + for (avail = 0, i = base; i < base + segs; i++) { + if (phb->ioda.dma32_segmap[i] == + IODA_INVALID_PE) + avail++; + } + + if (avail == segs) + goto found; + } + } while (--segs); + + if (!segs) { + pe_warn(pe, "No available DMA32 segments\n"); + return; + } + +found: tbl = pnv_pci_table_alloc(phb->hose->node); iommu_register_group(&pe->table_group, phb->hose->global_number, pe->pe_number); pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); /* Grab a 32-bit TCE table */ + pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n", + weight, total_weight, base, segs); pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n", base * PNV_IODA1_DMA32_SEGSIZE, (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1); @@ -2068,6 +2103,10 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, } } + /* Setup DMA32 segment mapping */ + for (i = base; i < base + segs; i++) + phb->ioda.dma32_segmap[i] = pe->pe_number; + /* Setup linux iommu table */ pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs, base * PNV_IODA1_DMA32_SEGSIZE, @@ -2542,73 +2581,34 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, static void pnv_ioda_setup_dma(struct pnv_phb *phb) { struct pci_controller *hose = phb->hose; - unsigned int weight, total_weight, dma_pe_count; - unsigned int residual, remaining, segs, base; struct pnv_ioda_pe *pe; - - total_weight = 0; - pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight, - &total_weight); - - dma_pe_count = 0; - list_for_each_entry(pe, &phb->ioda.pe_list, list) { - weight = pnv_pci_ioda_pe_dma_weight(pe); - if (weight > 0) - dma_pe_count++; - } + unsigned int weight; /* If we have more PE# than segments available, hand out one * per PE until we run out and let the rest fail. If not, * then we assign at least one segment per PE, plus more based * on the amount of devices under that PE */ - if (dma_pe_count > phb->ioda.tce32_count) - residual = 0; - else - residual = phb->ioda.tce32_count - dma_pe_count; - - pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n", - hose->global_number, phb->ioda.tce32_count); - pr_info("PCI: %d PE# for a total weight of %d\n", - dma_pe_count, total_weight); + pr_info("PCI: Domain %04x has %d available 32-bit DMA segments\n", + hose->global_number, phb->ioda.dma32_count); pnv_pci_ioda_setup_opal_tce_kill(phb); - /* Walk our PE list and configure their DMA segments, hand them - * out one base segment plus any residual segments based on - * weight - */ - remaining = phb->ioda.tce32_count; - base = 0; + /* Walk our PE list and configure their DMA segments */ list_for_each_entry(pe, &phb->ioda.pe_list, list) { weight = pnv_pci_ioda_pe_dma_weight(pe); if (!weight) continue; - if (!remaining) { - pe_warn(pe, "No DMA32 resources available\n"); - continue; - } - segs = 1; - if (residual) { - segs += ((weight * residual) + (total_weight / 2)) / - total_weight; - if (segs > remaining) - segs = remaining; - } - /* * For IODA2 compliant PHB3, we needn't care about the weight. * The all available 32-bits DMA space will be assigned to * the specific PE. */ if (phb->type == PNV_PHB_IODA1) { - pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n", - weight, segs); - pnv_pci_ioda1_setup_dma_pe(phb, pe, base, segs); + pnv_pci_ioda1_setup_dma_pe(phb, pe); } else if (phb->type == PNV_PHB_IODA2) { pe_info(pe, "Assign DMA32 space\n"); - segs = 0; pnv_pci_ioda2_setup_dma_pe(phb, pe); } else if (phb->type == PNV_PHB_NPU) { /* @@ -2618,9 +2618,6 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb) * as the PHB3 TVT. */ } - - remaining -= segs; - base += segs; } } @@ -3327,7 +3324,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, { struct pci_controller *hose; struct pnv_phb *phb; - unsigned long size, m64map_off, m32map_off, pemap_off, iomap_off = 0; + unsigned long size, m64map_off, m32map_off, pemap_off; + unsigned long iomap_off = 0, dma32map_off = 0; const __be64 *prop64; const __be32 *prop32; int len; @@ -3413,6 +3411,10 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num; phb->ioda.io_pci_base = 0; /* XXX calculate this ? */ + /* Calculate how many 32-bit TCE segments we have */ + phb->ioda.dma32_count = phb->ioda.m32_pci_base / + PNV_IODA1_DMA32_SEGSIZE; + /* Allocate aux data & arrays. We don't have IO ports on PHB3 */ size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long)); m64map_off = size; @@ -3422,6 +3424,9 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, if (phb->type == PNV_PHB_IODA1) { iomap_off = size; size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]); + dma32map_off = size; + size += phb->ioda.dma32_count * + sizeof(phb->ioda.dma32_segmap[0]); } pemap_off = size; size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe); @@ -3437,6 +3442,10 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->ioda.io_segmap = aux + iomap_off; for (segno = 0; segno < phb->ioda.total_pe_num; segno++) phb->ioda.io_segmap[segno] = IODA_INVALID_PE; + + phb->ioda.dma32_segmap = aux + dma32map_off; + for (segno = 0; segno < phb->ioda.dma32_count; segno++) + phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE; } phb->ioda.pe_array = aux + pemap_off; set_bit(phb->ioda.reserved_pe_idx, phb->ioda.pe_alloc); @@ -3445,7 +3454,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, mutex_init(&phb->ioda.pe_list_mutex); /* Calculate how many 32-bit TCE segments we have */ - phb->ioda.tce32_count = phb->ioda.m32_pci_base / + phb->ioda.dma32_count = phb->ioda.m32_pci_base / PNV_IODA1_DMA32_SEGSIZE; #if 0 /* We should really do that ... */ diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 117cfcd..14d9391 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -142,6 +142,10 @@ struct pnv_phb { unsigned int *m32_segmap; unsigned int *io_segmap; + /* DMA32 segment maps - IODA1 only */ + unsigned int dma32_count; + unsigned int *dma32_segmap; + /* IRQ chip */ int irq_chip_init; struct irq_chip irq_chip; @@ -158,9 +162,6 @@ struct pnv_phb { */ unsigned char pe_rmap[0x10000]; - /* 32-bit TCE tables allocation */ - unsigned long tce32_count; - /* TCE cache invalidate registers (physical and * remapped) */ -- cgit v0.10.2 From 1e9167726c41b8d2126327379f4f82acbd548ba8 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:36 +1000 Subject: powerpc/powernv: Use PE instead of number during setup and release In current implementation, the PEs that are allocated or picked from the reserved list are identified by PE number. The PE instance has to be picked according to the PE number eventually. We have same issue when PE is released. For pnv_ioda_pick_m64_pe() and pnv_ioda_alloc_pe(), this returns PE instance so that pnv_ioda_setup_bus_PE() can use the allocated or reserved PE instance directly. Also, pnv_ioda_setup_bus_PE() returns the reserved/allocated PE instance to be used in subsequent patches. On the other hand, pnv_ioda_free_pe() uses PE instance (not number) as its argument. No logical changes introduced. Signed-off-by: Gavin Shan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 98879a0..c762f38 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -123,6 +123,14 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags) (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)); } +static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no) +{ + phb->ioda.pe_array[pe_no].phb = phb; + phb->ioda.pe_array[pe_no].pe_number = pe_no; + + return &phb->ioda.pe_array[pe_no]; +} + static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no) { if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe_num)) { @@ -135,11 +143,10 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no) pr_debug("%s: PE %d was reserved on PHB#%x\n", __func__, pe_no, phb->hose->global_number); - phb->ioda.pe_array[pe_no].phb = phb; - phb->ioda.pe_array[pe_no].pe_number = pe_no; + pnv_ioda_init_pe(phb, pe_no); } -static unsigned int pnv_ioda_alloc_pe(struct pnv_phb *phb) +static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb) { unsigned long pe; @@ -147,20 +154,20 @@ static unsigned int pnv_ioda_alloc_pe(struct pnv_phb *phb) pe = find_next_zero_bit(phb->ioda.pe_alloc, phb->ioda.total_pe_num, 0); if (pe >= phb->ioda.total_pe_num) - return IODA_INVALID_PE; + return NULL; } while(test_and_set_bit(pe, phb->ioda.pe_alloc)); - phb->ioda.pe_array[pe].phb = phb; - phb->ioda.pe_array[pe].pe_number = pe; - return pe; + return pnv_ioda_init_pe(phb, pe); } -static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe) +static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) { - WARN_ON(phb->ioda.pe_array[pe].pdev); + struct pnv_phb *phb = pe->phb; - memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe)); - clear_bit(pe, phb->ioda.pe_alloc); + WARN_ON(pe->pdev); + + memset(pe, 0, sizeof(struct pnv_ioda_pe)); + clear_bit(pe->pe_number, phb->ioda.pe_alloc); } /* The default M64 BAR is shared by all PEs */ @@ -320,7 +327,7 @@ static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus, } } -static unsigned int pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) +static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) { struct pci_controller *hose = pci_bus_to_host(bus); struct pnv_phb *phb = hose->private_data; @@ -330,7 +337,7 @@ static unsigned int pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) /* Root bus shouldn't use M64 */ if (pci_is_root_bus(bus)) - return IODA_INVALID_PE; + return NULL; /* Allocate bitmap */ size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long)); @@ -338,7 +345,7 @@ static unsigned int pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) if (!pe_alloc) { pr_warn("%s: Out of memory !\n", __func__); - return IODA_INVALID_PE; + return NULL; } /* Figure out reserved PE numbers by the PE */ @@ -351,7 +358,7 @@ static unsigned int pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) */ if (bitmap_empty(pe_alloc, phb->ioda.total_pe_num)) { kfree(pe_alloc); - return IODA_INVALID_PE; + return NULL; } /* @@ -397,7 +404,7 @@ static unsigned int pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) } kfree(pe_alloc); - return master_pe->pe_number; + return master_pe; } static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) @@ -963,7 +970,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) struct pnv_phb *phb = hose->private_data; struct pci_dn *pdn = pci_get_pdn(dev); struct pnv_ioda_pe *pe; - unsigned int pe_num; if (!pdn) { pr_err("%s: Device tree node not associated properly\n", @@ -973,8 +979,8 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) if (pdn->pe_number != IODA_INVALID_PE) return NULL; - pe_num = pnv_ioda_alloc_pe(phb); - if (pe_num == IODA_INVALID_PE) { + pe = pnv_ioda_alloc_pe(phb); + if (!pe) { pr_warning("%s: Not enough PE# available, disabling device\n", pci_name(dev)); return NULL; @@ -987,10 +993,9 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) * * At some point we want to remove the PDN completely anyways */ - pe = &phb->ioda.pe_array[pe_num]; pci_dev_get(dev); pdn->pcidev = dev; - pdn->pe_number = pe_num; + pdn->pe_number = pe->pe_number; pe->flags = PNV_IODA_PE_DEV; pe->pdev = dev; pe->pbus = NULL; @@ -1001,8 +1006,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) if (pnv_ioda_configure_pe(phb, pe)) { /* XXX What do we do here ? */ - if (pe_num) - pnv_ioda_free_pe(phb, pe_num); + pnv_ioda_free_pe(pe); pdn->pe_number = IODA_INVALID_PE; pe->pdev = NULL; pci_dev_put(dev); @@ -1037,28 +1041,26 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) * subordinate PCI devices and buses. The second type of PE is normally * orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports. */ -static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) +static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) { struct pci_controller *hose = pci_bus_to_host(bus); struct pnv_phb *phb = hose->private_data; - struct pnv_ioda_pe *pe; - unsigned int pe_num = IODA_INVALID_PE; + struct pnv_ioda_pe *pe = NULL; /* Check if PE is determined by M64 */ if (phb->pick_m64_pe) - pe_num = phb->pick_m64_pe(bus, all); + pe = phb->pick_m64_pe(bus, all); /* The PE number isn't pinned by M64 */ - if (pe_num == IODA_INVALID_PE) - pe_num = pnv_ioda_alloc_pe(phb); + if (!pe) + pe = pnv_ioda_alloc_pe(phb); - if (pe_num == IODA_INVALID_PE) { + if (!pe) { pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n", __func__, pci_domain_nr(bus), bus->number); - return; + return NULL; } - pe = &phb->ioda.pe_array[pe_num]; pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS); pe->pbus = bus; pe->pdev = NULL; @@ -1067,17 +1069,16 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) if (all) pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n", - bus->busn_res.start, bus->busn_res.end, pe_num); + bus->busn_res.start, bus->busn_res.end, pe->pe_number); else pe_info(pe, "Secondary bus %d associated with PE#%d\n", - bus->busn_res.start, pe_num); + bus->busn_res.start, pe->pe_number); if (pnv_ioda_configure_pe(phb, pe)) { /* XXX What do we do here ? */ - if (pe_num) - pnv_ioda_free_pe(phb, pe_num); + pnv_ioda_free_pe(pe); pe->pbus = NULL; - return; + return NULL; } /* Associate it with all child devices */ @@ -1085,6 +1086,8 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) /* Put PE to the list */ list_add_tail(&pe->list, &phb->ioda.pe_list); + + return pe; } static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) @@ -1396,7 +1399,7 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) pnv_ioda_deconfigure_pe(phb, pe); - pnv_ioda_free_pe(phb, pe->pe_number); + pnv_ioda_free_pe(pe); } } @@ -1405,6 +1408,7 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) struct pci_bus *bus; struct pci_controller *hose; struct pnv_phb *phb; + struct pnv_ioda_pe *pe; struct pci_dn *pdn; struct pci_sriov *iov; u16 num_vfs, i; @@ -1429,8 +1433,11 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) /* Release PE numbers */ if (pdn->m64_single_mode) { for (i = 0; i < num_vfs; i++) { - if (pdn->pe_num_map[i] != IODA_INVALID_PE) - pnv_ioda_free_pe(phb, pdn->pe_num_map[i]); + if (pdn->pe_num_map[i] == IODA_INVALID_PE) + continue; + + pe = &phb->ioda.pe_array[pdn->pe_num_map[i]]; + pnv_ioda_free_pe(pe); } } else bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); @@ -1483,8 +1490,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) if (pnv_ioda_configure_pe(phb, pe)) { /* XXX What do we do here ? */ - if (pe_num) - pnv_ioda_free_pe(phb, pe_num); + pnv_ioda_free_pe(pe); pe->pdev = NULL; continue; } @@ -1503,6 +1509,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) struct pci_bus *bus; struct pci_controller *hose; struct pnv_phb *phb; + struct pnv_ioda_pe *pe; struct pci_dn *pdn; int ret; u16 i; @@ -1545,11 +1552,13 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) /* Calculate available PE for required VFs */ if (pdn->m64_single_mode) { for (i = 0; i < num_vfs; i++) { - pdn->pe_num_map[i] = pnv_ioda_alloc_pe(phb); - if (pdn->pe_num_map[i] == IODA_INVALID_PE) { + pe = pnv_ioda_alloc_pe(phb); + if (!pe) { ret = -EBUSY; goto m64_failed; } + + pdn->pe_num_map[i] = pe->pe_number; } } else { mutex_lock(&phb->ioda.pe_alloc_mutex); @@ -1594,8 +1603,11 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) m64_failed: if (pdn->m64_single_mode) { for (i = 0; i < num_vfs; i++) { - if (pdn->pe_num_map[i] != IODA_INVALID_PE) - pnv_ioda_free_pe(phb, pdn->pe_num_map[i]); + if (pdn->pe_num_map[i] == IODA_INVALID_PE) + continue; + + pe = &phb->ioda.pe_array[pdn->pe_num_map[i]]; + pnv_ioda_free_pe(pe); } } else bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 14d9391..904f60b 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -105,7 +105,7 @@ struct pnv_phb { int (*init_m64)(struct pnv_phb *phb); void (*reserve_m64_pe)(struct pci_bus *bus, unsigned long *pe_bitmap, bool all); - unsigned int (*pick_m64_pe)(struct pci_bus *bus, bool all); + struct pnv_ioda_pe *(*pick_m64_pe)(struct pci_bus *bus, bool all); int (*get_pe_state)(struct pnv_phb *phb, int pe_no); void (*freeze_pe)(struct pnv_phb *phb, int pe_no); int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt); -- cgit v0.10.2 From bd251b893d46240fc12d9fa9316d9ea6ec2b69a7 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:37 +1000 Subject: powerpc/pci: Rename pcibios_{add, remove}_pci_devices() This renames pcibios_{add,remove}_pci_devices() to avoid conflicts with names of the weak functions in PCI subsystem, which have the prefix "pcibios". No logical changes introduced. Signed-off-by: Gavin Shan Reviewed-By: Alistair Popple Reviewed-by: Andrew Donnellan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 220129f..99027b8 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -260,10 +260,10 @@ static inline struct eeh_dev *pdn_to_eeh_dev(struct pci_dn *pdn) extern struct pci_bus *pcibios_find_pci_bus(struct device_node *dn); /** Remove all of the PCI devices under this bus */ -extern void pcibios_remove_pci_devices(struct pci_bus *bus); +extern void pci_hp_remove_devices(struct pci_bus *bus); /** Discover new pci devices under this bus, and add them */ -extern void pcibios_add_pci_devices(struct pci_bus *bus); +extern void pci_hp_add_devices(struct pci_bus *bus); extern void isa_bridge_find_early(struct pci_controller *hose); diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index fb6207d..618d13c 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -621,7 +621,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * We don't remove the corresponding PE instances because * we need the information afterwords. The attached EEH * devices are expected to be attached soon when calling - * into pcibios_add_pci_devices(). + * into pci_hp_add_devices(). */ eeh_pe_state_mark(pe, EEH_PE_KEEP); if (bus) { @@ -630,7 +630,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, } else { eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); pci_lock_rescan_remove(); - pcibios_remove_pci_devices(bus); + pci_hp_remove_devices(bus); pci_unlock_rescan_remove(); } } else if (frozen_bus) { @@ -681,7 +681,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, if (pe->type & EEH_PE_VF) eeh_add_virt_device(edev, NULL); else - pcibios_add_pci_devices(bus); + pci_hp_add_devices(bus); } else if (frozen_bus && rmv_data->removed) { pr_info("EEH: Sleep 5s ahead of partial hotplug\n"); ssleep(5); @@ -691,7 +691,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, if (pe->type & EEH_PE_VF) eeh_add_virt_device(edev, NULL); else - pcibios_add_pci_devices(frozen_bus); + pci_hp_add_devices(frozen_bus); } eeh_pe_state_clear(pe, EEH_PE_KEEP); @@ -896,7 +896,7 @@ perm_error: eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); pci_lock_rescan_remove(); - pcibios_remove_pci_devices(frozen_bus); + pci_hp_remove_devices(frozen_bus); pci_unlock_rescan_remove(); } } @@ -981,7 +981,7 @@ static void eeh_handle_special_event(void) bus = eeh_pe_bus_get(phb_pe); eeh_pe_dev_traverse(pe, eeh_report_failure, NULL); - pcibios_remove_pci_devices(bus); + pci_hp_remove_devices(bus); } pci_unlock_rescan_remove(); } diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c index 59c4361..2d108e5 100644 --- a/arch/powerpc/kernel/pci-hotplug.c +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -38,20 +38,20 @@ void pcibios_release_device(struct pci_dev *dev) } /** - * pcibios_remove_pci_devices - remove all devices under this bus + * pci_hp_remove_devices - remove all devices under this bus * @bus: the indicated PCI bus * * Remove all of the PCI devices under this bus both from the * linux pci device tree, and from the powerpc EEH address cache. */ -void pcibios_remove_pci_devices(struct pci_bus *bus) +void pci_hp_remove_devices(struct pci_bus *bus) { struct pci_dev *dev, *tmp; struct pci_bus *child_bus; /* First go down child busses */ list_for_each_entry(child_bus, &bus->children, node) - pcibios_remove_pci_devices(child_bus); + pci_hp_remove_devices(child_bus); pr_debug("PCI: Removing devices on bus %04x:%02x\n", pci_domain_nr(bus), bus->number); @@ -60,11 +60,10 @@ void pcibios_remove_pci_devices(struct pci_bus *bus) pci_stop_and_remove_bus_device(dev); } } - -EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices); +EXPORT_SYMBOL_GPL(pci_hp_remove_devices); /** - * pcibios_add_pci_devices - adds new pci devices to bus + * pci_hp_add_devices - adds new pci devices to bus * @bus: the indicated PCI bus * * This routine will find and fixup new pci devices under @@ -74,7 +73,7 @@ EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices); * is how this routine differs from other, similar pcibios * routines.) */ -void pcibios_add_pci_devices(struct pci_bus * bus) +void pci_hp_add_devices(struct pci_bus *bus) { int slotno, mode, pass, max; struct pci_dev *dev; @@ -114,4 +113,4 @@ void pcibios_add_pci_devices(struct pci_bus * bus) } pcibios_finish_adding_to_bus(bus); } -EXPORT_SYMBOL_GPL(pcibios_add_pci_devices); +EXPORT_SYMBOL_GPL(pci_hp_add_devices); diff --git a/drivers/pci/hotplug/rpadlpar_core.c b/drivers/pci/hotplug/rpadlpar_core.c index b46b57d..b770d6d 100644 --- a/drivers/pci/hotplug/rpadlpar_core.c +++ b/drivers/pci/hotplug/rpadlpar_core.c @@ -380,7 +380,7 @@ int dlpar_remove_pci_slot(char *drc_name, struct device_node *dn) } /* Remove all devices below slot */ - pcibios_remove_pci_devices(bus); + pci_hp_remove_devices(bus); /* Unmap PCI IO space */ if (pcibios_unmap_io_space(bus)) { diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c index 611f605..8d13202 100644 --- a/drivers/pci/hotplug/rpaphp_core.c +++ b/drivers/pci/hotplug/rpaphp_core.c @@ -404,7 +404,7 @@ static int enable_slot(struct hotplug_slot *hotplug_slot) if (state == PRESENT) { pci_lock_rescan_remove(); - pcibios_add_pci_devices(slot->bus); + pci_hp_add_devices(slot->bus); pci_unlock_rescan_remove(); slot->state = CONFIGURED; } else if (state == EMPTY) { @@ -426,7 +426,7 @@ static int disable_slot(struct hotplug_slot *hotplug_slot) return -EINVAL; pci_lock_rescan_remove(); - pcibios_remove_pci_devices(slot->bus); + pci_hp_remove_devices(slot->bus); pci_unlock_rescan_remove(); vm_unmap_aliases(); diff --git a/drivers/pci/hotplug/rpaphp_pci.c b/drivers/pci/hotplug/rpaphp_pci.c index 7836d69..137a892 100644 --- a/drivers/pci/hotplug/rpaphp_pci.c +++ b/drivers/pci/hotplug/rpaphp_pci.c @@ -116,7 +116,7 @@ int rpaphp_enable_slot(struct slot *slot) } if (list_empty(&bus->devices)) - pcibios_add_pci_devices(bus); + pci_hp_add_devices(bus); if (!list_empty(&bus->devices)) { info->adapter_status = CONFIGURED; -- cgit v0.10.2 From 3773dd258e91e1e14d95cdf17b9d83a1a7b27af1 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:38 +1000 Subject: powerpc/pci: Rename pcibios_find_pci_bus() This renames pcibios_find_pci_bus() to pci_find_bus_by_node() to avoid conflicts with those PCI subsystem weak function names, which have prefix "pcibios". No logical changes introduced. Signed-off-by: Gavin Shan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 99027b8..f0a5b16 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -257,7 +257,7 @@ static inline struct eeh_dev *pdn_to_eeh_dev(struct pci_dn *pdn) #endif /** Find the bus corresponding to the indicated device node */ -extern struct pci_bus *pcibios_find_pci_bus(struct device_node *dn); +extern struct pci_bus *pci_find_bus_by_node(struct device_node *dn); /** Remove all of the PCI devices under this bus */ extern void pci_hp_remove_devices(struct pci_bus *bus); diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index 5d4a3df..aee22b4 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -54,8 +54,7 @@ find_bus_among_children(struct pci_bus *bus, return child; } -struct pci_bus * -pcibios_find_pci_bus(struct device_node *dn) +struct pci_bus *pci_find_bus_by_node(struct device_node *dn) { struct pci_dn *pdn = dn->data; @@ -64,7 +63,7 @@ pcibios_find_pci_bus(struct device_node *dn) return find_bus_among_children(pdn->phb->bus, dn); } -EXPORT_SYMBOL_GPL(pcibios_find_pci_bus); +EXPORT_SYMBOL_GPL(pci_find_bus_by_node); struct pci_controller *init_phb_dynamic(struct device_node *dn) { diff --git a/drivers/pci/hotplug/rpadlpar_core.c b/drivers/pci/hotplug/rpadlpar_core.c index b770d6d..dc67f39 100644 --- a/drivers/pci/hotplug/rpadlpar_core.c +++ b/drivers/pci/hotplug/rpadlpar_core.c @@ -175,7 +175,7 @@ static int dlpar_add_pci_slot(char *drc_name, struct device_node *dn) struct pci_dev *dev; struct pci_controller *phb; - if (pcibios_find_pci_bus(dn)) + if (pci_find_bus_by_node(dn)) return -EINVAL; /* Add pci bus */ @@ -212,7 +212,7 @@ static int dlpar_remove_phb(char *drc_name, struct device_node *dn) struct pci_dn *pdn; int rc = 0; - if (!pcibios_find_pci_bus(dn)) + if (!pci_find_bus_by_node(dn)) return -EINVAL; /* If pci slot is hotpluggable, use hotplug to remove it */ @@ -356,7 +356,7 @@ int dlpar_remove_pci_slot(char *drc_name, struct device_node *dn) pci_lock_rescan_remove(); - bus = pcibios_find_pci_bus(dn); + bus = pci_find_bus_by_node(dn); if (!bus) { ret = -EINVAL; goto out; diff --git a/drivers/pci/hotplug/rpaphp_pci.c b/drivers/pci/hotplug/rpaphp_pci.c index 137a892..ea41ea1 100644 --- a/drivers/pci/hotplug/rpaphp_pci.c +++ b/drivers/pci/hotplug/rpaphp_pci.c @@ -93,7 +93,7 @@ int rpaphp_enable_slot(struct slot *slot) if (rc) return rc; - bus = pcibios_find_pci_bus(slot->dn); + bus = pci_find_bus_by_node(slot->dn); if (!bus) { err("%s: no pci_bus for dn %s\n", __func__, slot->dn->full_name); return -EINVAL; -- cgit v0.10.2 From 6384d97780c123d2e167242862ed4ea9051210e4 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:39 +1000 Subject: powerpc/pci: Move pci_find_bus_by_node() around This moves pci_find_bus_by_node() from arch/powerpc/platforms/ pseries/pci_dlpar.c to arch/powerpc/kernel/pci-hotplug.c so that the function can be used by pSeries and PowerNV platform at the same time. Also, below cleanup applied. No functional changes introduced. * Remove variable "busdn" in find_bus_among_children() * Use PCI_DN() to convert device node to pci_dn Signed-off-by: Gavin Shan Acked-by: Benjamin Herrenschmidt Reviewed-by: Andrew Donnellan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c index 2d108e5..46587a1 100644 --- a/arch/powerpc/kernel/pci-hotplug.c +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -21,6 +21,35 @@ #include #include +static struct pci_bus *find_bus_among_children(struct pci_bus *bus, + struct device_node *dn) +{ + struct pci_bus *child = NULL; + struct pci_bus *tmp; + + if (pci_bus_to_OF_node(bus) == dn) + return bus; + + list_for_each_entry(tmp, &bus->children, node) { + child = find_bus_among_children(tmp, dn); + if (child) + break; + } + + return child; +} + +struct pci_bus *pci_find_bus_by_node(struct device_node *dn) +{ + struct pci_dn *pdn = PCI_DN(dn); + + if (!pdn || !pdn->phb || !pdn->phb->bus) + return NULL; + + return find_bus_among_children(pdn->phb->bus, dn); +} +EXPORT_SYMBOL_GPL(pci_find_bus_by_node); + /** * pcibios_release_device - release PCI device * @dev: PCI device diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index aee22b4..906dbaa 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -34,37 +34,6 @@ #include "pseries.h" -static struct pci_bus * -find_bus_among_children(struct pci_bus *bus, - struct device_node *dn) -{ - struct pci_bus *child = NULL; - struct pci_bus *tmp; - struct device_node *busdn; - - busdn = pci_bus_to_OF_node(bus); - if (busdn == dn) - return bus; - - list_for_each_entry(tmp, &bus->children, node) { - child = find_bus_among_children(tmp, dn); - if (child) - break; - }; - return child; -} - -struct pci_bus *pci_find_bus_by_node(struct device_node *dn) -{ - struct pci_dn *pdn = dn->data; - - if (!pdn || !pdn->phb || !pdn->phb->bus) - return NULL; - - return find_bus_among_children(pdn->phb->bus, dn); -} -EXPORT_SYMBOL_GPL(pci_find_bus_by_node); - struct pci_controller *init_phb_dynamic(struct device_node *dn) { struct pci_controller *phb; -- cgit v0.10.2 From d8f66f411e2dc1c27413f0ba73f49bccfa998f1b Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:40 +1000 Subject: powerpc/pci: Export pci_add_device_node_info() This renames update_dn_pci_info() to pci_add_device_node_info() with corresponding adjustment on the parameter type and exports it. The function is used to create pdn (struct pci_dn) for the indicated device node. Another function add_pdn(), almost wrapper of pci_add_device_node_info(), to be used in traverse_pci_devices(). No logical changes introduced. Signed-off-by: Gavin Shan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index f0a5b16..07b94ec 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -235,7 +235,8 @@ extern struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus, extern struct pci_dn *pci_get_pdn(struct pci_dev *pdev); extern struct pci_dn *add_dev_pci_data(struct pci_dev *pdev); extern void remove_dev_pci_data(struct pci_dev *pdev); -extern void *update_dn_pci_info(struct device_node *dn, void *data); +extern struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, + struct device_node *dn); static inline int pci_device_from_OF_node(struct device_node *np, u8 *bus, u8 *devfn) diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index 38102cb..0a249ff 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -282,13 +282,9 @@ void remove_dev_pci_data(struct pci_dev *pdev) #endif /* CONFIG_PCI_IOV */ } -/* - * Traverse_func that inits the PCI fields of the device node. - * NOTE: this *must* be done before read/write config to the device. - */ -void *update_dn_pci_info(struct device_node *dn, void *data) +struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, + struct device_node *dn) { - struct pci_controller *phb = data; const __be32 *type = of_get_property(dn, "ibm,pci-config-space-type", NULL); const __be32 *regs; struct device_node *parent; @@ -299,7 +295,7 @@ void *update_dn_pci_info(struct device_node *dn, void *data) return NULL; dn->data = pdn; pdn->node = dn; - pdn->phb = phb; + pdn->phb = hose; #ifdef CONFIG_PPC_POWERNV pdn->pe_number = IODA_INVALID_PE; #endif @@ -331,8 +327,9 @@ void *update_dn_pci_info(struct device_node *dn, void *data) if (pdn->parent) list_add_tail(&pdn->list, &pdn->parent->child_list); - return NULL; + return pdn; } +EXPORT_SYMBOL_GPL(pci_add_device_node_info); /* * Traverse a device tree stopping each PCI device in the tree. @@ -432,6 +429,18 @@ void *traverse_pci_dn(struct pci_dn *root, return NULL; } +static void *add_pdn(struct device_node *dn, void *data) +{ + struct pci_controller *hose = data; + struct pci_dn *pdn; + + pdn = pci_add_device_node_info(hose, dn); + if (!pdn) + return ERR_PTR(-ENOMEM); + + return NULL; +} + /** * pci_devs_phb_init_dynamic - setup pci devices under this PHB * phb: pci-to-host bridge (top-level bridge connecting to cpu) @@ -446,8 +455,7 @@ void pci_devs_phb_init_dynamic(struct pci_controller *phb) struct pci_dn *pdn; /* PHB nodes themselves must not match */ - update_dn_pci_info(dn, phb); - pdn = dn->data; + pdn = pci_add_device_node_info(phb, dn); if (pdn) { pdn->devfn = pdn->busno = -1; pdn->vendor_id = pdn->device_id = pdn->class_code = 0; @@ -456,7 +464,7 @@ void pci_devs_phb_init_dynamic(struct pci_controller *phb) } /* Update dn->phb ptrs for new phb and children devices */ - traverse_pci_devices(dn, update_dn_pci_info, phb); + traverse_pci_devices(dn, add_pdn, phb); } /** diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index fa73494..9883bc7 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -267,7 +267,7 @@ static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long act pdn = parent ? PCI_DN(parent) : NULL; if (pdn) { /* Create pdn and EEH device */ - update_dn_pci_info(np, pdn->phb); + pci_add_device_node_info(pdn->phb, np); eeh_dev_init(PCI_DN(np), pdn->phb); } -- cgit v0.10.2 From de5a28ac5a2d9619dc82286b08dadca577133bbe Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:41 +1000 Subject: powerpc/pci: Introduce pci_remove_device_node_info() This implements and exports pci_remove_device_node_info(). It's used to remove the pdn (struct pci_dn) for the indicated device node. The function is going to be used by PowerNV PCI hotplug driver. Signed-off-by: Gavin Shan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 07b94ec..467c0b0 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -237,6 +237,7 @@ extern struct pci_dn *add_dev_pci_data(struct pci_dev *pdev); extern void remove_dev_pci_data(struct pci_dev *pdev); extern struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, struct device_node *dn); +extern void pci_remove_device_node_info(struct device_node *dn); static inline int pci_device_from_OF_node(struct device_node *np, u8 *bus, u8 *devfn) diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index 0a249ff..ce10281 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -331,6 +331,29 @@ struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, } EXPORT_SYMBOL_GPL(pci_add_device_node_info); +void pci_remove_device_node_info(struct device_node *dn) +{ + struct pci_dn *pdn = dn ? PCI_DN(dn) : NULL; +#ifdef CONFIG_EEH + struct eeh_dev *edev = pdn_to_eeh_dev(pdn); + + if (edev) + edev->pdn = NULL; +#endif + + if (!pdn) + return; + + WARN_ON(!list_empty(&pdn->child_list)); + list_del(&pdn->list); + if (pdn->parent) + of_node_put(pdn->parent->node); + + dn->data = NULL; + kfree(pdn); +} +EXPORT_SYMBOL_GPL(pci_remove_device_node_info); + /* * Traverse a device tree stopping each PCI device in the tree. * This is done depth first. As each node is processed, a "pre" -- cgit v0.10.2 From cdddc577d956d55a62581ba3681f8ba66eca3873 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:42 +1000 Subject: powerpc/pci: Export pci_traverse_device_nodes() This renames traverse_pci_devices() to pci_traverse_device_nodes(). The function traverses all subordinate device nodes of the specified one. Also, below cleanup applied to the function. No logical changes introduced. * Rename "pre" to "fn". * Avoid assignment in if condition reported from checkpatch.pl. Signed-off-by: Gavin Shan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index ca0c5bf..8753e4e 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -33,9 +33,9 @@ extern struct pci_dev *isa_bridge_pcidev; /* may be NULL if no ISA bus */ struct device_node; struct pci_dn; -typedef void *(*traverse_func)(struct device_node *me, void *data); -void *traverse_pci_devices(struct device_node *start, traverse_func pre, - void *data); +void *pci_traverse_device_nodes(struct device_node *start, + void *(*fn)(struct device_node *, void *), + void *data); void *traverse_pci_dn(struct pci_dn *root, void *(*fn)(struct pci_dn *, void *), void *data); diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index ce10281..ecdccce 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -372,8 +372,9 @@ EXPORT_SYMBOL_GPL(pci_remove_device_node_info); * one of these nodes we also assume its siblings are non-pci for * performance. */ -void *traverse_pci_devices(struct device_node *start, traverse_func pre, - void *data) +void *pci_traverse_device_nodes(struct device_node *start, + void *(*fn)(struct device_node *, void *), + void *data) { struct device_node *dn, *nextdn; void *ret; @@ -388,8 +389,11 @@ void *traverse_pci_devices(struct device_node *start, traverse_func pre, if (classp) class = of_read_number(classp, 1); - if (pre && ((ret = pre(dn, data)) != NULL)) - return ret; + if (fn) { + ret = fn(dn, data); + if (ret) + return ret; + } /* If we are a PCI bridge, go down */ if (dn->child && ((class >> 8) == PCI_CLASS_BRIDGE_PCI || @@ -411,6 +415,7 @@ void *traverse_pci_devices(struct device_node *start, traverse_func pre, } return NULL; } +EXPORT_SYMBOL_GPL(pci_traverse_device_nodes); static struct pci_dn *pci_dn_next_one(struct pci_dn *root, struct pci_dn *pdn) @@ -487,7 +492,7 @@ void pci_devs_phb_init_dynamic(struct pci_controller *phb) } /* Update dn->phb ptrs for new phb and children devices */ - traverse_pci_devices(dn, add_pdn, phb); + pci_traverse_device_nodes(dn, add_pdn, phb); } /** diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 272e9ec..543a638 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -305,7 +305,7 @@ static int msi_quota_for_device(struct pci_dev *dev, int request) memset(&counts, 0, sizeof(struct msi_counts)); /* Work out how many devices we have below this PE */ - traverse_pci_devices(pe_dn, count_non_bridge_devices, &counts); + pci_traverse_device_nodes(pe_dn, count_non_bridge_devices, &counts); if (counts.num_devices == 0) { pr_err("rtas_msi: found 0 devices under PE for %s\n", @@ -320,7 +320,7 @@ static int msi_quota_for_device(struct pci_dev *dev, int request) /* else, we have some more calculating to do */ counts.requestor = pci_device_to_OF_node(dev); counts.request = request; - traverse_pci_devices(pe_dn, count_spare_msis, &counts); + pci_traverse_device_nodes(pe_dn, count_spare_msis, &counts); /* If the quota isn't an integer multiple of the total, we can * use the remainder as spare MSIs for anyone that wants them. */ -- cgit v0.10.2 From 4a5954ed7726497415ae49204cfc8e38ee8b127e Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:43 +1000 Subject: powerpc/pci: Don't scan empty slot In hotplug case, function pci_add_pci_devices() is called to rescan the specified PCI bus, which might not have any child devices. Access to the PCI bus's child device node will cause kernel crash without exception. This adds one more check to skip scanning PCI bus that doesn't have any subordinate devices from device-tree, in order to avoid kernel crash. Signed-off-by: Gavin Shan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c index 46587a1..2d71269 100644 --- a/arch/powerpc/kernel/pci-hotplug.c +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -120,7 +120,8 @@ void pci_hp_add_devices(struct pci_bus *bus) if (mode == PCI_PROBE_DEVTREE) { /* use ofdt-based probe */ of_rescan_bus(dn, bus); - } else if (mode == PCI_PROBE_NORMAL) { + } else if (mode == PCI_PROBE_NORMAL && + dn->child && PCI_DN(dn->child)) { /* * Use legacy probe. In the partial hotplug case, we * probably have grandchildren devices unplugged. So -- cgit v0.10.2 From 4fad494321351f0ac412945c6a464109ad96734a Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:44 +1000 Subject: powerpc/powernv: Simplify pnv_eeh_reset() This drops unnecessary nested if statements in pnv_eeh_reset() to improve the code readability. After the changes, the unused local variable "ret" is dropped as well. No logical changes introduced. Signed-off-by: Gavin Shan Reviewed-by: Andrew Donnellan Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 69e41ce..9226df1 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -1009,8 +1009,9 @@ static int pnv_eeh_reset_vf_pe(struct eeh_pe *pe, int option) static int pnv_eeh_reset(struct eeh_pe *pe, int option) { struct pci_controller *hose = pe->phb; + struct pnv_phb *phb; struct pci_bus *bus; - int ret; + int64_t rc; /* * For PHB reset, we always have complete reset. For those PEs whose @@ -1026,45 +1027,39 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option) * reset. The side effect is that EEH core has to clear the frozen * state explicitly after BAR restore. */ - if (pe->type & EEH_PE_PHB) { - ret = pnv_eeh_phb_reset(hose, option); - } else { - struct pnv_phb *phb; - s64 rc; + if (pe->type & EEH_PE_PHB) + return pnv_eeh_phb_reset(hose, option); - /* - * The frozen PE might be caused by PAPR error injection - * registers, which are expected to be cleared after hitting - * frozen PE as stated in the hardware spec. Unfortunately, - * that's not true on P7IOC. So we have to clear it manually - * to avoid recursive EEH errors during recovery. - */ - phb = hose->private_data; - if (phb->model == PNV_PHB_MODEL_P7IOC && - (option == EEH_RESET_HOT || - option == EEH_RESET_FUNDAMENTAL)) { - rc = opal_pci_reset(phb->opal_id, - OPAL_RESET_PHB_ERROR, - OPAL_ASSERT_RESET); - if (rc != OPAL_SUCCESS) { - pr_warn("%s: Failure %lld clearing " - "error injection registers\n", - __func__, rc); - return -EIO; - } + /* + * The frozen PE might be caused by PAPR error injection + * registers, which are expected to be cleared after hitting + * frozen PE as stated in the hardware spec. Unfortunately, + * that's not true on P7IOC. So we have to clear it manually + * to avoid recursive EEH errors during recovery. + */ + phb = hose->private_data; + if (phb->model == PNV_PHB_MODEL_P7IOC && + (option == EEH_RESET_HOT || + option == EEH_RESET_FUNDAMENTAL)) { + rc = opal_pci_reset(phb->opal_id, + OPAL_RESET_PHB_ERROR, + OPAL_ASSERT_RESET); + if (rc != OPAL_SUCCESS) { + pr_warn("%s: Failure %lld clearing error injection registers\n", + __func__, rc); + return -EIO; } - - bus = eeh_pe_bus_get(pe); - if (pe->type & EEH_PE_VF) - ret = pnv_eeh_reset_vf_pe(pe, option); - else if (pci_is_root_bus(bus) || - pci_is_root_bus(bus->parent)) - ret = pnv_eeh_root_reset(hose, option); - else - ret = pnv_eeh_bridge_reset(bus->self, option); } - return ret; + bus = eeh_pe_bus_get(pe); + if (pe->type & EEH_PE_VF) + return pnv_eeh_reset_vf_pe(pe, option); + + if (pci_is_root_bus(bus) || + pci_is_root_bus(bus->parent)) + return pnv_eeh_root_reset(hose, option); + + return pnv_eeh_bridge_reset(bus->self, option); } /** -- cgit v0.10.2 From c8ceacc22bce95d3a9cff198c9c27a30105a16b8 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 3 May 2016 15:41:45 +1000 Subject: powerpc/powernv: Exclude root bus in pnv_pci_reset_secondary_bus() The function pnv_pci_reset_secondary_bus() is called like below. It's impossible for call the function on root bus. So it's safe to remove the root bus case in the function. No functional changes introduced. pci_parent_bus_reset() / pci_bus_reset() / pci_try_reset_bus() pci_reset_bridge_secondary_bus() pcibios_reset_secondary_bus() pnv_pci_reset_secondary_bus() Signed-off-by: Gavin Shan Reviewed-by: Daniel Axtens Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 9226df1..593b8dc 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -868,16 +868,8 @@ static int pnv_eeh_bridge_reset(struct pci_dev *dev, int option) void pnv_pci_reset_secondary_bus(struct pci_dev *dev) { - struct pci_controller *hose; - - if (pci_is_root_bus(dev->bus)) { - hose = pci_bus_to_host(dev->bus); - pnv_eeh_root_reset(hose, EEH_RESET_HOT); - pnv_eeh_root_reset(hose, EEH_RESET_DEACTIVATE); - } else { - pnv_eeh_bridge_reset(dev, EEH_RESET_HOT); - pnv_eeh_bridge_reset(dev, EEH_RESET_DEACTIVATE); - } + pnv_eeh_bridge_reset(dev, EEH_RESET_HOT); + pnv_eeh_bridge_reset(dev, EEH_RESET_DEACTIVATE); } static void pnv_eeh_wait_for_pending(struct pci_dn *pdn, const char *type, -- cgit v0.10.2 From 54de285beb9ef8b85cf240f7cda161bd6ae5d04c Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 29 Apr 2016 18:55:15 +1000 Subject: vfio/spapr: Relax the IOMMU compatibility check We are going to have multiple different types of PHB on the same system with POWER8 + NVLink and PHBs will have different IOMMU ops. However we only really care about one callback - create_table - so we can relax the compatibility check here. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Acked-by: Alex Williamson Signed-off-by: Michael Ellerman diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 0582b72..3054e3f 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -1188,7 +1188,8 @@ static int tce_iommu_attach_group(void *iommu_data, goto unlock_exit; } table_group_tmp = iommu_group_get_iommudata(tcegrp->grp); - if (table_group_tmp->ops != table_group->ops) { + if (table_group_tmp->ops->create_table != + table_group->ops->create_table) { pr_warn("tce_vfio: Group %d is incompatible with group %d\n", iommu_group_id(iommu_group), iommu_group_id(tcegrp->grp)); -- cgit v0.10.2 From a7cf13caad8bf368a28f31dbb4807d61bc9ace18 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 29 Apr 2016 18:55:16 +1000 Subject: powerpc/powernv: Rename pnv_pci_ioda2_tce_invalidate_entire As in fact pnv_pci_ioda2_tce_invalidate_entire() invalidates TCEs for the specific PE rather than the entire cache, rename it to pnv_pci_ioda2_tce_invalidate_pe(). In later patches we will add a proper pnv_pci_ioda2_tce_invalidate_entire(). Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index c762f38..acce430 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1840,7 +1840,7 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { .get = pnv_tce_get, }; -static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe) +static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe) { /* 01xb - invalidate TCEs that match the specified PE# */ unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF); @@ -2191,7 +2191,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, pnv_pci_link_table_and_group(phb->hose->node, num, tbl, &pe->table_group); - pnv_pci_ioda2_tce_invalidate_entire(pe); + pnv_pci_ioda2_tce_invalidate_pe(pe); return 0; } @@ -2335,7 +2335,7 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, if (ret) pe_warn(pe, "Unmapping failed, ret = %ld\n", ret); else - pnv_pci_ioda2_tce_invalidate_entire(pe); + pnv_pci_ioda2_tce_invalidate_pe(pe); pnv_pci_unlink_table_and_group(table_group->tables[num], table_group); -- cgit v0.10.2 From bef9253f551189febf0052a06dc079eb34137a58 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 29 Apr 2016 18:55:17 +1000 Subject: powerpc/powernv: Define TCE Kill flags This replaces magic constants for TCE Kill IODA2 register with macros. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index acce430..d7fbbf1 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1840,10 +1840,13 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { .get = pnv_tce_get, }; +#define TCE_KILL_INVAL_PE PPC_BIT(1) +#define TCE_KILL_INVAL_TCE PPC_BIT(2) + static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe) { /* 01xb - invalidate TCEs that match the specified PE# */ - unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF); + unsigned long val = TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF); struct pnv_phb *phb = pe->phb; struct pnv_ioda_pe *npe; int i; @@ -1871,7 +1874,7 @@ static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm, unsigned long start, end, inc; /* We'll invalidate DMA address in PE scope */ - start = 0x2ull << 60; + start = TCE_KILL_INVAL_TCE; start |= (pe_number & 0xFF); end = start; -- cgit v0.10.2 From 0bbcdb437da0c4acba582058d3d1a84b93bc081d Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 29 Apr 2016 18:55:18 +1000 Subject: powerpc/powernv/npu: TCE Kill helpers cleanup NPU PHB TCE Kill register is exactly the same as in the rest of POWER8 so let's reuse the existing code for NPU. The only bit missing is a helper to reset the entire TCE cache so this moves such a helper from NPU code and renames it. Since pnv_npu_tce_invalidate() does really invalidate the entire cache, this uses pnv_pci_ioda2_tce_invalidate_entire() directly for NPU. This adds an explicit comment for workaround for invalidating NPU TCE cache. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Reviewed-by: Alistair Popple Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 7229acd..778570c 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -25,8 +25,6 @@ * Other types of TCE cache invalidation are not functional in the * hardware. */ -#define TCE_KILL_INVAL_ALL PPC_BIT(0) - static struct pci_dev *get_pci_dev(struct device_node *dn) { return PCI_DN(dn)->pcidev; @@ -161,45 +159,6 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe, return pe; } -void pnv_npu_tce_invalidate_entire(struct pnv_ioda_pe *npe) -{ - struct pnv_phb *phb = npe->phb; - - if (WARN_ON(phb->type != PNV_PHB_NPU || - !phb->ioda.tce_inval_reg || - !(npe->flags & PNV_IODA_PE_DEV))) - return; - - mb(); /* Ensure previous TCE table stores are visible */ - __raw_writeq(cpu_to_be64(TCE_KILL_INVAL_ALL), - phb->ioda.tce_inval_reg); -} - -void pnv_npu_tce_invalidate(struct pnv_ioda_pe *npe, - struct iommu_table *tbl, - unsigned long index, - unsigned long npages, - bool rm) -{ - struct pnv_phb *phb = npe->phb; - - /* We can only invalidate the whole cache on NPU */ - unsigned long val = TCE_KILL_INVAL_ALL; - - if (WARN_ON(phb->type != PNV_PHB_NPU || - !phb->ioda.tce_inval_reg || - !(npe->flags & PNV_IODA_PE_DEV))) - return; - - mb(); /* Ensure previous TCE table stores are visible */ - if (rm) - __raw_rm_writeq(cpu_to_be64(val), - (__be64 __iomem *) phb->ioda.tce_inval_reg_phys); - else - __raw_writeq(cpu_to_be64(val), - phb->ioda.tce_inval_reg); -} - void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe) { struct pnv_ioda_pe *gpe; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index d7fbbf1..9b99aa1 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1840,9 +1840,23 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { .get = pnv_tce_get, }; +#define TCE_KILL_INVAL_ALL PPC_BIT(0) #define TCE_KILL_INVAL_PE PPC_BIT(1) #define TCE_KILL_INVAL_TCE PPC_BIT(2) +void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm) +{ + const unsigned long val = TCE_KILL_INVAL_ALL; + + mb(); /* Ensure previous TCE table stores are visible */ + if (rm) + __raw_rm_writeq(cpu_to_be64(val), + (__be64 __iomem *) + phb->ioda.tce_inval_reg_phys); + else + __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg); +} + static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe) { /* 01xb - invalidate TCEs that match the specified PE# */ @@ -1863,7 +1877,7 @@ static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe) if (!npe || npe->phb->type != PNV_PHB_NPU) continue; - pnv_npu_tce_invalidate_entire(npe); + pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false); } } @@ -1912,14 +1926,19 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, index, npages); if (pe->flags & PNV_IODA_PE_PEER) - /* Invalidate PEs using the same TCE table */ + /* + * The NVLink hardware does not support TCE kill + * per TCE entry so we have to invalidate + * the entire cache for it. + */ for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { npe = pe->peers[i]; - if (!npe || npe->phb->type != PNV_PHB_NPU) + if (!npe || npe->phb->type != PNV_PHB_NPU || + !npe->phb->ioda.tce_inval_reg) continue; - pnv_npu_tce_invalidate(npe, tbl, index, - npages, rm); + pnv_pci_ioda2_tce_invalidate_entire(npe->phb, + rm); } } } diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 904f60b..85436e3 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -218,15 +218,10 @@ extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); /* Nvlink functions */ -extern void pnv_npu_tce_invalidate_entire(struct pnv_ioda_pe *npe); -extern void pnv_npu_tce_invalidate(struct pnv_ioda_pe *npe, - struct iommu_table *tbl, - unsigned long index, - unsigned long npages, - bool rm); extern void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe); extern void pnv_npu_setup_dma_pe(struct pnv_ioda_pe *npe); extern int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe, bool enabled); extern int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask); +extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); #endif /* __POWERNV_PCI_H */ -- cgit v0.10.2 From 6969af73521a9042d91c9efd41f22cf7be7f1ba9 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 29 Apr 2016 18:55:19 +1000 Subject: powerpc/powernv/npu: Use the correct IOMMU page size This uses the page size from iommu_table instead of hard-coded 4K. This should cause no change in behavior. While we are here, move bits around to prepare for further rework which will define and use iommu_table_group_ops. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Reviewed-by: Alistair Popple Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 778570c..5bd5fee 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -204,8 +204,7 @@ static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe) struct pnv_phb *phb = npe->phb; struct pci_dev *gpdev; struct pnv_ioda_pe *gpe; - void *addr; - unsigned int size; + struct iommu_table *tbl; int64_t rc; /* @@ -219,11 +218,11 @@ static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe) if (!gpe) return; - addr = (void *)gpe->table_group.tables[0]->it_base; - size = gpe->table_group.tables[0]->it_size << 3; + tbl = gpe->table_group.tables[0]; rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number, - npe->pe_number, 1, __pa(addr), - size, 0x1000); + npe->pe_number, 1, __pa(tbl->it_base), + tbl->it_size << 3, + IOMMU_PAGE_SIZE(tbl)); if (rc != OPAL_SUCCESS) pr_warn("%s: Error %lld setting DMA window on PHB#%d-PE#%d\n", __func__, rc, phb->hose->global_number, npe->pe_number); -- cgit v0.10.2 From f9f834567432e4d21f26debe10e1ca0ab52446e5 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 29 Apr 2016 18:55:20 +1000 Subject: powerpc/powernv/npu: Simplify DMA setup NPU devices are emulated in firmware and mainly used for NPU NVLink training; one NPU device is per a hardware link. Their DMA/TCE setup must match the GPU which is connected via PCIe and NVLink so any changes to the DMA/TCE setup on the GPU PCIe device need to be propagated to the NVLink device as this is what device drivers expect and it doesn't make much sense to do anything else. This makes NPU DMA setup explicit. pnv_npu_ioda_controller_ops::pnv_npu_dma_set_mask is moved to pci-ioda, made static and prints warning as dma_set_mask() should never be called on this function as in any case it will not configure GPU; so we make this explicit. Instead of using PNV_IODA_PE_PEER and peers[] (which the next patch will remove), we test every PCI device if there are corresponding NVLink devices. If there are any, we propagate bypass mode to just found NPU devices by calling the setup helper directly (which takes @bypass) and avoid guessing (i.e. calculating from DMA mask) whether we need bypass or not on NPU devices. Since DMA setup happens in very rare occasion, this will not slow down booting or VFIO start/stop much. This renames pnv_npu_disable_bypass to pnv_npu_dma_set_32 to make it more clear what the function really does which is programming 32bit table address to the TVT ("disabling bypass" means writing zeroes to the TVT). This removes pnv_npu_dma_set_bypass() from pnv_npu_ioda_fixup() as the DMA configuration on NPU does not matter until dma_set_mask() is called on GPU and that will do the NPU DMA configuration. This removes phb->dma_dev_setup initialization for NPU as pnv_pci_ioda_dma_dev_setup is no-op for it anyway. This stops using npe->tce_bypass_base as it never changes and values other than zero are not supported. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Reviewed-by: Alistair Popple Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 5bd5fee..bec9267 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -196,10 +196,9 @@ void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe) } /* - * For the NPU we want to point the TCE table at the same table as the - * real PCI device. + * Enables 32 bit DMA on NPU. */ -static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe) +static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe) { struct pnv_phb *phb = npe->phb; struct pci_dev *gpdev; @@ -235,72 +234,62 @@ static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe) } /* - * Enable/disable bypass mode on the NPU. The NPU only supports one + * Enables bypass mode on the NPU. The NPU only supports one * window per link, so bypass needs to be explicitly enabled or * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be * active at the same time. */ -int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe, bool enable) +static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe) { struct pnv_phb *phb = npe->phb; int64_t rc = 0; + phys_addr_t top = memblock_end_of_DRAM(); if (phb->type != PNV_PHB_NPU || !npe->pdev) return -EINVAL; - if (enable) { - /* Enable the bypass window */ - phys_addr_t top = memblock_end_of_DRAM(); - - npe->tce_bypass_base = 0; - top = roundup_pow_of_two(top); - dev_info(&npe->pdev->dev, "Enabling bypass for PE %d\n", - npe->pe_number); - rc = opal_pci_map_pe_dma_window_real(phb->opal_id, - npe->pe_number, npe->pe_number, - npe->tce_bypass_base, top); - } else { - /* - * Disable the bypass window by replacing it with the - * TCE32 window. - */ - pnv_npu_disable_bypass(npe); - } + /* Enable the bypass window */ + + top = roundup_pow_of_two(top); + dev_info(&npe->pdev->dev, "Enabling bypass for PE %d\n", + npe->pe_number); + rc = opal_pci_map_pe_dma_window_real(phb->opal_id, + npe->pe_number, npe->pe_number, + 0 /* bypass base */, top); return rc; } -int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask) +void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass) { - struct pci_controller *hose = pci_bus_to_host(npdev->bus); - struct pnv_phb *phb = hose->private_data; - struct pci_dn *pdn = pci_get_pdn(npdev); - struct pnv_ioda_pe *npe, *gpe; - struct pci_dev *gpdev; - uint64_t top; - bool bypass = false; + int i; + struct pnv_phb *phb; + struct pci_dn *pdn; + struct pnv_ioda_pe *npe; + struct pci_dev *npdev; - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return -ENXIO; + for (i = 0; ; ++i) { + npdev = pnv_pci_get_npu_dev(gpdev, i); - /* We only do bypass if it's enabled on the linked device */ - npe = &phb->ioda.pe_array[pdn->pe_number]; - gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); - if (!gpe) - return -ENODEV; + if (!npdev) + break; - if (gpe->tce_bypass_enabled) { - top = gpe->tce_bypass_base + memblock_end_of_DRAM() - 1; - bypass = (dma_mask >= top); - } + pdn = pci_get_pdn(npdev); + if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) + return; - if (bypass) - dev_info(&npdev->dev, "Using 64-bit DMA iommu bypass\n"); - else - dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n"); + phb = pci_bus_to_host(npdev->bus)->private_data; - pnv_npu_dma_set_bypass(npe, bypass); - *npdev->dev.dma_mask = dma_mask; + /* We only do bypass if it's enabled on the linked device */ + npe = &phb->ioda.pe_array[pdn->pe_number]; - return 0; + if (bypass) { + dev_info(&npdev->dev, + "Using 64-bit DMA iommu bypass\n"); + pnv_npu_dma_set_bypass(npe); + } else { + dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n"); + pnv_npu_dma_set_32(npe); + } + } } diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 9b99aa1..c94ee1b 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1669,8 +1669,6 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) struct pnv_ioda_pe *pe; uint64_t top; bool bypass = false; - struct pci_dev *linked_npu_dev; - int i; if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) return -ENODEV;; @@ -1691,15 +1689,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) *pdev->dev.dma_mask = dma_mask; /* Update peer npu devices */ - if (pe->flags & PNV_IODA_PE_PEER) - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { - if (!pe->peers[i]) - continue; - - linked_npu_dev = pe->peers[i]->pdev; - if (dma_get_mask(&linked_npu_dev->dev) != dma_mask) - dma_set_mask(&linked_npu_dev->dev, dma_mask); - } + pnv_npu_try_dma_set_bypass(pdev, bypass); return 0; } @@ -3194,7 +3184,6 @@ static void pnv_npu_ioda_fixup(void) enable_bypass = dma_get_mask(&pe->pdev->dev) == DMA_BIT_MASK(64); pnv_npu_init_dma_pe(pe); - pnv_npu_dma_set_bypass(pe, enable_bypass); } } } @@ -3340,6 +3329,14 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .shutdown = pnv_pci_ioda_shutdown, }; +static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask) +{ + dev_err_once(&npdev->dev, + "%s operation unsupported for NVLink devices\n", + __func__); + return -EPERM; +} + static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { .dma_dev_setup = pnv_pci_dma_dev_setup, #ifdef CONFIG_PCI_MSI @@ -3516,9 +3513,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->freeze_pe = pnv_ioda_freeze_pe; phb->unfreeze_pe = pnv_ioda_unfreeze_pe; - /* Setup TCEs */ - phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; - /* Setup MSI support */ pnv_pci_init_ioda_msis(phb); @@ -3531,10 +3525,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, */ ppc_md.pcibios_fixup = pnv_pci_ioda_fixup; - if (phb->type == PNV_PHB_NPU) + if (phb->type == PNV_PHB_NPU) { hose->controller_ops = pnv_npu_ioda_controller_ops; - else + } else { + phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; hose->controller_ops = pnv_pci_ioda_controller_ops; + } #ifdef CONFIG_PCI_IOV ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources; diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 85436e3..a38349a 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -220,8 +220,7 @@ extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); /* Nvlink functions */ extern void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe); extern void pnv_npu_setup_dma_pe(struct pnv_ioda_pe *npe); -extern int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe, bool enabled); -extern int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask); +extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); #endif /* __POWERNV_PCI_H */ -- cgit v0.10.2 From 7d623e42560ea69eb248d48a1b1b105f28807e6e Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 29 Apr 2016 18:55:21 +1000 Subject: powerpc/powernv/ioda2: Export debug helper pe_level_printk() This exports debugging helper pe_level_printk() and corresponding macroses so they can be used in npu-dma.c. Signed-off-by: Alexey Kardashevskiy Reviewed-By: Alistair Popple Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index c94ee1b..b2831c0 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -57,7 +57,7 @@ static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl); -static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, +void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...) { struct va_format vaf; @@ -88,13 +88,6 @@ static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, va_end(args); } -#define pe_err(pe, fmt, ...) \ - pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__) -#define pe_warn(pe, fmt, ...) \ - pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__) -#define pe_info(pe, fmt, ...) \ - pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__) - static bool pnv_iommu_bypass_disabled __read_mostly; static int __init iommu_setup(char *str) diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index a38349a..b426614 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -217,6 +217,15 @@ extern void pnv_pci_dma_bus_setup(struct pci_bus *bus); extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); +extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, + const char *fmt, ...); +#define pe_err(pe, fmt, ...) \ + pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__) +#define pe_warn(pe, fmt, ...) \ + pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__) +#define pe_info(pe, fmt, ...) \ + pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__) + /* Nvlink functions */ extern void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe); extern void pnv_npu_setup_dma_pe(struct pnv_ioda_pe *npe); -- cgit v0.10.2 From b575c731fe58c1ecb1247c6a334ba6c7b4c0b480 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 29 Apr 2016 18:55:22 +1000 Subject: powerpc/powernv/npu: Add set/unset window helpers The upcoming NVLink passthrough support will require NPU code to cope with two DMA windows. This adds a pnv_npu_set_window() helper which programs 32bit window to the hardware. This also adds multilevel TCE support. This adds a pnv_npu_unset_window() helper which removes the DMA window from the hardware. This does not make difference now as the caller - pnv_npu_dma_set_bypass() - enables bypass in the hardware but the next patch will use it to manage TCE table lists for TCE Kill handling. Signed-off-by: Alexey Kardashevskiy Reviewed-By: Alistair Popple Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index bec9267..800d70f 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -159,6 +159,56 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe, return pe; } +static long pnv_npu_set_window(struct pnv_ioda_pe *npe, + struct iommu_table *tbl) +{ + struct pnv_phb *phb = npe->phb; + int64_t rc; + const unsigned long size = tbl->it_indirect_levels ? + tbl->it_level_size : tbl->it_size; + const __u64 start_addr = tbl->it_offset << tbl->it_page_shift; + const __u64 win_size = tbl->it_size << tbl->it_page_shift; + + pe_info(npe, "Setting up window %llx..%llx pg=%lx\n", + start_addr, start_addr + win_size - 1, + IOMMU_PAGE_SIZE(tbl)); + + rc = opal_pci_map_pe_dma_window(phb->opal_id, + npe->pe_number, + npe->pe_number, + tbl->it_indirect_levels + 1, + __pa(tbl->it_base), + size << 3, + IOMMU_PAGE_SIZE(tbl)); + if (rc) { + pe_err(npe, "Failed to configure TCE table, err %lld\n", rc); + return rc; + } + pnv_pci_ioda2_tce_invalidate_entire(phb, false); + + return 0; +} + +static long pnv_npu_unset_window(struct pnv_ioda_pe *npe) +{ + struct pnv_phb *phb = npe->phb; + int64_t rc; + + pe_info(npe, "Removing DMA window\n"); + + rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number, + npe->pe_number, + 0/* levels */, 0/* table address */, + 0/* table size */, 0/* page size */); + if (rc) { + pe_err(npe, "Unmapping failed, ret = %lld\n", rc); + return rc; + } + pnv_pci_ioda2_tce_invalidate_entire(phb, false); + + return 0; +} + void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe) { struct pnv_ioda_pe *gpe; @@ -200,10 +250,8 @@ void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe) */ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe) { - struct pnv_phb *phb = npe->phb; struct pci_dev *gpdev; struct pnv_ioda_pe *gpe; - struct iommu_table *tbl; int64_t rc; /* @@ -217,14 +265,7 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe) if (!gpe) return; - tbl = gpe->table_group.tables[0]; - rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number, - npe->pe_number, 1, __pa(tbl->it_base), - tbl->it_size << 3, - IOMMU_PAGE_SIZE(tbl)); - if (rc != OPAL_SUCCESS) - pr_warn("%s: Error %lld setting DMA window on PHB#%d-PE#%d\n", - __func__, rc, phb->hose->global_number, npe->pe_number); + rc = pnv_npu_set_window(npe, gpe->table_group.tables[0]); /* * We don't initialise npu_pe->tce32_table as we always use @@ -248,6 +289,10 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe) if (phb->type != PNV_PHB_NPU || !npe->pdev) return -EINVAL; + rc = pnv_npu_unset_window(npe); + if (rc != OPAL_SUCCESS) + return rc; + /* Enable the bypass window */ top = roundup_pow_of_two(top); -- cgit v0.10.2 From 85674868cecebdf6eb7239ecf9c32b6273208d03 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 29 Apr 2016 18:55:23 +1000 Subject: powerpc/powernv/npu: Rework TCE Kill handling The pnv_ioda_pe struct keeps an array of peers. At the moment it is only used to link GPU and NPU for 2 purposes: 1. Access NPU quickly when configuring DMA for GPU - this was addressed in the previos patch by removing use of it as DMA setup is not what the kernel would constantly do. 2. Invalidate TCE cache for NPU when it is invalidated for GPU. GPU and NPU are in different PE. There is already a mechanism to attach multiple iommu_table_group to the same iommu_table (used for VFIO), we can reuse it here so does this patch. This gets rid of peers[] array and PNV_IODA_PE_PEER flag as they are not needed anymore. While we are here, add TCE cache invalidation after enabling bypass. Signed-off-by: Alexey Kardashevskiy Reviewed-By: Alistair Popple Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 800d70f..cb2d1da 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -136,22 +136,17 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe, struct pnv_ioda_pe *pe; struct pci_dn *pdn; - if (npe->flags & PNV_IODA_PE_PEER) { - pe = npe->peers[0]; - pdev = pe->pdev; - } else { - pdev = pnv_pci_get_gpu_dev(npe->pdev); - if (!pdev) - return NULL; - - pdn = pci_get_pdn(pdev); - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return NULL; + pdev = pnv_pci_get_gpu_dev(npe->pdev); + if (!pdev) + return NULL; - hose = pci_bus_to_host(pdev->bus); - phb = hose->private_data; - pe = &phb->ioda.pe_array[pdn->pe_number]; - } + pdn = pci_get_pdn(pdev); + if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) + return NULL; + + hose = pci_bus_to_host(pdev->bus); + phb = hose->private_data; + pe = &phb->ioda.pe_array[pdn->pe_number]; if (gpdev) *gpdev = pdev; @@ -186,6 +181,10 @@ static long pnv_npu_set_window(struct pnv_ioda_pe *npe, } pnv_pci_ioda2_tce_invalidate_entire(phb, false); + /* Add the table to the list so its TCE cache will get invalidated */ + pnv_pci_link_table_and_group(phb->hose->node, 0, + tbl, &npe->table_group); + return 0; } @@ -206,43 +205,10 @@ static long pnv_npu_unset_window(struct pnv_ioda_pe *npe) } pnv_pci_ioda2_tce_invalidate_entire(phb, false); - return 0; -} - -void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe) -{ - struct pnv_ioda_pe *gpe; - struct pci_dev *gpdev; - int i, avail = -1; - - if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV)) - return; - - gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); - if (!gpe) - return; - - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { - /* Nothing to do if the PE is already connected. */ - if (gpe->peers[i] == npe) - return; - - if (!gpe->peers[i]) - avail = i; - } + pnv_pci_unlink_table_and_group(npe->table_group.tables[0], + &npe->table_group); - if (WARN_ON(avail < 0)) - return; - - gpe->peers[avail] = npe; - gpe->flags |= PNV_IODA_PE_PEER; - - /* - * We assume that the NPU devices only have a single peer PE - * (the GPU PCIe device PE). - */ - npe->peers[0] = gpe; - npe->flags |= PNV_IODA_PE_PEER; + return 0; } /* @@ -302,6 +268,9 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe) npe->pe_number, npe->pe_number, 0 /* bypass base */, top); + if (rc == OPAL_SUCCESS) + pnv_pci_ioda2_tce_invalidate_entire(phb, false); + return rc; } diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index b2831c0..ef31218 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1845,23 +1845,12 @@ static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe) /* 01xb - invalidate TCEs that match the specified PE# */ unsigned long val = TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF); struct pnv_phb *phb = pe->phb; - struct pnv_ioda_pe *npe; - int i; if (!phb->ioda.tce_inval_reg) return; mb(); /* Ensure above stores are visible */ __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg); - - if (pe->flags & PNV_IODA_PE_PEER) - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { - npe = pe->peers[i]; - if (!npe || npe->phb->type != PNV_PHB_NPU) - continue; - - pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false); - } } static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm, @@ -1896,33 +1885,24 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, struct iommu_table_group_link *tgl; list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { - struct pnv_ioda_pe *npe; struct pnv_ioda_pe *pe = container_of(tgl->table_group, struct pnv_ioda_pe, table_group); __be64 __iomem *invalidate = rm ? (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys : pe->phb->ioda.tce_inval_reg; - int i; - - pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm, - invalidate, tbl->it_page_shift, - index, npages); - if (pe->flags & PNV_IODA_PE_PEER) + if (pe->phb->type == PNV_PHB_NPU) { /* * The NVLink hardware does not support TCE kill * per TCE entry so we have to invalidate * the entire cache for it. */ - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { - npe = pe->peers[i]; - if (!npe || npe->phb->type != PNV_PHB_NPU || - !npe->phb->ioda.tce_inval_reg) - continue; - - pnv_pci_ioda2_tce_invalidate_entire(npe->phb, - rm); - } + pnv_pci_ioda2_tce_invalidate_entire(pe->phb, rm); + continue; + } + pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm, + invalidate, tbl->it_page_shift, + index, npages); } } @@ -3156,31 +3136,6 @@ static void pnv_pci_ioda_create_dbgfs(void) #endif /* CONFIG_DEBUG_FS */ } -static void pnv_npu_ioda_fixup(void) -{ - bool enable_bypass; - struct pci_controller *hose, *tmp; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - unsigned int weight; - - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { - phb = hose->private_data; - if (phb->type != PNV_PHB_NPU) - continue; - - list_for_each_entry(pe, &phb->ioda.pe_list, list) { - weight = pnv_pci_ioda_pe_dma_weight(pe); - if (WARN_ON(!weight)) - continue; - - enable_bypass = dma_get_mask(&pe->pdev->dev) == - DMA_BIT_MASK(64); - pnv_npu_init_dma_pe(pe); - } - } -} - static void pnv_pci_ioda_fixup(void) { pnv_pci_ioda_setup_PEs(); @@ -3193,9 +3148,6 @@ static void pnv_pci_ioda_fixup(void) eeh_init(); eeh_addr_cache_build(); #endif - - /* Link NPU IODA tables to their PCI devices. */ - pnv_npu_ioda_fixup(); } /* diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index b426614..2b35606 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -24,7 +24,6 @@ enum pnv_phb_model { #define PNV_IODA_PE_MASTER (1 << 3) /* Master PE in compound case */ #define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */ #define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */ -#define PNV_IODA_PE_PEER (1 << 6) /* PE has peers */ /* Data associated with a PE, including IOMMU tracking etc.. */ struct pnv_phb; @@ -32,9 +31,6 @@ struct pnv_ioda_pe { unsigned long flags; struct pnv_phb *phb; -#define PNV_IODA_MAX_PEER_PES 8 - struct pnv_ioda_pe *peers[PNV_IODA_MAX_PEER_PES]; - /* A PE can be associated with a single device or an * entire bus (& children). In the former case, pdev * is populated, in the later case, pbus is. @@ -227,8 +223,6 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__) /* Nvlink functions */ -extern void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe); -extern void pnv_npu_setup_dma_pe(struct pnv_ioda_pe *npe); extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); -- cgit v0.10.2 From b5cb9ab1a00b112fcb96164c814f1f111deeafba Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 29 Apr 2016 18:55:24 +1000 Subject: powerpc/powernv/npu: Enable NVLink pass through IBM POWER8 NVlink systems come with Tesla K40-ish GPUs each of which also has a couple of fast speed links (NVLink). The interface to links is exposed as an emulated PCI bridge which is included into the same IOMMU group as the corresponding GPU. In the kernel, NPUs get a separate PHB of the PNV_PHB_NPU type and a PE which behave pretty much as the standard IODA2 PHB except NPU PHB has just a single TVE in the hardware which means it can have either 32bit window or 64bit window or DMA bypass but never two of these. In order to make these links work when GPU is passed to the guest, these bridges need to be passed as well; otherwise performance will degrade. This implements and exports API to manage NPU state in regard to VFIO; it replicates iommu_table_group_ops. This defines a new pnv_pci_ioda2_npu_ops which is assigned to the IODA2 bridge if there are NPUs for a GPU on the bridge. The new callbacks call the default IODA2 callbacks plus new NPU API. This adds a gpe_table_group_to_npe() helper to find NPU PE for the IODA2 table_group, it is not expected to fail as the helper is only called from the pnv_pci_ioda2_npu_ops. This does not define NPU-specific .release_ownership() so after VFIO is finished, DMA on NPU is disabled which is ok as the nvidia driver sets DMA mask when probing which enable 32 or 64bit DMA on NPU. This adds a pnv_pci_npu_setup_iommu() helper which adds NPUs to the GPU group if any found. The helper uses helpers to look for the "ibm,gpu" property in the device tree which is a phandle of the corresponding GPU. This adds an additional loop over PEs in pnv_ioda_setup_dma() as the main loop skips NPU PEs as they do not have 32bit DMA segments. As pnv_npu_set_window() and pnv_npu_unset_window() are started being used by the new IODA2-NPU IOMMU group, this makes the helpers public and adds the DMA window number parameter. Signed-off-by: Alexey Kardashevskiy Reviewed-By: Alistair Popple [mpe: Add pnv_pci_ioda_setup_iommu_api() to fix build with IOMMU_API=n] Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index cb2d1da..0459e10 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -154,7 +155,7 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe, return pe; } -static long pnv_npu_set_window(struct pnv_ioda_pe *npe, +long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, struct iommu_table *tbl) { struct pnv_phb *phb = npe->phb; @@ -182,13 +183,13 @@ static long pnv_npu_set_window(struct pnv_ioda_pe *npe, pnv_pci_ioda2_tce_invalidate_entire(phb, false); /* Add the table to the list so its TCE cache will get invalidated */ - pnv_pci_link_table_and_group(phb->hose->node, 0, + pnv_pci_link_table_and_group(phb->hose->node, num, tbl, &npe->table_group); return 0; } -static long pnv_npu_unset_window(struct pnv_ioda_pe *npe) +long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num) { struct pnv_phb *phb = npe->phb; int64_t rc; @@ -205,7 +206,7 @@ static long pnv_npu_unset_window(struct pnv_ioda_pe *npe) } pnv_pci_ioda2_tce_invalidate_entire(phb, false); - pnv_pci_unlink_table_and_group(npe->table_group.tables[0], + pnv_pci_unlink_table_and_group(npe->table_group.tables[num], &npe->table_group); return 0; @@ -231,7 +232,7 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe) if (!gpe) return; - rc = pnv_npu_set_window(npe, gpe->table_group.tables[0]); + rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]); /* * We don't initialise npu_pe->tce32_table as we always use @@ -255,7 +256,7 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe) if (phb->type != PNV_PHB_NPU || !npe->pdev) return -EINVAL; - rc = pnv_npu_unset_window(npe); + rc = pnv_npu_unset_window(npe, 0); if (rc != OPAL_SUCCESS) return rc; @@ -307,3 +308,54 @@ void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass) } } } + +/* Switch ownership from platform code to external user (e.g. VFIO) */ +void pnv_npu_take_ownership(struct pnv_ioda_pe *npe) +{ + struct pnv_phb *phb = npe->phb; + int64_t rc; + + /* + * Note: NPU has just a single TVE in the hardware which means that + * while used by the kernel, it can have either 32bit window or + * DMA bypass but never both. So we deconfigure 32bit window only + * if it was enabled at the moment of ownership change. + */ + if (npe->table_group.tables[0]) { + pnv_npu_unset_window(npe, 0); + return; + } + + /* Disable bypass */ + rc = opal_pci_map_pe_dma_window_real(phb->opal_id, + npe->pe_number, npe->pe_number, + 0 /* bypass base */, 0); + if (rc) { + pe_err(npe, "Failed to disable bypass, err %lld\n", rc); + return; + } + pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false); +} + +struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) +{ + struct pnv_phb *phb = npe->phb; + struct pci_bus *pbus = phb->hose->bus; + struct pci_dev *npdev, *gpdev = NULL, *gptmp; + struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); + + if (!gpe || !gpdev) + return NULL; + + list_for_each_entry(npdev, &pbus->devices, bus_list) { + gptmp = pnv_pci_get_gpu_dev(npdev); + + if (gptmp != gpdev) + continue; + + pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev)); + iommu_group_add_device(gpe->table_group.group, &npdev->dev); + } + + return gpe; +} diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index ef31218..4cfe704 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2389,6 +2389,116 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = { .take_ownership = pnv_ioda2_take_ownership, .release_ownership = pnv_ioda2_release_ownership, }; + +static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + struct pnv_ioda_pe **ptmppe = opaque; + struct pci_dev *pdev = container_of(dev, struct pci_dev, dev); + struct pci_dn *pdn = pci_get_pdn(pdev); + + if (!pdn || pdn->pe_number == IODA_INVALID_PE) + return 0; + + hose = pci_bus_to_host(pdev->bus); + phb = hose->private_data; + if (phb->type != PNV_PHB_NPU) + return 0; + + *ptmppe = &phb->ioda.pe_array[pdn->pe_number]; + + return 1; +} + +/* + * This returns PE of associated NPU. + * This assumes that NPU is in the same IOMMU group with GPU and there is + * no other PEs. + */ +static struct pnv_ioda_pe *gpe_table_group_to_npe( + struct iommu_table_group *table_group) +{ + struct pnv_ioda_pe *npe = NULL; + int ret = iommu_group_for_each_dev(table_group->group, &npe, + gpe_table_group_to_npe_cb); + + BUG_ON(!ret || !npe); + + return npe; +} + +static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group, + int num, struct iommu_table *tbl) +{ + long ret = pnv_pci_ioda2_set_window(table_group, num, tbl); + + if (ret) + return ret; + + ret = pnv_npu_set_window(gpe_table_group_to_npe(table_group), num, tbl); + if (ret) + pnv_pci_ioda2_unset_window(table_group, num); + + return ret; +} + +static long pnv_pci_ioda2_npu_unset_window( + struct iommu_table_group *table_group, + int num) +{ + long ret = pnv_pci_ioda2_unset_window(table_group, num); + + if (ret) + return ret; + + return pnv_npu_unset_window(gpe_table_group_to_npe(table_group), num); +} + +static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group) +{ + /* + * Detach NPU first as pnv_ioda2_take_ownership() will destroy + * the iommu_table if 32bit DMA is enabled. + */ + pnv_npu_take_ownership(gpe_table_group_to_npe(table_group)); + pnv_ioda2_take_ownership(table_group); +} + +static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = { + .get_table_size = pnv_pci_ioda2_get_table_size, + .create_table = pnv_pci_ioda2_create_table, + .set_window = pnv_pci_ioda2_npu_set_window, + .unset_window = pnv_pci_ioda2_npu_unset_window, + .take_ownership = pnv_ioda2_npu_take_ownership, + .release_ownership = pnv_ioda2_release_ownership, +}; + +static void pnv_pci_ioda_setup_iommu_api(void) +{ + struct pci_controller *hose, *tmp; + struct pnv_phb *phb; + struct pnv_ioda_pe *pe, *gpe; + + /* + * Now we have all PHBs discovered, time to add NPU devices to + * the corresponding IOMMU groups. + */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + phb = hose->private_data; + + if (phb->type != PNV_PHB_NPU) + continue; + + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + gpe = pnv_pci_npu_setup_iommu(pe); + if (gpe) + gpe->table_group.ops = &pnv_pci_ioda2_npu_ops; + } + } +} +#else /* !CONFIG_IOMMU_API */ +static void pnv_pci_ioda_setup_iommu_api(void) { }; #endif static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb) @@ -3115,6 +3225,8 @@ static void pnv_pci_ioda_setup_DMA(void) phb = hose->private_data; phb->initialized = 1; } + + pnv_pci_ioda_setup_iommu_api(); } static void pnv_pci_ioda_create_dbgfs(void) diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 2b35606..7dee25e 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -225,5 +225,11 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, /* Nvlink functions */ extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); +extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe); +extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, + struct iommu_table *tbl); +extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num); +extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe); +extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe); #endif /* __POWERNV_PCI_H */ -- cgit v0.10.2 From 848912e547c4569445a61203a7df402646a88c25 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 12 May 2016 19:43:37 +1000 Subject: Revert "powerpc/powernv: Exclude root bus in pnv_pci_reset_secondary_bus()" This reverts commit c8ceacc22bce95d3a9cff198c9c27a30105a16b8. Gavin says: I missed the fact that it affects the PCI passthrou path as reported by Alexey: When passing GPU (0003:01:00.0) which seats behind the root port, the reset request is routed to skiboot in original code. In skiboot, the link bouncing events are masked during the reset. So we don't see EEH (freeze all) error even link bouncing happens. With the changes included, the reset is done by kernel and the link bouncing events aren't masked by altering content of PHB3 (or P7IOC) specific hardware registers which are invisible to kernel (skiboot hides the hardware specific). It means the link bouncing is seen by the root port and it causes a EEH (freeze all) error. The PCI passthrough on GPU device cannot work. Requested-by: Alexey Kardashevskiy Requested-by: Gavin Shan Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 593b8dc..9226df1 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -868,8 +868,16 @@ static int pnv_eeh_bridge_reset(struct pci_dev *dev, int option) void pnv_pci_reset_secondary_bus(struct pci_dev *dev) { - pnv_eeh_bridge_reset(dev, EEH_RESET_HOT); - pnv_eeh_bridge_reset(dev, EEH_RESET_DEACTIVATE); + struct pci_controller *hose; + + if (pci_is_root_bus(dev->bus)) { + hose = pci_bus_to_host(dev->bus); + pnv_eeh_root_reset(hose, EEH_RESET_HOT); + pnv_eeh_root_reset(hose, EEH_RESET_DEACTIVATE); + } else { + pnv_eeh_bridge_reset(dev, EEH_RESET_HOT); + pnv_eeh_bridge_reset(dev, EEH_RESET_DEACTIVATE); + } } static void pnv_eeh_wait_for_pending(struct pci_dn *pdn, const char *type, -- cgit v0.10.2 From affeb0f2d3a9af419ad7ef4ac782e1540b2f7b28 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 27 Apr 2016 11:14:50 +1000 Subject: powerpc/eeh: Don't report error in eeh_pe_reset_and_recover() The function eeh_pe_reset_and_recover() is used to recover EEH error when the passthrough device are transferred to guest and backwards, meaning the device's driver is vfio-pci or none. When the driver is vfio-pci that provides error_detected() error handler only, the handler simply stops the guest and it's not expected behaviour. On the other hand, no error handlers will be called if we don't have a bound driver. This ignores the error handler in eeh_pe_reset_and_recover() that reports the error to device driver to avoid the exceptional behaviour. Fixes: 5cfb20b9 ("powerpc/eeh: Emulate EEH recovery for VFIO devices") Cc: stable@vger.kernel.org #v3.18+ Signed-off-by: Gavin Shan Reviewed-by: Russell Currey Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 618d13c..fe1dc63 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -564,9 +564,6 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe) /* Save states */ eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL); - /* Report error */ - eeh_pe_dev_traverse(pe, eeh_report_error, &result); - /* Issue reset */ ret = eeh_reset_pe(pe); if (ret) { -- cgit v0.10.2 From 5a0cdbfd17b90a89c64a71d8aec9773ecdb20d0d Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 27 Apr 2016 11:14:51 +1000 Subject: powerpc/eeh: Restore initial state in eeh_pe_reset_and_recover() The function eeh_pe_reset_and_recover() is used to recover EEH error when the passthrou device are transferred to guest and backwards. The content in the device's config space will be lost on PE reset issued in the middle of the recovery. The function saves/restores it before/after the reset. However, config access to some adapters like Broadcom BCM5719 at this point will causes fenced PHB. The config space is always blocked and we save 0xFF's that are restored at late point. The memory BARs are totally corrupted, causing another EEH error upon access to one of the memory BARs. This restores the config space on those adapters like BCM5719 from the content saved to the EEH device when it's populated, to resolve above issue. Fixes: 5cfb20b9 ("powerpc/eeh: Emulate EEH recovery for VFIO devices") Cc: stable@vger.kernel.org #v3.18+ Signed-off-by: Gavin Shan Reviewed-by: Russell Currey Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index fe1dc63..291fd51 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -171,6 +171,16 @@ static void *eeh_dev_save_state(void *data, void *userdata) if (!edev) return NULL; + /* + * We cannot access the config space on some adapters. + * Otherwise, it will cause fenced PHB. We don't save + * the content in their config space and will restore + * from the initial config space saved when the EEH + * device is created. + */ + if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) + return NULL; + pdev = eeh_dev_to_pci_dev(edev); if (!pdev) return NULL; @@ -312,6 +322,19 @@ static void *eeh_dev_restore_state(void *data, void *userdata) if (!edev) return NULL; + /* + * The content in the config space isn't saved because + * the blocked config space on some adapters. We have + * to restore the initial saved config space when the + * EEH device is created. + */ + if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) { + if (list_is_last(&edev->list, &edev->pe->edevs)) + eeh_pe_restore_bars(edev->pe); + + return NULL; + } + pdev = eeh_dev_to_pci_dev(edev); if (!pdev) return NULL; -- cgit v0.10.2 From 2efc771f2492dfce71ce281efb9cc93c3ae8f2fc Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 27 Apr 2016 11:14:52 +1000 Subject: powerpc/eeh: Ignore handlers in eeh_pe_reset_and_recover() The function eeh_pe_reset_and_recover() is used to recover EEH error when the passthrough device are transferred to guest and backwards, meaning the device's driver is vfio-pci or none. In both cases, the handlers triggered by eeh_report_reset() and eeh_report_resume() shouldn't be called. This ignores the error handlers from eeh_report_reset() and eeh_report_resume(). Signed-off-by: Gavin Shan Reviewed-by: Russell Currey Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 291fd51..2714a3b 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -575,7 +575,7 @@ static int eeh_clear_pe_frozen_state(struct eeh_pe *pe, int eeh_pe_reset_and_recover(struct eeh_pe *pe) { - int result, ret; + int ret; /* Bail if the PE is being recovered */ if (pe->state & EEH_PE_RECOVERING) @@ -601,15 +601,9 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe) return ret; } - /* Notify completion of reset */ - eeh_pe_dev_traverse(pe, eeh_report_reset, &result); - /* Restore device state */ eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL); - /* Resume */ - eeh_pe_dev_traverse(pe, eeh_report_resume, NULL); - /* Clear recovery mode */ eeh_pe_state_clear(pe, EEH_PE_RECOVERING); -- cgit v0.10.2 From d6d63d720db3a63dd45142c2ce7978b52e0e684f Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 27 Apr 2016 11:14:53 +1000 Subject: powerpc/eeh: Drop unnecessary label in eeh_pe_change_owner() The label "reset" in eeh_pe_change_owner() is used only for once. No need to keep it and just drop it. No logical changes introduced. Signed-off-by: Gavin Shan Reviewed-by: David Gibson Reviewed-by: Russell Currey Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index c4e41cf..9d1d46c 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1336,14 +1336,11 @@ static int eeh_pe_change_owner(struct eeh_pe *pe) id->subdevice != pdev->subsystem_device) continue; - goto reset; + return eeh_pe_reset_and_recover(pe); } } return eeh_unfreeze_pe(pe, true); - -reset: - return eeh_pe_reset_and_recover(pe); } /** -- cgit v0.10.2 From c2078d9ef600bdbe568c89e5ddc2c6f15b7982c8 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Mon, 11 Apr 2016 16:17:22 -0300 Subject: Revert "powerpc/eeh: Fix crash in eeh_add_device_early() on Cell" This reverts commit 89a51df5ab1d38b257300b8ac940bbac3bb0eb9b. The function eeh_add_device_early() is used to perform EEH initialization in devices added later on the system, like in hotplug/DLPAR scenarios. Since the commit 89a51df5ab1d ("powerpc/eeh: Fix crash in eeh_add_device_early() on Cell") a new check was introduced in this function - Cell has no EEH capabilities which led to kernel oops if hotplug was performed, so checking for eeh_enabled() was introduced to avoid the issue. However, in architectures that EEH is present like pSeries or PowerNV, we might reach a case in which no PCI devices are present on boot time and so EEH is not initialized. Then, if a device is added via DLPAR for example, eeh_add_device_early() fails because eeh_enabled() is false, and EEH end up not being enabled at all. This reverts the aforementioned patch since a new verification was introduced by the commit d91dafc02f42 ("powerpc/eeh: Delay probing EEH device during hotplug") and so the original Cell issue does not happen anymore. Cc: stable@vger.kernel.org # v4.1+ Reviewed-by: Gavin Shan Signed-off-by: Guilherme G. Piccoli Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 9d1d46c..c9bc78e 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1068,7 +1068,7 @@ void eeh_add_device_early(struct pci_dn *pdn) struct pci_controller *phb; struct eeh_dev *edev = pdn_to_eeh_dev(pdn); - if (!edev || !eeh_enabled()) + if (!edev) return; if (!eeh_has_flag(EEH_PROBE_MODE_DEVTREE)) -- cgit v0.10.2 From 8445a87f7092bc8336ea1305be9306f26b846d93 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Mon, 11 Apr 2016 16:17:23 -0300 Subject: powerpc/iommu: Remove the dependency on EEH struct in DDW mechanism Commit 39baadbf36ce ("powerpc/eeh: Remove eeh information from pci_dn") changed the pci_dn struct by removing its EEH-related members. As part of this clean-up, DDW mechanism was modified to read the device configuration address from eeh_dev struct. As a consequence, now if we disable EEH mechanism on kernel command-line for example, the DDW mechanism will fail, generating a kernel oops by dereferencing a NULL pointer (which turns to be the eeh_dev pointer). This patch just changes the configuration address calculation on DDW functions to a manual calculation based on pci_dn members instead of using eeh_dev-based address. No functional changes were made. This was tested on pSeries, both in PHyp and qemu guest. Fixes: 39baadbf36ce ("powerpc/eeh: Remove eeh information from pci_dn") Cc: stable@vger.kernel.org # v3.4+ Reviewed-by: Gavin Shan Signed-off-by: Guilherme G. Piccoli Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index bd98ce2..b7dfc13 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -912,7 +912,8 @@ machine_arch_initcall(pseries, find_existing_ddw_windows); static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, struct ddw_query_response *query) { - struct eeh_dev *edev; + struct device_node *dn; + struct pci_dn *pdn; u32 cfg_addr; u64 buid; int ret; @@ -923,11 +924,10 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, * Retrieve them from the pci device, not the node with the * dma-window property */ - edev = pci_dev_to_eeh_dev(dev); - cfg_addr = edev->config_addr; - if (edev->pe_config_addr) - cfg_addr = edev->pe_config_addr; - buid = edev->phb->buid; + dn = pci_device_to_OF_node(dev); + pdn = PCI_DN(dn); + buid = pdn->phb->buid; + cfg_addr = (pdn->busno << 8) | pdn->devfn; ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query, cfg_addr, BUID_HI(buid), BUID_LO(buid)); @@ -941,7 +941,8 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, struct ddw_create_response *create, int page_shift, int window_shift) { - struct eeh_dev *edev; + struct device_node *dn; + struct pci_dn *pdn; u32 cfg_addr; u64 buid; int ret; @@ -952,11 +953,10 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, * Retrieve them from the pci device, not the node with the * dma-window property */ - edev = pci_dev_to_eeh_dev(dev); - cfg_addr = edev->config_addr; - if (edev->pe_config_addr) - cfg_addr = edev->pe_config_addr; - buid = edev->phb->buid; + dn = pci_device_to_OF_node(dev); + pdn = PCI_DN(dn); + buid = pdn->phb->buid; + cfg_addr = (pdn->busno << 8) | pdn->devfn; do { /* extra outputs are LIOBN and dma-addr (hi, lo) */ -- cgit v0.10.2 From 92a86756904b127a3450262b33c0b9f8e99f6b36 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 12 May 2016 15:47:09 +1000 Subject: powerpc/powernv: Fix insufficient memory allocation The pnv_pci_init_ioda_phb() helper allocates a blob to store auxilary data such PE and M32/M64 segment allocation maps; this single blob has few partitions, size of each is derived from the PE number - phb->ioda.total_pe_num. It was assumed that the minimum PE number is 8, however it is 4 for NPU so the pe_alloc part was missing in the allocated blob. It was invisible till recently as we were not tracking used M64 segments and NPUs do not use M32 segments so the phb->ioda.m32_segmap (which was pointing to the same address as phb->ioda.pe_alloc) has never been written to leaving the pe_alloc memory intact. After commit 401203ac2d "powerpc/powernv: Track M64 segment consumption" the pe_alloc gets corrupted and PE allocation cannot work. This fixes the issue by enforcing the minimum PE number to 8. Fixes: 401203ac2d15 ("powerpc/powernv: Track M64 segment consumption") Signed-off-by: Alexey Kardashevskiy Reviewed-by: Gavin Shan Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 4cfe704..7e1e4f1 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3504,7 +3504,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, PNV_IODA1_DMA32_SEGSIZE; /* Allocate aux data & arrays. We don't have IO ports on PHB3 */ - size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long)); + size = _ALIGN_UP(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8, + sizeof(unsigned long)); m64map_off = size; size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]); m32map_off = size; -- cgit v0.10.2 From 1d4e89cf0a4e819fbde428e9e5286e7141a02e06 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 12 May 2016 15:47:10 +1000 Subject: powerpc/powernv/npu: Add PE to PHB's list Before commit 3e68dc57 "powerpc/powernv: Remove DMA32 PE list", NPU PEs were linked to the NPU PHB via phb->ioda.pe_dma_list; after that fix, the phb->ioda.pe_list is used. During the pe_dma_list removal, list_add_tail(&phb->ioda.pe_dma_list) was removed, however no list_add() was added so does this patch. Fixes: 3e68dc57219a ("powerpc/powernv: Remove DMA32 PE list") Signed-off-by: Alexey Kardashevskiy Reviewed-by: Gavin Shan Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 7e1e4f1..3a5ea82 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1006,6 +1006,9 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) return NULL; } + /* Put PE to the list */ + list_add_tail(&pe->list, &phb->ioda.pe_list); + return pe; } -- cgit v0.10.2 From 6a369fa28509c3b6a35db979d6fa97a075b94876 Mon Sep 17 00:00:00 2001 From: Hou Zhiqiang Date: Wed, 13 Jan 2016 15:54:46 +0800 Subject: powerpc/fsl: Fix SPI compatible on t208xrdb and t1040rdb On the t208xrdb and t1040rdb, the SPI device is n25q512ax3 instead of n25q512a. Signed-off-by: Hou Zhiqiang Signed-off-by: Scott Wood diff --git a/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi b/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi index 72691ef..7c4afdb 100644 --- a/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi +++ b/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi @@ -109,7 +109,7 @@ flash@0 { #address-cells = <1>; #size-cells = <1>; - compatible = "micron,n25q512a", "jedec,spi-nor"; + compatible = "micron,n25q512ax3", "jedec,spi-nor"; reg = <0>; spi-max-frequency = <10000000>; /* input clock */ }; diff --git a/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi b/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi index dc93268..ff87e67 100644 --- a/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi +++ b/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi @@ -113,7 +113,7 @@ flash@0 { #address-cells = <1>; #size-cells = <1>; - compatible = "micron,n25q512a", "jedec,spi-nor"; + compatible = "micron,n25q512ax3", "jedec,spi-nor"; reg = <0>; spi-max-frequency = <10000000>; /* input clock */ }; -- cgit v0.10.2 From a8165d421dabd476d811f89942e345827f7b1497 Mon Sep 17 00:00:00 2001 From: chenhui zhao Date: Fri, 15 Jan 2016 17:38:20 +0800 Subject: powerpc/fsl-pci: Add a workaround for PCI 5 errata Issue: As a master, the PCI IP block can combine a memory write to the last PCI double word (4 bytes) of a cacheline with a 4 byte memory write to the first PCI double word of the subsequent cacheline. This affects 32-bit PCI target devices that blindly assert STOP on memory-write transactions, without detecting that the data beat being transferred is the last data beat of the transaction. It can cause a hang. PCI-X operation is not affected by this erratum. Workaround: Setting the bit MDS in the PCI Bus Function Register will disable the combining of crossing cacheline boundary requests into one burst transaction. Therefore, it can prevent the errata scenario from occurring. This errata exists in MPC8543, MPC8543E, MPC8545, MPC8545E, MPC8547, MPC8547E, MPC8548 and MPC8548E. Refer to PCI 5 in MPC8548 errata document. Signed-off-by: Zhao Chenhui Signed-off-by: Zhiqiang Hou [scottwood: whitespace fix] Signed-off-by: Scott Wood diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index 85729f4..0ef9df4 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -527,6 +528,8 @@ int fsl_add_bridge(struct platform_device *pdev, int is_primary) u8 hdr_type, progif; struct device_node *dev; struct ccsr_pci __iomem *pci; + u16 temp; + u32 svr = mfspr(SPRN_SVR); dev = pdev->dev.of_node; @@ -596,6 +599,27 @@ int fsl_add_bridge(struct platform_device *pdev, int is_primary) PPC_INDIRECT_TYPE_SURPRESS_PRIMARY_BUS; if (fsl_pcie_check_link(hose)) hose->indirect_type |= PPC_INDIRECT_TYPE_NO_PCIE_LINK; + } else { + /* + * Set PBFR(PCI Bus Function Register)[10] = 1 to + * disable the combining of crossing cacheline + * boundary requests into one burst transaction. + * PCI-X operation is not affected. + * Fix erratum PCI 5 on MPC8548 + */ +#define PCI_BUS_FUNCTION 0x44 +#define PCI_BUS_FUNCTION_MDS 0x400 /* Master disable streaming */ + if (((SVR_SOC_VER(svr) == SVR_8543) || + (SVR_SOC_VER(svr) == SVR_8545) || + (SVR_SOC_VER(svr) == SVR_8547) || + (SVR_SOC_VER(svr) == SVR_8548)) && + !early_find_capability(hose, 0, 0, PCI_CAP_ID_PCIX)) { + early_read_config_word(hose, 0, 0, + PCI_BUS_FUNCTION, &temp); + temp |= PCI_BUS_FUNCTION_MDS; + early_write_config_word(hose, 0, 0, + PCI_BUS_FUNCTION, temp); + } } printk(KERN_INFO "Found FSL PCI host bridge at 0x%016llx. " -- cgit v0.10.2 From ed8fd100b494b95118097ddfcc3f8c112a9a04f0 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Tue, 8 Mar 2016 17:30:36 -0600 Subject: powerpc/fsl: Remove FSL_SOC dependency from FSL_LBC This dependency led to kconfig errors when MTD_NAND_FSL_ELBC was enabled, which selects FSL_LBC, in the absence of FSL_SOC, as reported in http://patchwork.ozlabs.org/patch/564405/ It was originally suggested to add an FSL_SOC dependency to MTD_NAND_FSL_ELBC, but the FSL_SOC symbol has been a growing problem due to hardware being shared between PPC and ARM SoCs. Even though eLBC isn't found on ARM SoCs (the newer IFC is used instead), I don't want to expand the use of FSL_SOC for things other than functions exported by fsl_soc.c. In particular, it would be odd to add it to MTD_NAND_FSL_ELBC and then remove it from MTD_NAND_FSL_IFC. Removing artificial dependencies also helps get compile-test exposure via randconfig, allyesconfig, etc. Reported-by: Brian Norris Cc: Brian Norris Signed-off-by: Scott Wood diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index a630ac2..7268d32 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -797,7 +797,6 @@ config 4xx_SOC config FSL_LBC bool "Freescale Local Bus support" - depends on FSL_SOC help Enables reporting of errors from the Freescale local bus controller. Also contains some common code used by -- cgit v0.10.2 From d2d79dcc661d6a5b70597ab982e09d7752d66101 Mon Sep 17 00:00:00 2001 From: Chenhui Zhao Date: Fri, 15 Apr 2016 18:47:46 +0800 Subject: powerpc/fsl: Fix rcpm compatible string For T1040, T1042, T1023, and T1024, they should use the compatible string "fsl,qoriq-rcpm-2.1". Signed-off-by: Chenhui Zhao Signed-off-by: Scott Wood diff --git a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi index 99e421d..6e0b489 100644 --- a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi @@ -263,7 +263,7 @@ }; rcpm: global-utilities@e2000 { - compatible = "fsl,t1023-rcpm", "fsl,qoriq-rcpm-2.0"; + compatible = "fsl,t1023-rcpm", "fsl,qoriq-rcpm-2.1"; reg = <0xe2000 0x1000>; }; diff --git a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi index e0f4da5..507649e 100644 --- a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi @@ -472,7 +472,7 @@ }; rcpm: global-utilities@e2000 { - compatible = "fsl,t1040-rcpm", "fsl,qoriq-rcpm-2.0"; + compatible = "fsl,t1040-rcpm", "fsl,qoriq-rcpm-2.1"; reg = <0xe2000 0x1000>; }; -- cgit v0.10.2 From 4680c9d50637f78338e786c920aef6c171c7cc00 Mon Sep 17 00:00:00 2001 From: Alessio Igor Bogani Date: Mon, 18 Apr 2016 08:36:19 +0200 Subject: powerpc/fsl: Fix build of the dtb embedded kernel images Commit dc37374b9c833 ("powerpc/fsl: Move Freescale device tree files into fsl folder") moved a lot of device tree files into fsl directory, fixing Makefile for cuImage target only. Unfortunately there are other targets which require embedding a device tree into the kernel image (e.g. dtbImage.%). So use a more generic approach. Signed-off-by: Alessio Igor Bogani [scottwood: cleaned up commit message] Signed-off-by: Scott Wood diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 6116510..8fe78a3 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -362,9 +362,6 @@ $(obj)/cuImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) $(obj)/cuImage.%: vmlinux $(obj)/%.dtb $(wrapperbits) $(call if_changed,wrap,cuboot-$*,,$(obj)/$*.dtb) -$(obj)/cuImage.%: vmlinux $(obj)/fsl/%.dtb $(wrapperbits) - $(call if_changed,wrap,cuboot-$*,,$(obj)/fsl/$*.dtb) - $(obj)/simpleImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) $(call if_changed,wrap,simpleboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz) @@ -381,6 +378,9 @@ $(obj)/treeImage.%: vmlinux $(obj)/%.dtb $(wrapperbits) $(obj)/%.dtb: $(src)/dts/%.dts FORCE $(call if_changed_dep,dtc) +$(obj)/%.dtb: $(src)/dts/fsl/%.dts FORCE + $(call if_changed_dep,dtc) + # If there isn't a platform selected then just strip the vmlinux. ifeq (,$(image-y)) image-y := vmlinux.strip -- cgit v0.10.2 From a66639d4eec5e03bdc5d9a312b7eebd3827a4d14 Mon Sep 17 00:00:00 2001 From: Alessio Igor Bogani Date: Wed, 20 Apr 2016 10:18:29 +0200 Subject: powerpc/86xx: Move pci1 definition to the include file Signed-off-by: Alessio Igor Bogani Signed-off-by: Scott Wood diff --git a/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts b/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts index 0424fc2..c88d4ef 100644 --- a/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts +++ b/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts @@ -211,6 +211,10 @@ 0x0 0x00400000>; }; }; + + pci1: pcie@fef09000 { + status = "disabled"; + }; }; /include/ "mpc8641si-post.dtsi" diff --git a/arch/powerpc/boot/dts/fsl/gef_sbc310.dts b/arch/powerpc/boot/dts/fsl/gef_sbc310.dts index 84b3d38..8385157 100644 --- a/arch/powerpc/boot/dts/fsl/gef_sbc310.dts +++ b/arch/powerpc/boot/dts/fsl/gef_sbc310.dts @@ -24,10 +24,6 @@ model = "GEF_SBC310"; compatible = "gef,sbc310"; - aliases { - pci1 = &pci1; - }; - memory { device_type = "memory"; reg = <0x0 0x40000000>; // set by uboot @@ -223,29 +219,11 @@ }; pci1: pcie@fef09000 { - compatible = "fsl,mpc8641-pcie"; - device_type = "pci"; - #size-cells = <2>; - #address-cells = <3>; reg = <0xfef09000 0x1000>; - bus-range = <0x0 0xff>; ranges = <0x02000000 0x0 0xc0000000 0xc0000000 0x0 0x20000000 0x01000000 0x0 0x00000000 0xfe400000 0x0 0x00400000>; - clock-frequency = <100000000>; - interrupts = <0x19 0x2 0 0>; - interrupt-map-mask = <0xf800 0x0 0x0 0x7>; - interrupt-map = < - 0x0000 0x0 0x0 0x1 &mpic 0x4 0x2 - 0x0000 0x0 0x0 0x2 &mpic 0x5 0x2 - 0x0000 0x0 0x0 0x3 &mpic 0x6 0x2 - 0x0000 0x0 0x0 0x4 &mpic 0x7 0x2 - >; pcie@0 { - reg = <0 0 0 0 0>; - #size-cells = <2>; - #address-cells = <3>; - device_type = "pci"; ranges = <0x02000000 0x0 0xc0000000 0x02000000 0x0 0xc0000000 0x0 0x20000000 diff --git a/arch/powerpc/boot/dts/fsl/gef_sbc610.dts b/arch/powerpc/boot/dts/fsl/gef_sbc610.dts index 974446a..ff423ab 100644 --- a/arch/powerpc/boot/dts/fsl/gef_sbc610.dts +++ b/arch/powerpc/boot/dts/fsl/gef_sbc610.dts @@ -209,6 +209,10 @@ 0x0 0x00400000>; }; }; + + pci1: pcie@fef09000 { + status = "disabled"; + }; }; /include/ "mpc8641si-post.dtsi" diff --git a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts index 554001f..11bea3e 100644 --- a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts +++ b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts @@ -15,10 +15,6 @@ model = "MPC8641HPCN"; compatible = "fsl,mpc8641hpcn"; - aliases { - pci1 = &pci1; - }; - memory { device_type = "memory"; reg = <0x00000000 0x40000000>; // 1G at 0x0 @@ -359,29 +355,11 @@ }; pci1: pcie@ffe09000 { - compatible = "fsl,mpc8641-pcie"; - device_type = "pci"; - #size-cells = <2>; - #address-cells = <3>; reg = <0xffe09000 0x1000>; - bus-range = <0 0xff>; ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x20000000 0x01000000 0x0 0x00000000 0xffc10000 0x0 0x00010000>; - clock-frequency = <100000000>; - interrupts = <25 2 0 0>; - interrupt-map-mask = <0xf800 0 0 7>; - interrupt-map = < - /* IDSEL 0x0 */ - 0x0000 0 0 1 &mpic 4 1 - 0x0000 0 0 2 &mpic 5 1 - 0x0000 0 0 3 &mpic 6 1 - 0x0000 0 0 4 &mpic 7 1 - >; + pcie@0 { - reg = <0 0 0 0 0>; - #size-cells = <2>; - #address-cells = <3>; - device_type = "pci"; ranges = <0x02000000 0x0 0xa0000000 0x02000000 0x0 0xa0000000 0x0 0x20000000 diff --git a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts index fec5867..7ff6204 100644 --- a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts +++ b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts @@ -17,10 +17,6 @@ #address-cells = <2>; #size-cells = <2>; - aliases { - pci1 = &pci1; - }; - memory { device_type = "memory"; reg = <0x0 0x00000000 0x0 0x40000000>; // 1G at 0x0 @@ -326,29 +322,11 @@ }; pci1: pcie@fffe09000 { - compatible = "fsl,mpc8641-pcie"; - device_type = "pci"; - #size-cells = <2>; - #address-cells = <3>; reg = <0x0f 0xffe09000 0x0 0x1000>; - bus-range = <0x0 0xff>; ranges = <0x02000000 0x0 0xe0000000 0x0c 0x20000000 0x0 0x20000000 0x01000000 0x0 0x00000000 0x0f 0xffc10000 0x0 0x00010000>; - clock-frequency = <100000000>; - interrupts = <25 2 0 0>; - interrupt-map-mask = <0xf800 0 0 7>; - interrupt-map = < - /* IDSEL 0x0 */ - 0x0000 0 0 1 &mpic 4 1 - 0x0000 0 0 2 &mpic 5 1 - 0x0000 0 0 3 &mpic 6 1 - 0x0000 0 0 4 &mpic 7 1 - >; + pcie@0 { - reg = <0 0 0 0 0>; - #size-cells = <2>; - #address-cells = <3>; - device_type = "pci"; ranges = <0x02000000 0x0 0xe0000000 0x02000000 0x0 0xe0000000 0x0 0x20000000 diff --git a/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi b/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi index 70889d8..a76ffda 100644 --- a/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi @@ -113,6 +113,33 @@ pcie@0 { reg = <0 0 0 0 0>; + #interrupt-cells = <1>; + #size-cells = <2>; + #address-cells = <3>; + device_type = "pci"; + }; +}; + +&pci1 { + compatible = "fsl,mpc8641-pcie"; + device_type = "pci"; + #size-cells = <2>; + #address-cells = <3>; + bus-range = <0x0 0xff>; + clock-frequency = <100000000>; + interrupts = <25 2 0 0>; + interrupt-map-mask = <0xf800 0x0 0x0 0x7>; + + interrupt-map = < + 0x0000 0x0 0x0 0x1 &mpic 0x4 0x1 + 0x0000 0x0 0x0 0x2 &mpic 0x5 0x1 + 0x0000 0x0 0x0 0x3 &mpic 0x6 0x1 + 0x0000 0x0 0x0 0x4 &mpic 0x7 0x1 + >; + + pcie@0 { + reg = <0 0 0 0 0>; + #interrupt-cells = <1>; #size-cells = <2>; #address-cells = <3>; device_type = "pci"; diff --git a/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi b/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi index 9e03328..7c6db6f 100644 --- a/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi +++ b/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi @@ -25,6 +25,7 @@ serial0 = &serial0; serial1 = &serial1; pci0 = &pci0; + pci1 = &pci1; }; cpus { diff --git a/arch/powerpc/boot/dts/fsl/sbc8641d.dts b/arch/powerpc/boot/dts/fsl/sbc8641d.dts index 0a9733c..75870a1 100644 --- a/arch/powerpc/boot/dts/fsl/sbc8641d.dts +++ b/arch/powerpc/boot/dts/fsl/sbc8641d.dts @@ -19,10 +19,6 @@ model = "SBC8641D"; compatible = "wind,sbc8641"; - aliases { - pci1 = &pci1; - }; - memory { device_type = "memory"; reg = <0x00000000 0x20000000>; // 512M at 0x0 @@ -165,30 +161,11 @@ }; pci1: pcie@f8009000 { - compatible = "fsl,mpc8641-pcie"; - device_type = "pci"; - #size-cells = <2>; - #address-cells = <3>; reg = <0xf8009000 0x1000>; - bus-range = <0 0xff>; ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x20000000 0x01000000 0x0 0x00000000 0xe3000000 0x0 0x00100000>; - clock-frequency = <100000000>; - interrupts = <25 2 0 0>; - interrupt-map-mask = <0xf800 0 0 7>; - interrupt-map = < - /* IDSEL 0x0 */ - 0x0000 0 0 1 &mpic 4 1 - 0x0000 0 0 2 &mpic 5 1 - 0x0000 0 0 3 &mpic 6 1 - 0x0000 0 0 4 &mpic 7 1 - >; pcie@0 { - reg = <0 0 0 0 0>; - #size-cells = <2>; - #address-cells = <3>; - device_type = "pci"; ranges = <0x02000000 0x0 0xa0000000 0x02000000 0x0 0xa0000000 0x0 0x20000000 -- cgit v0.10.2 From 1eef33bec12d7e46a58f12b3604556a80c362a0d Mon Sep 17 00:00:00 2001 From: Alessio Igor Bogani Date: Tue, 3 May 2016 14:31:18 +0200 Subject: powerpc/86xx: Fix PCI interrupt map definition Fix PCI interrupt map definition from 2 to 4 cells. Move interrupt-map and interrupt-map-mask and clone interrupts into the pcie child nodes. Signed-off-by: Alessio Igor Bogani Signed-off-by: Scott Wood diff --git a/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi b/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi index a76ffda..eeb7c65 100644 --- a/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi @@ -102,14 +102,6 @@ bus-range = <0x0 0xff>; clock-frequency = <100000000>; interrupts = <24 2 0 0>; - interrupt-map-mask = <0xf800 0x0 0x0 0x7>; - - interrupt-map = < - 0x0000 0x0 0x0 0x1 &mpic 0x0 0x1 - 0x0000 0x0 0x0 0x2 &mpic 0x1 0x1 - 0x0000 0x0 0x0 0x3 &mpic 0x2 0x1 - 0x0000 0x0 0x0 0x4 &mpic 0x3 0x1 - >; pcie@0 { reg = <0 0 0 0 0>; @@ -117,6 +109,14 @@ #size-cells = <2>; #address-cells = <3>; device_type = "pci"; + interrupts = <24 2 0 0>; + interrupt-map-mask = <0xf800 0x0 0x0 0x7>; + interrupt-map = < + 0x0000 0x0 0x0 0x1 &mpic 0x0 0x1 0x0 0x0 + 0x0000 0x0 0x0 0x2 &mpic 0x1 0x1 0x0 0x0 + 0x0000 0x0 0x0 0x3 &mpic 0x2 0x1 0x0 0x0 + 0x0000 0x0 0x0 0x4 &mpic 0x3 0x1 0x0 0x0 + >; }; }; @@ -128,14 +128,6 @@ bus-range = <0x0 0xff>; clock-frequency = <100000000>; interrupts = <25 2 0 0>; - interrupt-map-mask = <0xf800 0x0 0x0 0x7>; - - interrupt-map = < - 0x0000 0x0 0x0 0x1 &mpic 0x4 0x1 - 0x0000 0x0 0x0 0x2 &mpic 0x5 0x1 - 0x0000 0x0 0x0 0x3 &mpic 0x6 0x1 - 0x0000 0x0 0x0 0x4 &mpic 0x7 0x1 - >; pcie@0 { reg = <0 0 0 0 0>; @@ -143,5 +135,13 @@ #size-cells = <2>; #address-cells = <3>; device_type = "pci"; + interrupts = <25 2 0 0>; + interrupt-map-mask = <0xf800 0x0 0x0 0x7>; + interrupt-map = < + 0x0000 0x0 0x0 0x1 &mpic 0x4 0x1 0x0 0x0 + 0x0000 0x0 0x0 0x2 &mpic 0x5 0x1 0x0 0x0 + 0x0000 0x0 0x0 0x3 &mpic 0x6 0x1 0x0 0x0 + 0x0000 0x0 0x0 0x4 &mpic 0x7 0x1 0x0 0x0 + >; }; }; -- cgit v0.10.2