From 202648a6070b69d60c6d0926ff06c8863e231468 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Date: Mon, 27 Apr 2015 21:48:47 +0900
Subject: powerpc: Constify irq_domain_ops

The irq_domain_ops are not modified by the driver and the irqdomain core
code accepts pointer to a const data.

Signed-off-by: Krzysztof Kozlowski <k.kozlowski.k@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c
index 45598da..31c3347 100644
--- a/arch/powerpc/sysdev/i8259.c
+++ b/arch/powerpc/sysdev/i8259.c
@@ -204,7 +204,7 @@ static int i8259_host_xlate(struct irq_domain *h, struct device_node *ct,
 	return 0;
 }
 
-static struct irq_domain_ops i8259_host_ops = {
+static const struct irq_domain_ops i8259_host_ops = {
 	.match = i8259_host_match,
 	.map = i8259_host_map,
 	.xlate = i8259_host_xlate,
diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c
index b287337..d78f136 100644
--- a/arch/powerpc/sysdev/ipic.c
+++ b/arch/powerpc/sysdev/ipic.c
@@ -691,7 +691,7 @@ static int ipic_host_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops ipic_host_ops = {
+static const struct irq_domain_ops ipic_host_ops = {
 	.match	= ipic_host_match,
 	.map	= ipic_host_map,
 	.xlate	= irq_domain_xlate_onetwocell,
diff --git a/arch/powerpc/sysdev/mpc8xx_pic.c b/arch/powerpc/sysdev/mpc8xx_pic.c
index c4828c0..d93a78be 100644
--- a/arch/powerpc/sysdev/mpc8xx_pic.c
+++ b/arch/powerpc/sysdev/mpc8xx_pic.c
@@ -120,7 +120,7 @@ static int mpc8xx_pic_host_xlate(struct irq_domain *h, struct device_node *ct,
 }
 
 
-static struct irq_domain_ops mpc8xx_pic_host_ops = {
+static const struct irq_domain_ops mpc8xx_pic_host_ops = {
 	.map = mpc8xx_pic_host_map,
 	.xlate = mpc8xx_pic_host_xlate,
 };
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index b2b8447..c8e7333 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -1195,7 +1195,7 @@ static void mpic_cascade(unsigned int irq, struct irq_desc *desc)
 	chip->irq_eoi(&desc->irq_data);
 }
 
-static struct irq_domain_ops mpic_host_ops = {
+static const struct irq_domain_ops mpic_host_ops = {
 	.match = mpic_host_match,
 	.map = mpic_host_map,
 	.xlate = mpic_host_xlate,
diff --git a/arch/powerpc/sysdev/mv64x60_pic.c b/arch/powerpc/sysdev/mv64x60_pic.c
index 8848e99..0f842dd 100644
--- a/arch/powerpc/sysdev/mv64x60_pic.c
+++ b/arch/powerpc/sysdev/mv64x60_pic.c
@@ -223,7 +223,7 @@ static int mv64x60_host_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops mv64x60_host_ops = {
+static const struct irq_domain_ops mv64x60_host_ops = {
 	.map   = mv64x60_host_map,
 };
 
diff --git a/arch/powerpc/sysdev/qe_lib/qe_ic.c b/arch/powerpc/sysdev/qe_lib/qe_ic.c
index 543765e..6512cd8 100644
--- a/arch/powerpc/sysdev/qe_lib/qe_ic.c
+++ b/arch/powerpc/sysdev/qe_lib/qe_ic.c
@@ -271,7 +271,7 @@ static int qe_ic_host_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops qe_ic_host_ops = {
+static const struct irq_domain_ops qe_ic_host_ops = {
 	.match = qe_ic_host_match,
 	.map = qe_ic_host_map,
 	.xlate = irq_domain_xlate_onetwocell,
diff --git a/arch/powerpc/sysdev/tsi108_pci.c b/arch/powerpc/sysdev/tsi108_pci.c
index 188012c..57b5447 100644
--- a/arch/powerpc/sysdev/tsi108_pci.c
+++ b/arch/powerpc/sysdev/tsi108_pci.c
@@ -397,7 +397,7 @@ static int pci_irq_host_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops pci_irq_domain_ops = {
+static const struct irq_domain_ops pci_irq_domain_ops = {
 	.map = pci_irq_host_map,
 	.xlate = pci_irq_host_xlate,
 };
diff --git a/arch/powerpc/sysdev/uic.c b/arch/powerpc/sysdev/uic.c
index 7c37157..1cd057f 100644
--- a/arch/powerpc/sysdev/uic.c
+++ b/arch/powerpc/sysdev/uic.c
@@ -189,7 +189,7 @@ static int uic_host_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops uic_host_ops = {
+static const struct irq_domain_ops uic_host_ops = {
 	.map	= uic_host_map,
 	.xlate	= irq_domain_xlate_twocell,
 };
diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c
index 878a540..5bc5889 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -360,7 +360,7 @@ static int xics_host_xlate(struct irq_domain *h, struct device_node *ct,
 	return 0;
 }
 
-static struct irq_domain_ops xics_host_ops = {
+static const struct irq_domain_ops xics_host_ops = {
 	.match = xics_host_match,
 	.map = xics_host_map,
 	.xlate = xics_host_xlate,
diff --git a/arch/powerpc/sysdev/xilinx_intc.c b/arch/powerpc/sysdev/xilinx_intc.c
index 56f0524..43b8b27 100644
--- a/arch/powerpc/sysdev/xilinx_intc.c
+++ b/arch/powerpc/sysdev/xilinx_intc.c
@@ -179,7 +179,7 @@ static int xilinx_intc_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops xilinx_intc_ops = {
+static const struct irq_domain_ops xilinx_intc_ops = {
 	.map = xilinx_intc_map,
 	.xlate = xilinx_intc_xlate,
 };
-- 
cgit v0.10.2


From 2222ce0fbbcc4ebfa9995c8d23d72c8239ad712c Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 29 Apr 2015 20:44:58 -0500
Subject: powerpc/pseries: Fix possible leaked device node reference

Failure return from dlpar_configure_connector when dlpar adding cpus
results in leaking references to the cpus parent device node. Move the
call to of_node_put() prior to checking the result of
dlpar_configure_connector.

Fixes: 8d5ff320766f ("powerpc/pseries: Make dlpar_configure_connector parent node aware")

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
index 019d34a..47d9cebe 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -421,11 +421,10 @@ static ssize_t dlpar_cpu_probe(const char *buf, size_t count)
 		return -ENODEV;
 
 	dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent);
+	of_node_put(parent);
 	if (!dn)
 		return -EINVAL;
 
-	of_node_put(parent);
-
 	rc = dlpar_attach_node(dn);
 	if (rc) {
 		dlpar_release_drc(drc_index);
-- 
cgit v0.10.2


From f1e7c202a98cb87cc650d99d014f87e6248ae530 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 25 Mar 2015 20:11:56 +1100
Subject: powerpc: Make STRICT_MM_TYPECHECKS a config option

The STRICT_MM_TYPECHECKS code has bit-rotted over the years. To make it
possible to easily build test it, make it a CONFIG option.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 0efa8f9..3a510f4 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -19,6 +19,14 @@ config PPC_WERROR
 	depends on !PPC_DISABLE_WERROR
 	default y
 
+config STRICT_MM_TYPECHECKS
+	bool "Do extra type checking on mm types"
+	default n
+	help
+	  This option turns on extra type checking for some mm related types.
+
+	  If you don't know what this means, say N.
+
 config PRINT_STACK_DEPTH
 	int "Stack depth to print" if DEBUG_KERNEL
 	default 64
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 69c0598..71294a6 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -278,9 +278,7 @@ extern long long virt_phys_offset;
 
 #ifndef __ASSEMBLY__
 
-#undef STRICT_MM_TYPECHECKS
-
-#ifdef STRICT_MM_TYPECHECKS
+#ifdef CONFIG_STRICT_MM_TYPECHECKS
 /* These are used to make use of C type-checking. */
 
 /* PTE level */
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 43e6ad4..f951d9c 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -118,7 +118,7 @@
  */
 #ifndef __real_pte
 
-#ifdef STRICT_MM_TYPECHECKS
+#ifdef CONFIG_STRICT_MM_TYPECHECKS
 #define __real_pte(e,p)		((real_pte_t){(e)})
 #define __rpte_to_pte(r)	((r).pte)
 #else
-- 
cgit v0.10.2


From 5af7a6f3e2d015dcaaeffa48c6d47238415cbe66 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Fri, 10 Apr 2015 11:52:06 +1000
Subject: powerpc/pasemi: Only the build the pasemi MSI code for PASEMI=y

The pasemi MSI code is currently always built when MPIC=y && PCI_MSI=y.
It should not have any effect on other platforms, because it immediately
checks the MPIC's compatible property for "pasemi,pwrficient-openpic".

However it's odd that it's still built even when PASEMI=n. It also
needn't be in sysdev, as it's only used by pasemi. So move it into
platforms/pasemi, whereby it will only be built for PASEMI=y.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/pasemi/Makefile b/arch/powerpc/platforms/pasemi/Makefile
index 8e8d4ca..60b4e0f 100644
--- a/arch/powerpc/platforms/pasemi/Makefile
+++ b/arch/powerpc/platforms/pasemi/Makefile
@@ -1,2 +1,3 @@
 obj-y	+= setup.o pci.o time.o idle.o powersave.o iommu.o dma_lib.o misc.o
 obj-$(CONFIG_PPC_PASEMI_MDIO)	+= gpio_mdio.o
+obj-$(CONFIG_PCI_MSI)		+= msi.o
diff --git a/arch/powerpc/platforms/pasemi/msi.c b/arch/powerpc/platforms/pasemi/msi.c
new file mode 100644
index 0000000..0b37066
--- /dev/null
+++ b/arch/powerpc/platforms/pasemi/msi.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright 2007, Olof Johansson, PA Semi
+ *
+ * Based on arch/powerpc/sysdev/mpic_u3msi.c:
+ *
+ * Copyright 2006, Segher Boessenkool, IBM Corporation.
+ * Copyright 2006-2007, Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2 of the
+ * License.
+ *
+ */
+
+#include <linux/irq.h>
+#include <linux/msi.h>
+#include <asm/mpic.h>
+#include <asm/prom.h>
+#include <asm/hw_irq.h>
+#include <asm/ppc-pci.h>
+#include <asm/msi_bitmap.h>
+
+#include <sysdev/mpic.h>
+
+/* Allocate 16 interrupts per device, to give an alignment of 16,
+ * since that's the size of the grouping w.r.t. affinity. If someone
+ * needs more than 32 MSI's down the road we'll have to rethink this,
+ * but it should be OK for now.
+ */
+#define ALLOC_CHUNK 16
+
+#define PASEMI_MSI_ADDR 0xfc080000
+
+/* A bit ugly, can we get this from the pci_dev somehow? */
+static struct mpic *msi_mpic;
+
+
+static void mpic_pasemi_msi_mask_irq(struct irq_data *data)
+{
+	pr_debug("mpic_pasemi_msi_mask_irq %d\n", data->irq);
+	pci_msi_mask_irq(data);
+	mpic_mask_irq(data);
+}
+
+static void mpic_pasemi_msi_unmask_irq(struct irq_data *data)
+{
+	pr_debug("mpic_pasemi_msi_unmask_irq %d\n", data->irq);
+	mpic_unmask_irq(data);
+	pci_msi_unmask_irq(data);
+}
+
+static struct irq_chip mpic_pasemi_msi_chip = {
+	.irq_shutdown		= mpic_pasemi_msi_mask_irq,
+	.irq_mask		= mpic_pasemi_msi_mask_irq,
+	.irq_unmask		= mpic_pasemi_msi_unmask_irq,
+	.irq_eoi		= mpic_end_irq,
+	.irq_set_type		= mpic_set_irq_type,
+	.irq_set_affinity	= mpic_set_affinity,
+	.name			= "PASEMI-MSI",
+};
+
+static void pasemi_msi_teardown_msi_irqs(struct pci_dev *pdev)
+{
+	struct msi_desc *entry;
+
+	pr_debug("pasemi_msi_teardown_msi_irqs, pdev %p\n", pdev);
+
+	list_for_each_entry(entry, &pdev->msi_list, list) {
+		if (entry->irq == NO_IRQ)
+			continue;
+
+		irq_set_msi_desc(entry->irq, NULL);
+		msi_bitmap_free_hwirqs(&msi_mpic->msi_bitmap,
+				       virq_to_hw(entry->irq), ALLOC_CHUNK);
+		irq_dispose_mapping(entry->irq);
+	}
+
+	return;
+}
+
+static int pasemi_msi_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
+{
+	unsigned int virq;
+	struct msi_desc *entry;
+	struct msi_msg msg;
+	int hwirq;
+
+	if (type == PCI_CAP_ID_MSIX)
+		pr_debug("pasemi_msi: MSI-X untested, trying anyway\n");
+	pr_debug("pasemi_msi_setup_msi_irqs, pdev %p nvec %d type %d\n",
+		 pdev, nvec, type);
+
+	msg.address_hi = 0;
+	msg.address_lo = PASEMI_MSI_ADDR;
+
+	list_for_each_entry(entry, &pdev->msi_list, list) {
+		/* Allocate 16 interrupts for now, since that's the grouping for
+		 * affinity. This can be changed later if it turns out 32 is too
+		 * few MSIs for someone, but restrictions will apply to how the
+		 * sources can be changed independently.
+		 */
+		hwirq = msi_bitmap_alloc_hwirqs(&msi_mpic->msi_bitmap,
+						ALLOC_CHUNK);
+		if (hwirq < 0) {
+			pr_debug("pasemi_msi: failed allocating hwirq\n");
+			return hwirq;
+		}
+
+		virq = irq_create_mapping(msi_mpic->irqhost, hwirq);
+		if (virq == NO_IRQ) {
+			pr_debug("pasemi_msi: failed mapping hwirq 0x%x\n",
+				  hwirq);
+			msi_bitmap_free_hwirqs(&msi_mpic->msi_bitmap, hwirq,
+					       ALLOC_CHUNK);
+			return -ENOSPC;
+		}
+
+		/* Vector on MSI is really an offset, the hardware adds
+		 * it to the value written at the magic address. So set
+		 * it to 0 to remain sane.
+		 */
+		mpic_set_vector(virq, 0);
+
+		irq_set_msi_desc(virq, entry);
+		irq_set_chip(virq, &mpic_pasemi_msi_chip);
+		irq_set_irq_type(virq, IRQ_TYPE_EDGE_RISING);
+
+		pr_debug("pasemi_msi: allocated virq 0x%x (hw 0x%x) " \
+			 "addr 0x%x\n", virq, hwirq, msg.address_lo);
+
+		/* Likewise, the device writes [0...511] into the target
+		 * register to generate MSI [512...1023]
+		 */
+		msg.data = hwirq-0x200;
+		pci_write_msi_msg(virq, &msg);
+	}
+
+	return 0;
+}
+
+int mpic_pasemi_msi_init(struct mpic *mpic)
+{
+	int rc;
+
+	if (!mpic->irqhost->of_node ||
+	    !of_device_is_compatible(mpic->irqhost->of_node,
+				     "pasemi,pwrficient-openpic"))
+		return -ENODEV;
+
+	rc = mpic_msi_init_allocator(mpic);
+	if (rc) {
+		pr_debug("pasemi_msi: Error allocating bitmap!\n");
+		return rc;
+	}
+
+	pr_debug("pasemi_msi: Registering PA Semi MPIC MSI callbacks\n");
+
+	msi_mpic = mpic;
+	WARN_ON(ppc_md.setup_msi_irqs);
+	ppc_md.setup_msi_irqs = pasemi_msi_setup_msi_irqs;
+	ppc_md.teardown_msi_irqs = pasemi_msi_teardown_msi_irqs;
+
+	return 0;
+}
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index f7cb2a1..5b492a6 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -2,7 +2,7 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 
 ccflags-$(CONFIG_PPC64)		:= $(NO_MINIMAL_TOC)
 
-mpic-msi-obj-$(CONFIG_PCI_MSI)	+= mpic_msi.o mpic_u3msi.o mpic_pasemi_msi.o
+mpic-msi-obj-$(CONFIG_PCI_MSI)	+= mpic_msi.o mpic_u3msi.o
 obj-$(CONFIG_MPIC)		+= mpic.o $(mpic-msi-obj-y)
 obj-$(CONFIG_MPIC_TIMER)        += mpic_timer.o
 obj-$(CONFIG_FSL_MPIC_TIMER_WAKEUP)	+= fsl_mpic_timer_wakeup.o
diff --git a/arch/powerpc/sysdev/mpic.h b/arch/powerpc/sysdev/mpic.h
index 24bf07a..32971a4 100644
--- a/arch/powerpc/sysdev/mpic.h
+++ b/arch/powerpc/sysdev/mpic.h
@@ -15,7 +15,6 @@
 extern void mpic_msi_reserve_hwirq(struct mpic *mpic, irq_hw_number_t hwirq);
 extern int mpic_msi_init_allocator(struct mpic *mpic);
 extern int mpic_u3msi_init(struct mpic *mpic);
-extern int mpic_pasemi_msi_init(struct mpic *mpic);
 #else
 static inline void mpic_msi_reserve_hwirq(struct mpic *mpic,
 					  irq_hw_number_t hwirq)
@@ -27,11 +26,12 @@ static inline int mpic_u3msi_init(struct mpic *mpic)
 {
 	return -1;
 }
+#endif
 
-static inline int mpic_pasemi_msi_init(struct mpic *mpic)
-{
-	return -1;
-}
+#if defined(CONFIG_PCI_MSI) && defined(CONFIG_PPC_PASEMI)
+int mpic_pasemi_msi_init(struct mpic *mpic);
+#else
+static inline int mpic_pasemi_msi_init(struct mpic *mpic) { return -1; }
 #endif
 
 extern int mpic_set_irq_type(struct irq_data *d, unsigned int flow_type);
diff --git a/arch/powerpc/sysdev/mpic_pasemi_msi.c b/arch/powerpc/sysdev/mpic_pasemi_msi.c
deleted file mode 100644
index a3f660e..0000000
--- a/arch/powerpc/sysdev/mpic_pasemi_msi.c
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Copyright 2007, Olof Johansson, PA Semi
- *
- * Based on arch/powerpc/sysdev/mpic_u3msi.c:
- *
- * Copyright 2006, Segher Boessenkool, IBM Corporation.
- * Copyright 2006-2007, Michael Ellerman, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2 of the
- * License.
- *
- */
-
-#undef DEBUG
-
-#include <linux/irq.h>
-#include <linux/msi.h>
-#include <asm/mpic.h>
-#include <asm/prom.h>
-#include <asm/hw_irq.h>
-#include <asm/ppc-pci.h>
-#include <asm/msi_bitmap.h>
-
-#include "mpic.h"
-
-/* Allocate 16 interrupts per device, to give an alignment of 16,
- * since that's the size of the grouping w.r.t. affinity. If someone
- * needs more than 32 MSI's down the road we'll have to rethink this,
- * but it should be OK for now.
- */
-#define ALLOC_CHUNK 16
-
-#define PASEMI_MSI_ADDR 0xfc080000
-
-/* A bit ugly, can we get this from the pci_dev somehow? */
-static struct mpic *msi_mpic;
-
-
-static void mpic_pasemi_msi_mask_irq(struct irq_data *data)
-{
-	pr_debug("mpic_pasemi_msi_mask_irq %d\n", data->irq);
-	pci_msi_mask_irq(data);
-	mpic_mask_irq(data);
-}
-
-static void mpic_pasemi_msi_unmask_irq(struct irq_data *data)
-{
-	pr_debug("mpic_pasemi_msi_unmask_irq %d\n", data->irq);
-	mpic_unmask_irq(data);
-	pci_msi_unmask_irq(data);
-}
-
-static struct irq_chip mpic_pasemi_msi_chip = {
-	.irq_shutdown		= mpic_pasemi_msi_mask_irq,
-	.irq_mask		= mpic_pasemi_msi_mask_irq,
-	.irq_unmask		= mpic_pasemi_msi_unmask_irq,
-	.irq_eoi		= mpic_end_irq,
-	.irq_set_type		= mpic_set_irq_type,
-	.irq_set_affinity	= mpic_set_affinity,
-	.name			= "PASEMI-MSI",
-};
-
-static void pasemi_msi_teardown_msi_irqs(struct pci_dev *pdev)
-{
-	struct msi_desc *entry;
-
-	pr_debug("pasemi_msi_teardown_msi_irqs, pdev %p\n", pdev);
-
-	list_for_each_entry(entry, &pdev->msi_list, list) {
-		if (entry->irq == NO_IRQ)
-			continue;
-
-		irq_set_msi_desc(entry->irq, NULL);
-		msi_bitmap_free_hwirqs(&msi_mpic->msi_bitmap,
-				       virq_to_hw(entry->irq), ALLOC_CHUNK);
-		irq_dispose_mapping(entry->irq);
-	}
-
-	return;
-}
-
-static int pasemi_msi_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
-{
-	unsigned int virq;
-	struct msi_desc *entry;
-	struct msi_msg msg;
-	int hwirq;
-
-	if (type == PCI_CAP_ID_MSIX)
-		pr_debug("pasemi_msi: MSI-X untested, trying anyway\n");
-	pr_debug("pasemi_msi_setup_msi_irqs, pdev %p nvec %d type %d\n",
-		 pdev, nvec, type);
-
-	msg.address_hi = 0;
-	msg.address_lo = PASEMI_MSI_ADDR;
-
-	list_for_each_entry(entry, &pdev->msi_list, list) {
-		/* Allocate 16 interrupts for now, since that's the grouping for
-		 * affinity. This can be changed later if it turns out 32 is too
-		 * few MSIs for someone, but restrictions will apply to how the
-		 * sources can be changed independently.
-		 */
-		hwirq = msi_bitmap_alloc_hwirqs(&msi_mpic->msi_bitmap,
-						ALLOC_CHUNK);
-		if (hwirq < 0) {
-			pr_debug("pasemi_msi: failed allocating hwirq\n");
-			return hwirq;
-		}
-
-		virq = irq_create_mapping(msi_mpic->irqhost, hwirq);
-		if (virq == NO_IRQ) {
-			pr_debug("pasemi_msi: failed mapping hwirq 0x%x\n",
-				  hwirq);
-			msi_bitmap_free_hwirqs(&msi_mpic->msi_bitmap, hwirq,
-					       ALLOC_CHUNK);
-			return -ENOSPC;
-		}
-
-		/* Vector on MSI is really an offset, the hardware adds
-		 * it to the value written at the magic address. So set
-		 * it to 0 to remain sane.
-		 */
-		mpic_set_vector(virq, 0);
-
-		irq_set_msi_desc(virq, entry);
-		irq_set_chip(virq, &mpic_pasemi_msi_chip);
-		irq_set_irq_type(virq, IRQ_TYPE_EDGE_RISING);
-
-		pr_debug("pasemi_msi: allocated virq 0x%x (hw 0x%x) " \
-			 "addr 0x%x\n", virq, hwirq, msg.address_lo);
-
-		/* Likewise, the device writes [0...511] into the target
-		 * register to generate MSI [512...1023]
-		 */
-		msg.data = hwirq-0x200;
-		pci_write_msi_msg(virq, &msg);
-	}
-
-	return 0;
-}
-
-int mpic_pasemi_msi_init(struct mpic *mpic)
-{
-	int rc;
-
-	if (!mpic->irqhost->of_node ||
-	    !of_device_is_compatible(mpic->irqhost->of_node,
-				     "pasemi,pwrficient-openpic"))
-		return -ENODEV;
-
-	rc = mpic_msi_init_allocator(mpic);
-	if (rc) {
-		pr_debug("pasemi_msi: Error allocating bitmap!\n");
-		return rc;
-	}
-
-	pr_debug("pasemi_msi: Registering PA Semi MPIC MSI callbacks\n");
-
-	msi_mpic = mpic;
-	WARN_ON(ppc_md.setup_msi_irqs);
-	ppc_md.setup_msi_irqs = pasemi_msi_setup_msi_irqs;
-	ppc_md.teardown_msi_irqs = pasemi_msi_teardown_msi_irqs;
-
-	return 0;
-}
-- 
cgit v0.10.2


From 5c0aebf6e101e9ea6ddea7aaba13582c89206333 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 15 Apr 2015 12:17:07 +1000
Subject: powerpc: Show utsname->machine in boot-up banner

Currently we print "Starting Linux PPC64" at boot. But we don't mention
anywhere whether the kernel is big or little endian.

If we print the utsname->machine value instead we get either "ppc64" or
"ppc64le" which is much more informative, eg:

  Starting Linux ppc64le #1 SMP Wed Apr 15 12:12:20 AEST 2015

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index c69671c..9fe3dcd 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -523,7 +523,8 @@ void __init setup_system(void)
 	smp_release_cpus();
 #endif
 
-	pr_info("Starting Linux PPC64 %s\n", init_utsname()->version);
+	pr_info("Starting Linux %s %s\n", init_utsname()->machine,
+		 init_utsname()->version);
 
 	pr_info("-----------------------------------------------------\n");
 	pr_info("ppc64_pft_size    = 0x%llx\n", ppc64_pft_size);
-- 
cgit v0.10.2


From e79c8385c878abdf90290cba6bea08fd60058562 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 21 Apr 2015 22:26:57 +1000
Subject: powerpc: Don't do gcc version checks if we're building with clang

We have several checks for bad gcc versions in our Makefile. These don't
apply if we're building with clang, so skip them in that case.

The obvious check would be for ${COMPILER} = "gcc", but because of the
way the logic in the top level Makefile conditionally sets COMPILER,
it's possible that we're building with gcc but COMPILER was not set.

So instead check for ${COMPILER} != "clang", which we know is currently
the only other possibility.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 07a4808..3609504 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -314,7 +314,8 @@ TOUT	:= .tmp_gas_check
 # - Require gcc 4.0 or above on 64-bit
 # - gcc-4.2.0 has issues compiling modules on 64-bit
 checkbin:
-	@if test "$(cc-version)" = "0304" ; then \
+	@if test "${COMPILER}" != "clang" \
+	    && test "$(cc-version)" = "0304" ; then \
 		if ! /bin/echo mftb 5 | $(AS) -v -mppc -many -o $(TOUT) >/dev/null 2>&1 ; then \
 			echo -n '*** ${VERSION}.${PATCHLEVEL} kernels no longer build '; \
 			echo 'correctly with gcc-3.4 and your version of binutils.'; \
@@ -322,13 +323,15 @@ checkbin:
 			false; \
 		fi ; \
 	fi
-	@if test "$(cc-version)" -lt "0400" \
+	@if test "${COMPILER}" != "clang" \
+	    && test "$(cc-version)" -lt "0400" \
 	    && test "x${CONFIG_PPC64}" = "xy" ; then \
                 echo -n "Sorry, GCC v4.0 or above is required to build " ; \
                 echo "the 64-bit powerpc kernel." ; \
                 false ; \
         fi
-	@if test "$(cc-fullversion)" = "040200" \
+	@if test "${COMPILER}" != "clang" \
+	    && test "$(cc-fullversion)" = "040200" \
 	    && test "x${CONFIG_MODULES}${CONFIG_PPC64}" = "xyy" ; then \
 		echo -n '*** GCC-4.2.0 cannot compile the 64-bit powerpc ' ; \
 		echo 'kernel with modules enabled.' ; \
-- 
cgit v0.10.2


From 60e065f70bdb0b0e916389024922ad40f3270c96 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 23 Apr 2015 17:27:12 +1000
Subject: powerpc: Reject binutils 2.24 when building little endian

There is a bug in binutils 2.24 which causes miscompilation if we're
building little endian and using weak symbols (which the kernel does).

It is fixed in binutils commit 57fa7b8c7e59 "Correct elf_merge_st_other
arguments for weak symbols", which is in binutils 2.25 and has been
backported to the binutils 2.24 branch and has been picked up by most
distros it seems.

However if we're running stock 2.24 (no extra version) then the bug is
present, so check for that and bail.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 3609504..a314cb0 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -339,6 +339,14 @@ checkbin:
 		echo 'disable kernel modules' ; \
 		false ; \
 	fi
+	@if test "x${CONFIG_CPU_LITTLE_ENDIAN}" = "xy" \
+	    && $(LD) --version | head -1 | grep ' 2\.24$$' >/dev/null ; then \
+		echo -n '*** binutils 2.24 miscompiles weak symbols ' ; \
+		echo 'in some circumstances.' ; \
+		echo -n '*** Please use a different binutils version.' ; \
+		false ; \
+	fi
+
 
 CLEAN_FILES += $(TOUT)
 
-- 
cgit v0.10.2


From 63da88dd48268af415b3677e20f6bcb92c1bf32c Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 22 Apr 2015 15:40:34 +1000
Subject: powerpc/vdso: Remove unused debug code

It's in the git history if we ever need it back.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 305eb0d..869fd0c 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -140,50 +140,6 @@ struct lib64_elfinfo
 };
 
 
-#ifdef __DEBUG
-static void dump_one_vdso_page(struct page *pg, struct page *upg)
-{
-	printk("kpg: %p (c:%d,f:%08lx)", __va(page_to_pfn(pg) << PAGE_SHIFT),
-	       page_count(pg),
-	       pg->flags);
-	if (upg && !IS_ERR(upg) /* && pg != upg*/) {
-		printk(" upg: %p (c:%d,f:%08lx)", __va(page_to_pfn(upg)
-						       << PAGE_SHIFT),
-		       page_count(upg),
-		       upg->flags);
-	}
-	printk("\n");
-}
-
-static void dump_vdso_pages(struct vm_area_struct * vma)
-{
-	int i;
-
-	if (!vma || is_32bit_task()) {
-		printk("vDSO32 @ %016lx:\n", (unsigned long)vdso32_kbase);
-		for (i=0; i<vdso32_pages; i++) {
-			struct page *pg = virt_to_page(vdso32_kbase +
-						       i*PAGE_SIZE);
-			struct page *upg = (vma && vma->vm_mm) ?
-				follow_page(vma, vma->vm_start + i*PAGE_SIZE, 0)
-				: NULL;
-			dump_one_vdso_page(pg, upg);
-		}
-	}
-	if (!vma || !is_32bit_task()) {
-		printk("vDSO64 @ %016lx:\n", (unsigned long)vdso64_kbase);
-		for (i=0; i<vdso64_pages; i++) {
-			struct page *pg = virt_to_page(vdso64_kbase +
-						       i*PAGE_SIZE);
-			struct page *upg = (vma && vma->vm_mm) ?
-				follow_page(vma, vma->vm_start + i*PAGE_SIZE, 0)
-				: NULL;
-			dump_one_vdso_page(pg, upg);
-		}
-	}
-}
-#endif /* DEBUG */
-
 /*
  * This is called from binfmt_elf, we create the special vma for the
  * vDSO and insert it into the mm struct tree
-- 
cgit v0.10.2


From 6e5c077519f2661c56647fc21b2ec2ba0a50bb75 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 22 Apr 2015 15:40:35 +1000
Subject: powerpc/vdso: Combine start/size variables

In vdso_fixup_features() we have start64/start32 and size64/size32, but
they have the same types, ie. void * and unsigned long.

They're only used to save the return value from find_sectionXX() for the
subsequent call to do_feature_fixups(), so there's no overlap in their
usage either.

So we can just consolidate them into start/size and avoid the
duplication.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 869fd0c..8331d0b 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -525,55 +525,52 @@ static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32,
 static __init int vdso_fixup_features(struct lib32_elfinfo *v32,
 				      struct lib64_elfinfo *v64)
 {
-	void *start32;
-	unsigned long size32;
+	unsigned long size;
+	void *start;
 
 #ifdef CONFIG_PPC64
-	void *start64;
-	unsigned long size64;
-
-	start64 = find_section64(v64->hdr, "__ftr_fixup", &size64);
-	if (start64)
+	start = find_section64(v64->hdr, "__ftr_fixup", &size);
+	if (start)
 		do_feature_fixups(cur_cpu_spec->cpu_features,
-				  start64, start64 + size64);
+				  start, start + size);
 
-	start64 = find_section64(v64->hdr, "__mmu_ftr_fixup", &size64);
-	if (start64)
+	start = find_section64(v64->hdr, "__mmu_ftr_fixup", &size);
+	if (start)
 		do_feature_fixups(cur_cpu_spec->mmu_features,
-				  start64, start64 + size64);
+				  start, start + size);
 
-	start64 = find_section64(v64->hdr, "__fw_ftr_fixup", &size64);
-	if (start64)
+	start = find_section64(v64->hdr, "__fw_ftr_fixup", &size);
+	if (start)
 		do_feature_fixups(powerpc_firmware_features,
-				  start64, start64 + size64);
+				  start, start + size);
 
-	start64 = find_section64(v64->hdr, "__lwsync_fixup", &size64);
-	if (start64)
+	start = find_section64(v64->hdr, "__lwsync_fixup", &size);
+	if (start)
 		do_lwsync_fixups(cur_cpu_spec->cpu_features,
-				 start64, start64 + size64);
+				 start, start + size);
 #endif /* CONFIG_PPC64 */
 
-	start32 = find_section32(v32->hdr, "__ftr_fixup", &size32);
-	if (start32)
+	start = find_section32(v32->hdr, "__ftr_fixup", &size);
+	if (start)
 		do_feature_fixups(cur_cpu_spec->cpu_features,
-				  start32, start32 + size32);
+				  start, start + size);
 
-	start32 = find_section32(v32->hdr, "__mmu_ftr_fixup", &size32);
-	if (start32)
+	start = find_section32(v32->hdr, "__mmu_ftr_fixup", &size);
+	if (start)
 		do_feature_fixups(cur_cpu_spec->mmu_features,
-				  start32, start32 + size32);
+				  start, start + size);
 
 #ifdef CONFIG_PPC64
-	start32 = find_section32(v32->hdr, "__fw_ftr_fixup", &size32);
-	if (start32)
+	start = find_section32(v32->hdr, "__fw_ftr_fixup", &size);
+	if (start)
 		do_feature_fixups(powerpc_firmware_features,
-				  start32, start32 + size32);
+				  start, start + size);
 #endif /* CONFIG_PPC64 */
 
-	start32 = find_section32(v32->hdr, "__lwsync_fixup", &size32);
-	if (start32)
+	start = find_section32(v32->hdr, "__lwsync_fixup", &size);
+	if (start)
 		do_lwsync_fixups(cur_cpu_spec->cpu_features,
-				 start32, start32 + size32);
+				 start, start + size);
 
 	return 0;
 }
-- 
cgit v0.10.2


From e0d0059169945c8ee16790d2e7244cea397dfd56 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 11 May 2015 20:01:02 +1000
Subject: powerpc/vdso: Disable building the 32-bit VDSO on little endian

The only little endian configuration we support is ppc64le. As such if
we're building little endian we don't need a 32-bit VDSO, because there
is no 32-bit userspace.

This patch is a fairly ugly mess of #ifdefs, but is the minimal logic
required to disable the 32-bit VDSO. We can hopefully clean up the
result in future with some further refactoring.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index c1ebbda..87c7d14 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -33,11 +33,12 @@ obj-y				:= cputable.o ptrace.o syscalls.o \
 				   signal.o sysfs.o cacheinfo.o time.o \
 				   prom.o traps.o setup-common.o \
 				   udbg.o misc.o io.o dma.o \
-				   misc_$(CONFIG_WORD_SIZE).o vdso32/ \
+				   misc_$(CONFIG_WORD_SIZE).o \
 				   of_platform.o prom_parse.o
 obj-$(CONFIG_PPC64)		+= setup_64.o sys_ppc32.o \
 				   signal_64.o ptrace32.o \
 				   paca.o nvram_64.o firmware.o
+obj-$(CONFIG_VDSO32)		+= vdso32/
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
 obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_ppc970.o cpu_setup_pa6t.o
 obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_power.o
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 8331d0b..b457bfa 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -49,13 +49,16 @@
 /* The alignment of the vDSO */
 #define VDSO_ALIGNMENT	(1 << 16)
 
-extern char vdso32_start, vdso32_end;
-static void *vdso32_kbase = &vdso32_start;
 static unsigned int vdso32_pages;
+static void *vdso32_kbase;
 static struct page **vdso32_pagelist;
 unsigned long vdso32_sigtramp;
 unsigned long vdso32_rt_sigtramp;
 
+#ifdef CONFIG_VDSO32
+extern char vdso32_start, vdso32_end;
+#endif
+
 #ifdef CONFIG_PPC64
 extern char vdso64_start, vdso64_end;
 static void *vdso64_kbase = &vdso64_start;
@@ -248,6 +251,7 @@ const char *arch_vma_name(struct vm_area_struct *vma)
 
 
+#ifdef CONFIG_VDSO32
 static void * __init find_section32(Elf32_Ehdr *ehdr, const char *secname,
 				  unsigned long *size)
 {
@@ -335,6 +339,20 @@ static int __init vdso_do_func_patch32(struct lib32_elfinfo *v32,
 
 	return 0;
 }
+#else /* !CONFIG_VDSO32 */
+static unsigned long __init find_function32(struct lib32_elfinfo *lib,
+					    const char *symname)
+{
+	return 0;
+}
+
+static int __init vdso_do_func_patch32(struct lib32_elfinfo *v32,
+				       struct lib64_elfinfo *v64,
+				       const char *orig, const char *fix)
+{
+	return 0;
+}
+#endif /* CONFIG_VDSO32 */
 
 
 #ifdef CONFIG_PPC64
@@ -445,6 +463,7 @@ static __init int vdso_do_find_sections(struct lib32_elfinfo *v32,
 	 * Locate symbol tables & text section
 	 */
 
+#ifdef CONFIG_VDSO32
 	v32->dynsym = find_section32(v32->hdr, ".dynsym", &v32->dynsymsize);
 	v32->dynstr = find_section32(v32->hdr, ".dynstr", NULL);
 	if (v32->dynsym == NULL || v32->dynstr == NULL) {
@@ -457,6 +476,7 @@ static __init int vdso_do_find_sections(struct lib32_elfinfo *v32,
 		return -1;
 	}
 	v32->text = sect - vdso32_kbase;
+#endif
 
 #ifdef CONFIG_PPC64
 	v64->dynsym = find_section64(v64->hdr, ".dynsym", &v64->dynsymsize);
@@ -493,7 +513,9 @@ static __init void vdso_setup_trampolines(struct lib32_elfinfo *v32,
 static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32,
 				       struct lib64_elfinfo *v64)
 {
+#ifdef CONFIG_VDSO32
 	Elf32_Sym *sym32;
+#endif
 #ifdef CONFIG_PPC64
 	Elf64_Sym *sym64;
 
@@ -508,6 +530,7 @@ static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32,
 		(sym64->st_value - VDSO64_LBASE);
 #endif /* CONFIG_PPC64 */
 
+#ifdef CONFIG_VDSO32
 	sym32 = find_symbol32(v32, "__kernel_datapage_offset");
 	if (sym32 == NULL) {
 		printk(KERN_ERR "vDSO32: Can't find symbol "
@@ -517,6 +540,7 @@ static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32,
 	*((int *)(vdso32_kbase + (sym32->st_value - VDSO32_LBASE))) =
 		(vdso32_pages << PAGE_SHIFT) -
 		(sym32->st_value - VDSO32_LBASE);
+#endif
 
 	return 0;
 }
@@ -550,6 +574,7 @@ static __init int vdso_fixup_features(struct lib32_elfinfo *v32,
 				 start, start + size);
 #endif /* CONFIG_PPC64 */
 
+#ifdef CONFIG_VDSO32
 	start = find_section32(v32->hdr, "__ftr_fixup", &size);
 	if (start)
 		do_feature_fixups(cur_cpu_spec->cpu_features,
@@ -571,6 +596,7 @@ static __init int vdso_fixup_features(struct lib32_elfinfo *v32,
 	if (start)
 		do_lwsync_fixups(cur_cpu_spec->cpu_features,
 				 start, start + size);
+#endif
 
 	return 0;
 }
@@ -732,11 +758,15 @@ static int __init vdso_init(void)
 #endif /* CONFIG_PPC64 */
 
 
+#ifdef CONFIG_VDSO32
+	vdso32_kbase = &vdso32_start;
+
 	/*
 	 * Calculate the size of the 32 bits vDSO
 	 */
 	vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT;
 	DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages);
+#endif
 
 
 	/*
@@ -757,6 +787,7 @@ static int __init vdso_init(void)
 		return 0;
 	}
 
+#ifdef CONFIG_VDSO32
 	/* Make sure pages are in the correct state */
 	vdso32_pagelist = kzalloc(sizeof(struct page *) * (vdso32_pages + 2),
 				  GFP_KERNEL);
@@ -769,6 +800,7 @@ static int __init vdso_init(void)
 	}
 	vdso32_pagelist[i++] = virt_to_page(vdso_data);
 	vdso32_pagelist[i] = NULL;
+#endif
 
 #ifdef CONFIG_PPC64
 	vdso64_pagelist = kzalloc(sizeof(struct page *) * (vdso64_pages + 2),
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 7264e91..724ecc7 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -405,6 +405,16 @@ config PPC_DOORBELL
 
 endmenu
 
+config VDSO32
+	def_bool y
+	depends on PPC32 || CPU_BIG_ENDIAN
+	help
+	  This symbol controls whether we build the 32-bit VDSO. We obviously
+	  want to do that if we're building a 32-bit kernel. If we're building
+	  a 64-bit kernel then we only want a 32-bit VDSO if we're building for
+	  big endian. That is because the only little endian configuration we
+	  support is ppc64le which is 64-bit only.
+
 choice
 	prompt "Endianness selection"
 	default CPU_BIG_ENDIAN
-- 
cgit v0.10.2


From a14ab6b6e0ad54bf872ce987f5b48f338d6780c8 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Fri, 1 May 2015 13:20:25 +1000
Subject: powerpc/cell: Drop cbe-oss-dev mailing list from MAINTAINERS

Traffic on the cbe-oss-dev list is more or less non-existent, other than
CC's from linuxppc.

It's seems like we may as well just send everyone to linuxppc and
archive the list.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Acked-by: Jeremy Kerr <jk@ozlabs.org>

diff --git a/MAINTAINERS b/MAINTAINERS
index 590304b..562ae4e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2439,7 +2439,6 @@ F:	Documentation/devicetree/bindings/net/ieee802154/cc2520.txt
 CELL BROADBAND ENGINE ARCHITECTURE
 M:	Arnd Bergmann <arnd@arndb.de>
 L:	linuxppc-dev@lists.ozlabs.org
-L:	cbe-oss-dev@lists.ozlabs.org
 W:	http://www.ibm.com/developerworks/power/cell/
 S:	Supported
 F:	arch/powerpc/include/asm/cell*.h
@@ -7840,14 +7839,13 @@ F:	drivers/net/wireless/prism54/
 PS3 NETWORK SUPPORT
 M:	Geoff Levand <geoff@infradead.org>
 L:	netdev@vger.kernel.org
-L:	cbe-oss-dev@lists.ozlabs.org
+L:	linuxppc-dev@lists.ozlabs.org
 S:	Maintained
 F:	drivers/net/ethernet/toshiba/ps3_gelic_net.*
 
 PS3 PLATFORM SUPPORT
 M:	Geoff Levand <geoff@infradead.org>
 L:	linuxppc-dev@lists.ozlabs.org
-L:	cbe-oss-dev@lists.ozlabs.org
 S:	Maintained
 F:	arch/powerpc/boot/ps3*
 F:	arch/powerpc/include/asm/lv1call.h
@@ -7861,7 +7859,7 @@ F:	sound/ppc/snd_ps3*
 
 PS3VRAM DRIVER
 M:	Jim Paris <jim@jtan.com>
-L:	cbe-oss-dev@lists.ozlabs.org
+L:	linuxppc-dev@lists.ozlabs.org
 S:	Maintained
 F:	drivers/block/ps3vram.c
 
@@ -9337,7 +9335,6 @@ F:	drivers/net/ethernet/toshiba/spider_net*
 SPU FILE SYSTEM
 M:	Jeremy Kerr <jk@ozlabs.org>
 L:	linuxppc-dev@lists.ozlabs.org
-L:	cbe-oss-dev@lists.ozlabs.org
 W:	http://www.ibm.com/developerworks/power/cell/
 S:	Supported
 F:	Documentation/filesystems/spufs.txt
-- 
cgit v0.10.2


From 38c0488770983b2654f075540cc1fc71f55b6df5 Mon Sep 17 00:00:00 2001
From: Joel Stanley <joel@jms.id.au>
Date: Thu, 30 Apr 2015 13:50:07 +1000
Subject: powerpc/powernv: Silence SYSPARAM warning on boot

OpenPower BMC machines do not place any sysparams in the device tree, so
at every boot we get a warning:

 [    0.437176] SYSPARAM: Opal sysparam node not found

Remove the warning, and reorder the init so we don't peform allocations
when there is no sysparam node in the device tree.

Signed-off-by: Joel Stanley <joel@jms.id.au>
Acked-by: Neelesh Gupta <neelegup@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/opal-sysparam.c b/arch/powerpc/platforms/powernv/opal-sysparam.c
index 9d1acf2..2e52b47 100644
--- a/arch/powerpc/platforms/powernv/opal-sysparam.c
+++ b/arch/powerpc/platforms/powernv/opal-sysparam.c
@@ -162,10 +162,20 @@ void __init opal_sys_param_init(void)
 		goto out;
 	}
 
+	/* Some systems do not use sysparams; this is not an error */
+	sysparam = of_find_node_by_path("/ibm,opal/sysparams");
+	if (!sysparam)
+		goto out;
+
+	if (!of_device_is_compatible(sysparam, "ibm,opal-sysparams")) {
+		pr_err("SYSPARAM: Opal sysparam node not compatible\n");
+		goto out_node_put;
+	}
+
 	sysparam_kobj = kobject_create_and_add("sysparams", opal_kobj);
 	if (!sysparam_kobj) {
 		pr_err("SYSPARAM: Failed to create sysparam kobject\n");
-		goto out;
+		goto out_node_put;
 	}
 
 	/* Allocate big enough buffer for any get/set transactions */
@@ -176,30 +186,19 @@ void __init opal_sys_param_init(void)
 		goto out_kobj_put;
 	}
 
-	sysparam = of_find_node_by_path("/ibm,opal/sysparams");
-	if (!sysparam) {
-		pr_err("SYSPARAM: Opal sysparam node not found\n");
-		goto out_param_buf;
-	}
-
-	if (!of_device_is_compatible(sysparam, "ibm,opal-sysparams")) {
-		pr_err("SYSPARAM: Opal sysparam node not compatible\n");
-		goto out_node_put;
-	}
-
 	/* Number of parameters exposed through DT */
 	count = of_property_count_strings(sysparam, "param-name");
 	if (count < 0) {
 		pr_err("SYSPARAM: No string found of property param-name in "
 				"the node %s\n", sysparam->name);
-		goto out_node_put;
+		goto out_param_buf;
 	}
 
 	id = kzalloc(sizeof(*id) * count, GFP_KERNEL);
 	if (!id) {
 		pr_err("SYSPARAM: Failed to allocate memory to read parameter "
 				"id\n");
-		goto out_node_put;
+		goto out_param_buf;
 	}
 
 	size = kzalloc(sizeof(*size) * count, GFP_KERNEL);
@@ -293,12 +292,12 @@ out_free_size:
 	kfree(size);
 out_free_id:
 	kfree(id);
-out_node_put:
-	of_node_put(sysparam);
 out_param_buf:
 	kfree(param_data_buf);
 out_kobj_put:
 	kobject_put(sysparam_kobj);
+out_node_put:
+	of_node_put(sysparam);
 out:
 	return;
 }
-- 
cgit v0.10.2


From ed3e81ff2016b180d8f439a1981f2c8b59b0b53c Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Thu, 26 Mar 2015 16:42:07 +1100
Subject: powerpc/eeh: Move PE state constants around

There are two equivalent sets of PE state constants, defined in
arch/powerpc/include/asm/eeh.h and include/uapi/linux/vfio.h.
Though the names are different, their corresponding values are
exactly same. The former is used by EEH core and the latter is
used by userspace.

The patch moves those constants from arch/powerpc/include/asm/eeh.h
to arch/powerpc/include/uapi/asm/eeh.h, which are expected to be
used by userspace from now on. We can't delete those constants in
vfio.h as it's uncertain that those constants have been or will be
used by userspace.

Suggested-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index a52db28..7fd7b54 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -27,6 +27,8 @@
 #include <linux/time.h>
 #include <linux/atomic.h>
 
+#include <uapi/asm/eeh.h>
+
 struct pci_dev;
 struct pci_bus;
 struct pci_dn;
@@ -185,11 +187,6 @@ enum {
 #define EEH_STATE_DMA_ACTIVE	(1 << 4)	/* Active DMA		*/
 #define EEH_STATE_MMIO_ENABLED	(1 << 5)	/* MMIO enabled		*/
 #define EEH_STATE_DMA_ENABLED	(1 << 6)	/* DMA enabled		*/
-#define EEH_PE_STATE_NORMAL		0	/* Normal state		*/
-#define EEH_PE_STATE_RESET		1	/* PE reset asserted	*/
-#define EEH_PE_STATE_STOPPED_IO_DMA	2	/* Frozen PE		*/
-#define EEH_PE_STATE_STOPPED_DMA	4	/* Stopped DMA, Enabled IO */
-#define EEH_PE_STATE_UNAVAIL		5	/* Unavailable		*/
 #define EEH_RESET_DEACTIVATE	0	/* Deactivate the PE reset	*/
 #define EEH_RESET_HOT		1	/* Hot reset			*/
 #define EEH_RESET_FUNDAMENTAL	3	/* Fundamental reset		*/
diff --git a/arch/powerpc/include/uapi/asm/eeh.h b/arch/powerpc/include/uapi/asm/eeh.h
new file mode 100644
index 0000000..8bb34b0
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/eeh.h
@@ -0,0 +1,30 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2015
+ *
+ * Authors: Gavin Shan <gwshan@linux.vnet.ibm.com>
+ */
+
+#ifndef _ASM_POWERPC_EEH_H
+#define _ASM_POWERPC_EEH_H
+
+/* PE states */
+#define EEH_PE_STATE_NORMAL		0	/* Normal state		*/
+#define EEH_PE_STATE_RESET		1	/* PE reset asserted	*/
+#define EEH_PE_STATE_STOPPED_IO_DMA	2	/* Frozen PE		*/
+#define EEH_PE_STATE_STOPPED_DMA	4	/* Stopped DMA only	*/
+#define EEH_PE_STATE_UNAVAIL		5	/* Unavailable		*/
+
+#endif /* _ASM_POWERPC_EEH_H */
-- 
cgit v0.10.2


From ec33d36e5ab5d52d59a8f696f7efb682bfc58494 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Thu, 26 Mar 2015 16:42:08 +1100
Subject: powerpc/eeh: Introduce eeh_pe_inject_err()

The patch defines PCI error types and functions in uapi/asm/eeh.h
and exports function eeh_pe_inject_err(), which will be called by
VFIO driver to inject the specified PCI error to the indicated
PE for testing purpose.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 7fd7b54..c5eb86f 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -291,6 +291,8 @@ int eeh_pe_set_option(struct eeh_pe *pe, int option);
 int eeh_pe_get_state(struct eeh_pe *pe);
 int eeh_pe_reset(struct eeh_pe *pe, int option);
 int eeh_pe_configure(struct eeh_pe *pe);
+int eeh_pe_inject_err(struct eeh_pe *pe, int type, int func,
+		      unsigned long addr, unsigned long mask);
 
 /**
  * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
diff --git a/arch/powerpc/include/uapi/asm/eeh.h b/arch/powerpc/include/uapi/asm/eeh.h
index 8bb34b0..291b7d1 100644
--- a/arch/powerpc/include/uapi/asm/eeh.h
+++ b/arch/powerpc/include/uapi/asm/eeh.h
@@ -27,4 +27,30 @@
 #define EEH_PE_STATE_STOPPED_DMA	4	/* Stopped DMA only	*/
 #define EEH_PE_STATE_UNAVAIL		5	/* Unavailable		*/
 
+/* EEH error types and functions */
+#define EEH_ERR_TYPE_32			0       /* 32-bits error	*/
+#define EEH_ERR_TYPE_64			1       /* 64-bits error	*/
+#define EEH_ERR_FUNC_MIN		0
+#define EEH_ERR_FUNC_LD_MEM_ADDR	0	/* Memory load	*/
+#define EEH_ERR_FUNC_LD_MEM_DATA	1
+#define EEH_ERR_FUNC_LD_IO_ADDR		2	/* IO load	*/
+#define EEH_ERR_FUNC_LD_IO_DATA		3
+#define EEH_ERR_FUNC_LD_CFG_ADDR	4	/* Config load	*/
+#define EEH_ERR_FUNC_LD_CFG_DATA	5
+#define EEH_ERR_FUNC_ST_MEM_ADDR	6	/* Memory store	*/
+#define EEH_ERR_FUNC_ST_MEM_DATA	7
+#define EEH_ERR_FUNC_ST_IO_ADDR		8	/* IO store	*/
+#define EEH_ERR_FUNC_ST_IO_DATA		9
+#define EEH_ERR_FUNC_ST_CFG_ADDR	10	/* Config store	*/
+#define EEH_ERR_FUNC_ST_CFG_DATA	11
+#define EEH_ERR_FUNC_DMA_RD_ADDR	12	/* DMA read	*/
+#define EEH_ERR_FUNC_DMA_RD_DATA	13
+#define EEH_ERR_FUNC_DMA_RD_MASTER	14
+#define EEH_ERR_FUNC_DMA_RD_TARGET	15
+#define EEH_ERR_FUNC_DMA_WR_ADDR	16	/* DMA write	*/
+#define EEH_ERR_FUNC_DMA_WR_DATA	17
+#define EEH_ERR_FUNC_DMA_WR_MASTER	18
+#define EEH_ERR_FUNC_DMA_WR_TARGET	19
+#define EEH_ERR_FUNC_MAX		19
+
 #endif /* _ASM_POWERPC_EEH_H */
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9ee61d1..04b5d94 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1647,6 +1647,41 @@ int eeh_pe_configure(struct eeh_pe *pe)
 }
 EXPORT_SYMBOL_GPL(eeh_pe_configure);
 
+/**
+ * eeh_pe_inject_err - Injecting the specified PCI error to the indicated PE
+ * @pe: the indicated PE
+ * @type: error type
+ * @function: error function
+ * @addr: address
+ * @mask: address mask
+ *
+ * The routine is called to inject the specified PCI error, which
+ * is determined by @type and @function, to the indicated PE for
+ * testing purpose.
+ */
+int eeh_pe_inject_err(struct eeh_pe *pe, int type, int func,
+		      unsigned long addr, unsigned long mask)
+{
+	/* Invalid PE ? */
+	if (!pe)
+		return -ENODEV;
+
+	/* Unsupported operation ? */
+	if (!eeh_ops || !eeh_ops->err_inject)
+		return -ENOENT;
+
+	/* Check on PCI error type */
+	if (type != EEH_ERR_TYPE_32 && type != EEH_ERR_TYPE_64)
+		return -EINVAL;
+
+	/* Check on PCI error function */
+	if (func < EEH_ERR_FUNC_MIN || func > EEH_ERR_FUNC_MAX)
+		return -EINVAL;
+
+	return eeh_ops->err_inject(pe, type, func, addr, mask);
+}
+EXPORT_SYMBOL_GPL(eeh_pe_inject_err);
+
 static int proc_eeh_show(struct seq_file *m, void *v)
 {
 	if (!eeh_enabled()) {
-- 
cgit v0.10.2


From 68cbbc3a9d1fc231810b2490bca73b3b444ef542 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Thu, 26 Mar 2015 16:42:09 +1100
Subject: drivers/vfio: Support EEH error injection

The patch adds one more EEH sub-command (VFIO_EEH_PE_INJECT_ERR)
to inject the specified EEH error, which is represented by
(struct vfio_eeh_pe_err), to the indicated PE for testing purpose.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 96978ec..4c746a7 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -385,6 +385,18 @@ The code flow from the example above should be slightly changed:
 
 	....
 
+	/* Inject EEH error, which is expected to be caused by 32-bits
+	 * config load.
+	 */
+	pe_op.op = VFIO_EEH_PE_INJECT_ERR;
+	pe_op.err.type = EEH_ERR_TYPE_32;
+	pe_op.err.func = EEH_ERR_FUNC_LD_CFG_ADDR;
+	pe_op.err.addr = 0ul;
+	pe_op.err.mask = 0ul;
+	ioctl(container, VFIO_EEH_PE_OP, &pe_op);
+
+	....
+
 	/* When 0xFF's returned from reading PCI config space or IO BARs
 	 * of the PCI device. Check the PE's state to see if that has been
 	 * frozen.
diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c
index 5fa42db..38edeb4 100644
--- a/drivers/vfio/vfio_spapr_eeh.c
+++ b/drivers/vfio/vfio_spapr_eeh.c
@@ -85,6 +85,16 @@ long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
 		case VFIO_EEH_PE_CONFIGURE:
 			ret = eeh_pe_configure(pe);
 			break;
+		case VFIO_EEH_PE_INJECT_ERR:
+			minsz = offsetofend(struct vfio_eeh_pe_op, err.mask);
+			if (op.argsz < minsz)
+				return -EINVAL;
+			if (copy_from_user(&op, (void __user *)arg, minsz))
+				return -EFAULT;
+
+			ret = eeh_pe_inject_err(pe, op.err.type, op.err.func,
+						op.err.addr, op.err.mask);
+			break;
 		default:
 			ret = -EINVAL;
 		}
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index b57b750..e4fa199 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -470,12 +470,23 @@ struct vfio_iommu_spapr_tce_info {
  * - unfreeze IO/DMA for frozen PE;
  * - read PE state;
  * - reset PE;
- * - configure PE.
+ * - configure PE;
+ * - inject EEH error.
  */
+struct vfio_eeh_pe_err {
+	__u32 type;
+	__u32 func;
+	__u64 addr;
+	__u64 mask;
+};
+
 struct vfio_eeh_pe_op {
 	__u32 argsz;
 	__u32 flags;
 	__u32 op;
+	union {
+		struct vfio_eeh_pe_err err;
+	};
 };
 
 #define VFIO_EEH_PE_DISABLE		0	/* Disable EEH functionality */
@@ -492,6 +503,7 @@ struct vfio_eeh_pe_op {
 #define VFIO_EEH_PE_RESET_HOT		6	/* Assert hot reset          */
 #define VFIO_EEH_PE_RESET_FUNDAMENTAL	7	/* Assert fundamental reset  */
 #define VFIO_EEH_PE_CONFIGURE		8	/* PE configuration          */
+#define VFIO_EEH_PE_INJECT_ERR		9	/* Inject EEH error          */
 
 #define VFIO_EEH_PE_OP			_IO(VFIO_TYPE, VFIO_BASE + 21)
 
-- 
cgit v0.10.2


From 3721352990ba5eafa7e92a1c9cf86af3a46de151 Mon Sep 17 00:00:00 2001
From: Wei Yang <weiyang@linux.vnet.ibm.com>
Date: Mon, 27 Apr 2015 09:25:09 +0800
Subject: powerpc/eeh: fix start/end/flags type in struct pci_io_addr_range{}

struct pci_io_addr_range{} stores the information of pci resources. It
would be better to keep these related fields have the same type as in
struct resource{}.

This patch fixes the start/end/flags type in struct pci_io_addr_range{} to
have the same type as in struct resource{}.

Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
Acked-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
index eeabeab..a1e86e1 100644
--- a/arch/powerpc/kernel/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -48,11 +48,11 @@
  */
 struct pci_io_addr_range {
 	struct rb_node rb_node;
-	unsigned long addr_lo;
-	unsigned long addr_hi;
+	resource_size_t addr_lo;
+	resource_size_t addr_hi;
 	struct eeh_dev *edev;
 	struct pci_dev *pcidev;
-	unsigned int flags;
+	unsigned long flags;
 };
 
 static struct pci_io_addr_cache {
@@ -125,8 +125,8 @@ static void eeh_addr_cache_print(struct pci_io_addr_cache *cache)
 
 /* Insert address range into the rb tree. */
 static struct pci_io_addr_range *
-eeh_addr_cache_insert(struct pci_dev *dev, unsigned long alo,
-		      unsigned long ahi, unsigned int flags)
+eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo,
+		      resource_size_t ahi, unsigned long flags)
 {
 	struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node;
 	struct rb_node *parent = NULL;
@@ -197,9 +197,9 @@ static void __eeh_addr_cache_insert_dev(struct pci_dev *dev)
 
 	/* Walk resources on this device, poke them into the tree */
 	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
-		unsigned long start = pci_resource_start(dev,i);
-		unsigned long end = pci_resource_end(dev,i);
-		unsigned int flags = pci_resource_flags(dev,i);
+		resource_size_t start = pci_resource_start(dev,i);
+		resource_size_t end = pci_resource_end(dev,i);
+		unsigned long flags = pci_resource_flags(dev,i);
 
 		/* We are interested only bus addresses, not dma or other stuff */
 		if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM)))
-- 
cgit v0.10.2


From 2ac3990cc36b1e42feca733b25254fb6dae15431 Mon Sep 17 00:00:00 2001
From: Wei Yang <weiyang@linux.vnet.ibm.com>
Date: Mon, 27 Apr 2015 09:25:10 +0800
Subject: powerpc/eeh: fix comment for wait_state()

To retrieve the PCI slot state, EEH driver would set a timeout for that.
While current comment is not aligned to what the code does.

This patch fixes those comments according to the code.

Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
Acked-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 24768ff..89eb4bc 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -660,7 +660,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
 	eeh_pe_dev_traverse(pe, eeh_report_error, &result);
 
 	/* Get the current PCI slot state. This can take a long time,
-	 * sometimes over 3 seconds for certain systems.
+	 * sometimes over 300 seconds for certain systems.
 	 */
 	rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
 	if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index ce738ab..d025471 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -979,7 +979,7 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
 /**
  * pnv_eeh_wait_state - Wait for PE state
  * @pe: EEH PE
- * @max_wait: maximal period in microsecond
+ * @max_wait: maximal period in millisecond
  *
  * Wait for the state of associated PE. It might take some time
  * to retrieve the PE's state.
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 2039397..1ba55d0 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -519,7 +519,7 @@ static int pseries_eeh_reset(struct eeh_pe *pe, int option)
 /**
  * pseries_eeh_wait_state - Wait for PE state
  * @pe: EEH PE
- * @max_wait: maximal period in microsecond
+ * @max_wait: maximal period in millisecond
  *
  * Wait for the state of associated PE. It might take some time
  * to retrieve the PE's state.
-- 
cgit v0.10.2


From e17866d5593dc6ea7bff9ee56751fae6c3f56223 Mon Sep 17 00:00:00 2001
From: Wei Yang <weiyang@linux.vnet.ibm.com>
Date: Mon, 27 Apr 2015 09:25:11 +0800
Subject: powerpc/eeh: fix powernv_eeh_wait_state delay logic

As the comment indicates, powernv_eeh_get_state() will inform EEH core to
delay 1 second. This means the delay doesn't happen when
powernv_eeh_get_state() returns.

This patch moves the delay subtraction just before msleep(), which is the
same logic in pseries_eeh_wait_state().

Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
Acked-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index d025471..622f08c 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1000,13 +1000,13 @@ static int pnv_eeh_wait_state(struct eeh_pe *pe, int max_wait)
 		if (ret != EEH_STATE_UNAVAILABLE)
 			return ret;
 
-		max_wait -= mwait;
 		if (max_wait <= 0) {
 			pr_warn("%s: Timeout getting PE#%x's state (%d)\n",
 				__func__, pe->addr, max_wait);
 			return EEH_STATE_NOT_SUPPORT;
 		}
 
+		max_wait -= mwait;
 		msleep(mwait);
 	}
 
-- 
cgit v0.10.2


From f77ceb717d0955f1df20601683ea55675101bab6 Mon Sep 17 00:00:00 2001
From: Wei Yang <weiyang@linux.vnet.ibm.com>
Date: Mon, 27 Apr 2015 09:25:12 +0800
Subject: powerpc/eeh: remove unused macro IS_BRIDGE

Currently, the macro IS_BRIDGE is not used any where.
This patch just removes it.

Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com>
Acked-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 04b5d94..23e0aa7 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -144,8 +144,6 @@ struct eeh_stats {
 
 static struct eeh_stats eeh_stats;
 
-#define IS_BRIDGE(class_code) (((class_code)<<16) == PCI_BASE_CLASS_BRIDGE)
-
 static int __init eeh_setup(char *str)
 {
 	if (!strcmp(str, "off"))
-- 
cgit v0.10.2


From d4d4add9eacf0a3e642228a22df722f00921a184 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 22 Apr 2015 15:36:50 +1000
Subject: powerpc: Little endian should depend on PPC_BOOK3S_64

The only little endian configuration we support is ppc64le, all other
configurations are big endian.

So we should only offer a choice of endian if we're building for 64-bit
Book3S, ie. PPC_BOOK3S_64.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 724ecc7..c140e94 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -431,6 +431,7 @@ config CPU_BIG_ENDIAN
 
 config CPU_LITTLE_ENDIAN
 	bool "Build little endian kernel"
+	depends on PPC_BOOK3S_64
 	select PPC64_BOOT_WRAPPER
 	help
 	  Build a little endian kernel.
-- 
cgit v0.10.2


From 7978f76c4495c194183e03f65406f78cdda37882 Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Date: Tue, 5 May 2015 17:30:21 +0200
Subject: powerpc: Enable sys_kcmp() for CRIU

The commit 8170a83f15ee ("powerpc: Wireup the kcmp syscall to sys_ni") has
disabled the kcmp syscall for powerpc.  This has been done due to the use
of unsigned long parameters which may require a dedicated wrapper to handle
32bit process on top of 64bit kernel.  However in the kcmp() case, the 2
unsigned long parameters are currently only used to carry file descriptors
from user space to the kernel.  Since such a parameter is passed through
register, and file descriptor doesn't need to get extended, there is,
today, no need for a wrapper.

In the case there will be a need to pass address in or out of this system
call, then a wrapper could be required, it will then be to care of it.

As today this is not the case, it is safe to enable kcmp() on powerpc.

Tested (by Laurent) on 64-bit, 32-bit, and 32-bit userspace on 64-bit
kernel using tools/testing/selftests/kcmp [mpe].

Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index f1863a1..71f2b3f 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -358,7 +358,7 @@ SYSCALL_SPU(setns)
 COMPAT_SYS(process_vm_readv)
 COMPAT_SYS(process_vm_writev)
 SYSCALL(finit_module)
-SYSCALL(ni_syscall) /* sys_kcmp */
+SYSCALL(kcmp) /* sys_kcmp */
 SYSCALL_SPU(sched_setattr)
 SYSCALL_SPU(sched_getattr)
 SYSCALL_SPU(renameat2)
-- 
cgit v0.10.2


From e602ffb2fa6b5533acd6847bd19a135e14ed4c54 Mon Sep 17 00:00:00 2001
From: "Shreyas B. Prabhu" <shreyas@linux.vnet.ibm.com>
Date: Mon, 20 Apr 2015 10:32:56 +0530
Subject: powerpc: Fix cpu_online_cores_map to return only online threads mask

Currently, cpu_online_cores_map returns a mask, which for every core with
at least one online thread, has the bit for thread 0 of the core set to 1,
and the bits for all other threads of the core set to 0. But thread 0 of
the core itself may not be online always. In such cases, if the returned
mask is used for IPI, then it'll cause IPIs to be skipped on cores where
the first thread is offline, because the IPI code refuses to send IPIs to
offline threads.

Fix this by setting the bit of the first online thread in the core.
This is done by fixing this in the underlying function
cpu_thread_mask_to_cores.

The result has the property that for all cores with online threads, there
is one bit set in the returned map. And further, all bits that are set in
the returned map correspond to online threads.

Signed-off-by: Shreyas B. Prabhu <shreyas@linux.vnet.ibm.com>
Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
[ Changelog from Michael Ellerman <mpe@ellerman.id.au> ]
Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/cputhreads.h b/arch/powerpc/include/asm/cputhreads.h
index 5be6c47..ba42e46 100644
--- a/arch/powerpc/include/asm/cputhreads.h
+++ b/arch/powerpc/include/asm/cputhreads.h
@@ -31,9 +31,9 @@ extern cpumask_t threads_core_mask;
 /* cpu_thread_mask_to_cores - Return a cpumask of one per cores
  *                            hit by the argument
  *
- * @threads:	a cpumask of threads
+ * @threads:	a cpumask of online threads
  *
- * This function returns a cpumask which will have one "cpu" (or thread)
+ * This function returns a cpumask which will have one online cpu's
  * bit set for each core that has at least one thread set in the argument.
  *
  * This can typically be used for things like IPI for tlb invalidations
@@ -42,13 +42,16 @@ extern cpumask_t threads_core_mask;
 static inline cpumask_t cpu_thread_mask_to_cores(const struct cpumask *threads)
 {
 	cpumask_t	tmp, res;
-	int		i;
+	int		i, cpu;
 
 	cpumask_clear(&res);
 	for (i = 0; i < NR_CPUS; i += threads_per_core) {
 		cpumask_shift_left(&tmp, &threads_core_mask, i);
-		if (cpumask_intersects(threads, &tmp))
-			cpumask_set_cpu(i, &res);
+		if (cpumask_intersects(threads, &tmp)) {
+			cpu = cpumask_next_and(-1, &tmp, cpu_online_mask);
+			if (cpu < nr_cpu_ids)
+				cpumask_set_cpu(cpu, &res);
+		}
 	}
 	return res;
 }
-- 
cgit v0.10.2


From d405a98c70ebbaa6ef276d7307d034a682321b95 Mon Sep 17 00:00:00 2001
From: "Shreyas B. Prabhu" <shreyas@linux.vnet.ibm.com>
Date: Mon, 20 Apr 2015 10:32:57 +0530
Subject: powerpc/powernv: Move cpuidle related code from setup.c to new file

This is a cleanup patch; doesn't change any functionality. Moves
all cpuidle related code from setup.c to a new file.

Signed-off-by: Shreyas B. Prabhu <shreyas@linux.vnet.ibm.com>
Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
[mpe: Fix the SMP=n build by including asm/smp.h in idle.c]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 33e44f3..bee9235 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -1,4 +1,4 @@
-obj-y			+= setup.o opal-wrappers.o opal.o opal-async.o
+obj-y			+= setup.o opal-wrappers.o opal.o opal-async.o idle.o
 obj-y			+= opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
 obj-y			+= rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
 obj-y			+= opal-msglog.o opal-hmi.o opal-power.o
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
new file mode 100644
index 0000000..5c79932
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -0,0 +1,192 @@
+/*
+ * PowerNV cpuidle code
+ *
+ * Copyright 2015 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+
+#include <asm/firmware.h>
+#include <asm/opal.h>
+#include <asm/cputhreads.h>
+#include <asm/cpuidle.h>
+#include <asm/code-patching.h>
+#include <asm/smp.h>
+
+#include "powernv.h"
+#include "subcore.h"
+
+static u32 supported_cpuidle_states;
+
+int pnv_save_sprs_for_winkle(void)
+{
+	int cpu;
+	int rc;
+
+	/*
+	 * hid0, hid1, hid4, hid5, hmeer and lpcr values are symmetric accross
+	 * all cpus at boot. Get these reg values of current cpu and use the
+	 * same accross all cpus.
+	 */
+	uint64_t lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1;
+	uint64_t hid0_val = mfspr(SPRN_HID0);
+	uint64_t hid1_val = mfspr(SPRN_HID1);
+	uint64_t hid4_val = mfspr(SPRN_HID4);
+	uint64_t hid5_val = mfspr(SPRN_HID5);
+	uint64_t hmeer_val = mfspr(SPRN_HMEER);
+
+	for_each_possible_cpu(cpu) {
+		uint64_t pir = get_hard_smp_processor_id(cpu);
+		uint64_t hsprg0_val = (uint64_t)&paca[cpu];
+
+		/*
+		 * HSPRG0 is used to store the cpu's pointer to paca. Hence last
+		 * 3 bits are guaranteed to be 0. Program slw to restore HSPRG0
+		 * with 63rd bit set, so that when a thread wakes up at 0x100 we
+		 * can use this bit to distinguish between fastsleep and
+		 * deep winkle.
+		 */
+		hsprg0_val |= 1;
+
+		rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val);
+		if (rc != 0)
+			return rc;
+
+		rc = opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val);
+		if (rc != 0)
+			return rc;
+
+		/* HIDs are per core registers */
+		if (cpu_thread_in_core(cpu) == 0) {
+
+			rc = opal_slw_set_reg(pir, SPRN_HMEER, hmeer_val);
+			if (rc != 0)
+				return rc;
+
+			rc = opal_slw_set_reg(pir, SPRN_HID0, hid0_val);
+			if (rc != 0)
+				return rc;
+
+			rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val);
+			if (rc != 0)
+				return rc;
+
+			rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val);
+			if (rc != 0)
+				return rc;
+
+			rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val);
+			if (rc != 0)
+				return rc;
+		}
+	}
+
+	return 0;
+}
+
+static void pnv_alloc_idle_core_states(void)
+{
+	int i, j;
+	int nr_cores = cpu_nr_cores();
+	u32 *core_idle_state;
+
+	/*
+	 * core_idle_state - First 8 bits track the idle state of each thread
+	 * of the core. The 8th bit is the lock bit. Initially all thread bits
+	 * are set. They are cleared when the thread enters deep idle state
+	 * like sleep and winkle. Initially the lock bit is cleared.
+	 * The lock bit has 2 purposes
+	 * a. While the first thread is restoring core state, it prevents
+	 * other threads in the core from switching to process context.
+	 * b. While the last thread in the core is saving the core state, it
+	 * prevents a different thread from waking up.
+	 */
+	for (i = 0; i < nr_cores; i++) {
+		int first_cpu = i * threads_per_core;
+		int node = cpu_to_node(first_cpu);
+
+		core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
+		*core_idle_state = PNV_CORE_IDLE_THREAD_BITS;
+
+		for (j = 0; j < threads_per_core; j++) {
+			int cpu = first_cpu + j;
+
+			paca[cpu].core_idle_state_ptr = core_idle_state;
+			paca[cpu].thread_idle_state = PNV_THREAD_RUNNING;
+			paca[cpu].thread_mask = 1 << j;
+		}
+	}
+
+	update_subcore_sibling_mask();
+
+	if (supported_cpuidle_states & OPAL_PM_WINKLE_ENABLED)
+		pnv_save_sprs_for_winkle();
+}
+
+u32 pnv_get_supported_cpuidle_states(void)
+{
+	return supported_cpuidle_states;
+}
+EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states);
+
+static int __init pnv_init_idle_states(void)
+{
+	struct device_node *power_mgt;
+	int dt_idle_states;
+	u32 *flags;
+	int i;
+
+	supported_cpuidle_states = 0;
+
+	if (cpuidle_disable != IDLE_NO_OVERRIDE)
+		goto out;
+
+	if (!firmware_has_feature(FW_FEATURE_OPALv3))
+		goto out;
+
+	power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
+	if (!power_mgt) {
+		pr_warn("opal: PowerMgmt Node not found\n");
+		goto out;
+	}
+	dt_idle_states = of_property_count_u32_elems(power_mgt,
+			"ibm,cpu-idle-state-flags");
+	if (dt_idle_states < 0) {
+		pr_warn("cpuidle-powernv: no idle states found in the DT\n");
+		goto out;
+	}
+
+	flags = kzalloc(sizeof(*flags) * dt_idle_states, GFP_KERNEL);
+	if (of_property_read_u32_array(power_mgt,
+			"ibm,cpu-idle-state-flags", flags, dt_idle_states)) {
+		pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-flags in DT\n");
+		goto out_free;
+	}
+
+	for (i = 0; i < dt_idle_states; i++)
+		supported_cpuidle_states |= flags[i];
+
+	if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
+		patch_instruction(
+			(unsigned int *)pnv_fastsleep_workaround_at_entry,
+			PPC_INST_NOP);
+		patch_instruction(
+			(unsigned int *)pnv_fastsleep_workaround_at_exit,
+			PPC_INST_NOP);
+	}
+	pnv_alloc_idle_core_states();
+out_free:
+	kfree(flags);
+out:
+	return 0;
+}
+
+subsys_initcall(pnv_init_idle_states);
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 16fdcb2..509cdd2 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -35,12 +35,8 @@
 #include <asm/opal.h>
 #include <asm/kexec.h>
 #include <asm/smp.h>
-#include <asm/cputhreads.h>
-#include <asm/cpuidle.h>
-#include <asm/code-patching.h>
 
 #include "powernv.h"
-#include "subcore.h"
 
 static void __init pnv_setup_arch(void)
 {
@@ -277,173 +273,6 @@ static void __init pnv_setup_machdep_opal(void)
 	ppc_md.handle_hmi_exception = opal_handle_hmi_exception;
 }
 
-static u32 supported_cpuidle_states;
-
-int pnv_save_sprs_for_winkle(void)
-{
-	int cpu;
-	int rc;
-
-	/*
-	 * hid0, hid1, hid4, hid5, hmeer and lpcr values are symmetric accross
-	 * all cpus at boot. Get these reg values of current cpu and use the
-	 * same accross all cpus.
-	 */
-	uint64_t lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1;
-	uint64_t hid0_val = mfspr(SPRN_HID0);
-	uint64_t hid1_val = mfspr(SPRN_HID1);
-	uint64_t hid4_val = mfspr(SPRN_HID4);
-	uint64_t hid5_val = mfspr(SPRN_HID5);
-	uint64_t hmeer_val = mfspr(SPRN_HMEER);
-
-	for_each_possible_cpu(cpu) {
-		uint64_t pir = get_hard_smp_processor_id(cpu);
-		uint64_t hsprg0_val = (uint64_t)&paca[cpu];
-
-		/*
-		 * HSPRG0 is used to store the cpu's pointer to paca. Hence last
-		 * 3 bits are guaranteed to be 0. Program slw to restore HSPRG0
-		 * with 63rd bit set, so that when a thread wakes up at 0x100 we
-		 * can use this bit to distinguish between fastsleep and
-		 * deep winkle.
-		 */
-		hsprg0_val |= 1;
-
-		rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val);
-		if (rc != 0)
-			return rc;
-
-		rc = opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val);
-		if (rc != 0)
-			return rc;
-
-		/* HIDs are per core registers */
-		if (cpu_thread_in_core(cpu) == 0) {
-
-			rc = opal_slw_set_reg(pir, SPRN_HMEER, hmeer_val);
-			if (rc != 0)
-				return rc;
-
-			rc = opal_slw_set_reg(pir, SPRN_HID0, hid0_val);
-			if (rc != 0)
-				return rc;
-
-			rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val);
-			if (rc != 0)
-				return rc;
-
-			rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val);
-			if (rc != 0)
-				return rc;
-
-			rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val);
-			if (rc != 0)
-				return rc;
-		}
-	}
-
-	return 0;
-}
-
-static void pnv_alloc_idle_core_states(void)
-{
-	int i, j;
-	int nr_cores = cpu_nr_cores();
-	u32 *core_idle_state;
-
-	/*
-	 * core_idle_state - First 8 bits track the idle state of each thread
-	 * of the core. The 8th bit is the lock bit. Initially all thread bits
-	 * are set. They are cleared when the thread enters deep idle state
-	 * like sleep and winkle. Initially the lock bit is cleared.
-	 * The lock bit has 2 purposes
-	 * a. While the first thread is restoring core state, it prevents
-	 * other threads in the core from switching to process context.
-	 * b. While the last thread in the core is saving the core state, it
-	 * prevents a different thread from waking up.
-	 */
-	for (i = 0; i < nr_cores; i++) {
-		int first_cpu = i * threads_per_core;
-		int node = cpu_to_node(first_cpu);
-
-		core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
-		*core_idle_state = PNV_CORE_IDLE_THREAD_BITS;
-
-		for (j = 0; j < threads_per_core; j++) {
-			int cpu = first_cpu + j;
-
-			paca[cpu].core_idle_state_ptr = core_idle_state;
-			paca[cpu].thread_idle_state = PNV_THREAD_RUNNING;
-			paca[cpu].thread_mask = 1 << j;
-		}
-	}
-
-	update_subcore_sibling_mask();
-
-	if (supported_cpuidle_states & OPAL_PM_WINKLE_ENABLED)
-		pnv_save_sprs_for_winkle();
-}
-
-u32 pnv_get_supported_cpuidle_states(void)
-{
-	return supported_cpuidle_states;
-}
-EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states);
-
-static int __init pnv_init_idle_states(void)
-{
-	struct device_node *power_mgt;
-	int dt_idle_states;
-	u32 *flags;
-	int i;
-
-	supported_cpuidle_states = 0;
-
-	if (cpuidle_disable != IDLE_NO_OVERRIDE)
-		goto out;
-
-	if (!firmware_has_feature(FW_FEATURE_OPALv3))
-		goto out;
-
-	power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
-	if (!power_mgt) {
-		pr_warn("opal: PowerMgmt Node not found\n");
-		goto out;
-	}
-	dt_idle_states = of_property_count_u32_elems(power_mgt,
-			"ibm,cpu-idle-state-flags");
-	if (dt_idle_states < 0) {
-		pr_warn("cpuidle-powernv: no idle states found in the DT\n");
-		goto out;
-	}
-
-	flags = kzalloc(sizeof(*flags) * dt_idle_states, GFP_KERNEL);
-	if (of_property_read_u32_array(power_mgt,
-			"ibm,cpu-idle-state-flags", flags, dt_idle_states)) {
-		pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-flags in DT\n");
-		goto out_free;
-	}
-
-	for (i = 0; i < dt_idle_states; i++)
-		supported_cpuidle_states |= flags[i];
-
-	if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
-		patch_instruction(
-			(unsigned int *)pnv_fastsleep_workaround_at_entry,
-			PPC_INST_NOP);
-		patch_instruction(
-			(unsigned int *)pnv_fastsleep_workaround_at_exit,
-			PPC_INST_NOP);
-	}
-	pnv_alloc_idle_core_states();
-out_free:
-	kfree(flags);
-out:
-	return 0;
-}
-
-subsys_initcall(pnv_init_idle_states);
-
 static int __init pnv_probe(void)
 {
 	unsigned long root = of_get_flat_dt_root();
-- 
cgit v0.10.2


From 5703d2f4a1da6d23b3be896947ce255226fc4295 Mon Sep 17 00:00:00 2001
From: "Shreyas B. Prabhu" <shreyas@linux.vnet.ibm.com>
Date: Mon, 20 Apr 2015 10:32:58 +0530
Subject: powerpc/powernv: Introduce sysfs control for fastsleep workaround
 behavior

Fastsleep is one of the idle state which cpuidle subsystem currently
uses on power8 machines. In this state L2 cache is brought down to a
threshold voltage. Therefore when the core is in fastsleep, the
communication between L2 and L3 needs to be fenced. But there is a bug
in the current power8 chips surrounding this fencing.

OPAL provides a workaround which precludes the possibility of hitting
this bug. But running with this workaround applied causes checkstop
if any correctable error in L2 cache directory is detected. Hence OPAL
also provides a way to undo the workaround.

In the existing implementation, workaround is applied by the last thread
of the core entering fastsleep and undone by the first thread waking up.
But this has a performance cost. These OPAL calls account for roughly
4000 cycles everytime the core has to enter or wakeup from fastsleep.

This patch introduces a sysfs attribute (fastsleep_workaround_applyonce)
to choose the behavior of this workaround.

By default, fastsleep_workaround_applyonce = 0. In this case, workaround
is applied/undone everytime the core enters/exits fastsleep.

fastsleep_workaround_applyonce = 1. In this case the workaround is
applied once on all the cores and never undone. This can be triggered by
echo 1 > /sys/devices/system/cpu/fastsleep_workaround_applyonce

For simplicity this attribute can be modified only once. Implying, once
fastsleep_workaround_applyonce is changed to 1, it cannot be reverted
to the default state.

Signed-off-by: Shreyas B. Prabhu <shreyas@linux.vnet.ibm.com>
Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index 0321a90..a49e5fa 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -165,6 +165,13 @@
 #define OPAL_PM_WINKLE_ENABLED		0x00040000
 #define OPAL_PM_SLEEP_ENABLED_ER1	0x00080000 /* with workaround */
 
+/*
+ * OPAL_CONFIG_CPU_IDLE_STATE parameters
+ */
+#define OPAL_CONFIG_IDLE_FASTSLEEP	1
+#define OPAL_CONFIG_IDLE_UNDO		0
+#define OPAL_CONFIG_IDLE_APPLY		1
+
 #ifndef __ASSEMBLY__
 
 /* Other enums */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 042af1a..9a47813 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -186,6 +186,7 @@ int64_t opal_handle_hmi(void);
 int64_t opal_register_dump_region(uint32_t id, uint64_t start, uint64_t end);
 int64_t opal_unregister_dump_region(uint32_t id);
 int64_t opal_slw_set_reg(uint64_t cpu_pir, uint64_t sprn, uint64_t val);
+int64_t opal_config_cpu_idle_state(uint64_t state, uint64_t flag);
 int64_t opal_pci_set_phb_cxl_mode(uint64_t phb_id, uint64_t mode, uint64_t pe_number);
 int64_t opal_ipmi_send(uint64_t interface, struct opal_ipmi_msg *msg,
 		uint64_t msg_len);
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 5c79932..bd39a12 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -13,6 +13,8 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/of.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
 
 #include <asm/firmware.h>
 #include <asm/opal.h>
@@ -137,6 +139,96 @@ u32 pnv_get_supported_cpuidle_states(void)
 }
 EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states);
 
+
+static void pnv_fastsleep_workaround_apply(void *info)
+
+{
+	int rc;
+	int *err = info;
+
+	rc = opal_config_cpu_idle_state(OPAL_CONFIG_IDLE_FASTSLEEP,
+					OPAL_CONFIG_IDLE_APPLY);
+	if (rc)
+		*err = 1;
+}
+
+/*
+ * Used to store fastsleep workaround state
+ * 0 - Workaround applied/undone at fastsleep entry/exit path (Default)
+ * 1 - Workaround applied once, never undone.
+ */
+static u8 fastsleep_workaround_applyonce;
+
+static ssize_t show_fastsleep_workaround_applyonce(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", fastsleep_workaround_applyonce);
+}
+
+static ssize_t store_fastsleep_workaround_applyonce(struct device *dev,
+		struct device_attribute *attr, const char *buf,
+		size_t count)
+{
+	cpumask_t primary_thread_mask;
+	int err;
+	u8 val;
+
+	if (kstrtou8(buf, 0, &val) || val != 1)
+		return -EINVAL;
+
+	if (fastsleep_workaround_applyonce == 1)
+		return count;
+
+	/*
+	 * fastsleep_workaround_applyonce = 1 implies
+	 * fastsleep workaround needs to be left in 'applied' state on all
+	 * the cores. Do this by-
+	 * 1. Patching out the call to 'undo' workaround in fastsleep exit path
+	 * 2. Sending ipi to all the cores which have atleast one online thread
+	 * 3. Patching out the call to 'apply' workaround in fastsleep entry
+	 * path
+	 * There is no need to send ipi to cores which have all threads
+	 * offlined, as last thread of the core entering fastsleep or deeper
+	 * state would have applied workaround.
+	 */
+	err = patch_instruction(
+		(unsigned int *)pnv_fastsleep_workaround_at_exit,
+		PPC_INST_NOP);
+	if (err) {
+		pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_exit");
+		goto fail;
+	}
+
+	get_online_cpus();
+	primary_thread_mask = cpu_online_cores_map();
+	on_each_cpu_mask(&primary_thread_mask,
+				pnv_fastsleep_workaround_apply,
+				&err, 1);
+	put_online_cpus();
+	if (err) {
+		pr_err("fastsleep_workaround_applyonce change failed while running pnv_fastsleep_workaround_apply");
+		goto fail;
+	}
+
+	err = patch_instruction(
+		(unsigned int *)pnv_fastsleep_workaround_at_entry,
+		PPC_INST_NOP);
+	if (err) {
+		pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_entry");
+		goto fail;
+	}
+
+	fastsleep_workaround_applyonce = 1;
+
+	return count;
+fail:
+	return -EIO;
+}
+
+static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600,
+			show_fastsleep_workaround_applyonce,
+			store_fastsleep_workaround_applyonce);
+
 static int __init pnv_init_idle_states(void)
 {
 	struct device_node *power_mgt;
@@ -181,7 +273,16 @@ static int __init pnv_init_idle_states(void)
 		patch_instruction(
 			(unsigned int *)pnv_fastsleep_workaround_at_exit,
 			PPC_INST_NOP);
+	} else {
+		/*
+		 * OPAL_PM_SLEEP_ENABLED_ER1 is set. It indicates that
+		 * workaround is needed to use fastsleep. Provide sysfs
+		 * control to choose how this workaround has to be applied.
+		 */
+		device_create_file(cpu_subsys.dev_root,
+				&dev_attr_fastsleep_workaround_applyonce);
 	}
+
 	pnv_alloc_idle_core_states();
 out_free:
 	kfree(flags);
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index a7ade94..bf15ead 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -283,6 +283,7 @@ OPAL_CALL(opal_sensor_read,			OPAL_SENSOR_READ);
 OPAL_CALL(opal_get_param,			OPAL_GET_PARAM);
 OPAL_CALL(opal_set_param,			OPAL_SET_PARAM);
 OPAL_CALL(opal_handle_hmi,			OPAL_HANDLE_HMI);
+OPAL_CALL(opal_config_cpu_idle_state,		OPAL_CONFIG_CPU_IDLE_STATE);
 OPAL_CALL(opal_slw_set_reg,			OPAL_SLW_SET_REG);
 OPAL_CALL(opal_register_dump_region,		OPAL_REGISTER_DUMP_REGION);
 OPAL_CALL(opal_unregister_dump_region,		OPAL_UNREGISTER_DUMP_REGION);
-- 
cgit v0.10.2


From 96e023e7534c16ab54e236c114340e2447c36d2f Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Fri, 15 May 2015 14:06:36 +1000
Subject: powerpc/powernv: Reorder OPAL subsystem initialisation

Most of the OPAL subsystems are always compiled in for PowerNV and
many of them need to be initialised before or after other OPAL
subsystems. Rather than trying to control this ordering through
machine initcalls it is clearer and easier to control initialisation
order with explicit calls in opal_init.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Cc: Mahesh Jagannath Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 9a47813..dcdf83c 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -240,6 +240,9 @@ extern int opal_elog_init(void);
 extern void opal_platform_dump_init(void);
 extern void opal_sys_param_init(void);
 extern void opal_msglog_init(void);
+extern int opal_async_comp_init(void);
+extern int opal_sensor_init(void);
+extern int opal_hmi_handler_init(void);
 
 extern int opal_machine_check(struct pt_regs *regs);
 extern bool opal_mce_check_early_recovery(struct pt_regs *regs);
diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c
index 693b6cd..bdc8c0c 100644
--- a/arch/powerpc/platforms/powernv/opal-async.c
+++ b/arch/powerpc/platforms/powernv/opal-async.c
@@ -151,7 +151,7 @@ static struct notifier_block opal_async_comp_nb = {
 		.priority	= 0,
 };
 
-static int __init opal_async_comp_init(void)
+int __init opal_async_comp_init(void)
 {
 	struct device_node *opal_node;
 	const __be32 *async;
@@ -205,4 +205,3 @@ out_opal_node:
 out:
 	return err;
 }
-machine_subsys_initcall(powernv, opal_async_comp_init);
diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c
index b322bfb..a8f49d3 100644
--- a/arch/powerpc/platforms/powernv/opal-hmi.c
+++ b/arch/powerpc/platforms/powernv/opal-hmi.c
@@ -170,7 +170,7 @@ static struct notifier_block opal_hmi_handler_nb = {
 	.priority	= 0,
 };
 
-static int __init opal_hmi_handler_init(void)
+int __init opal_hmi_handler_init(void)
 {
 	int ret;
 
@@ -186,4 +186,3 @@ static int __init opal_hmi_handler_init(void)
 	}
 	return 0;
 }
-machine_subsys_initcall(powernv, opal_hmi_handler_init);
diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c
index 43db213..00a2943 100644
--- a/arch/powerpc/platforms/powernv/opal-memory-errors.c
+++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c
@@ -144,4 +144,4 @@ static int __init opal_mem_err_init(void)
 	}
 	return 0;
 }
-machine_subsys_initcall(powernv, opal_mem_err_init);
+machine_device_initcall(powernv, opal_mem_err_init);
diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c
index 6552504..a06059d 100644
--- a/arch/powerpc/platforms/powernv/opal-sensor.c
+++ b/arch/powerpc/platforms/powernv/opal-sensor.c
@@ -77,7 +77,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(opal_get_sensor_data);
 
-static __init int opal_sensor_init(void)
+int __init opal_sensor_init(void)
 {
 	struct platform_device *pdev;
 	struct device_node *sensor;
@@ -93,4 +93,3 @@ static __init int opal_sensor_init(void)
 
 	return PTR_ERR_OR_ZERO(pdev);
 }
-machine_subsys_initcall(powernv, opal_sensor_init);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 2241565..eb3decc 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -393,7 +393,6 @@ static int __init opal_message_init(void)
 	}
 	return 0;
 }
-machine_early_initcall(powernv, opal_message_init);
 
 int opal_get_chars(uint32_t vtermno, char *buf, int count)
 {
@@ -807,6 +806,18 @@ static int __init opal_init(void)
 		of_node_put(consoles);
 	}
 
+	/* Initialise OPAL messaging system */
+	opal_message_init();
+
+	/* Initialise OPAL asynchronous completion interface */
+	opal_async_comp_init();
+
+	/* Initialise OPAL sensor interface */
+	opal_sensor_init();
+
+	/* Initialise OPAL hypervisor maintainence interrupt handling */
+	opal_hmi_handler_init();
+
 	/* Create i2c platform devices */
 	opal_i2c_create_devs();
 
-- 
cgit v0.10.2


From 9f0fd0499d30dbd61632463f293e2e826fa363b1 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Fri, 15 May 2015 14:06:37 +1000
Subject: powerpc/powernv: Add a virtual irqchip for opal events

Whenever an interrupt is received for opal the linux kernel gets a
bitfield indicating certain events that have occurred and need handling
by the various device drivers. Currently this is handled using a
notifier interface where we call every device driver that has
registered to receive opal events.

This approach has several drawbacks. For example each driver has to do
its own checking to see if the event is relevant as well as event
masking. There is also no easy method of recording the number of times
we receive particular events.

This patch solves these issues by exposing opal events via the
standard interrupt APIs by adding a new interrupt chip and
domain. Drivers can then register for the appropriate events using
standard kernel calls such as irq_of_parse_and_map().

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index dcdf83c..1412814 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -243,6 +243,7 @@ extern void opal_msglog_init(void);
 extern int opal_async_comp_init(void);
 extern int opal_sensor_init(void);
 extern int opal_hmi_handler_init(void);
+extern int opal_event_init(void);
 
 extern int opal_machine_check(struct pt_regs *regs);
 extern bool opal_mce_check_early_recovery(struct pt_regs *regs);
@@ -254,6 +255,8 @@ extern int opal_resync_timebase(void);
 
 extern void opal_lpc_init(void);
 
+extern int opal_event_request(unsigned int opal_event_nr);
+
 struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr,
 					     unsigned long vmalloc_size);
 void opal_free_sg_list(struct opal_sg_list *sg);
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index bee9235..26bd7e4 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -1,7 +1,7 @@
 obj-y			+= setup.o opal-wrappers.o opal.o opal-async.o idle.o
 obj-y			+= opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
 obj-y			+= rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
-obj-y			+= opal-msglog.o opal-hmi.o opal-power.o
+obj-y			+= opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
 
 obj-$(CONFIG_SMP)	+= smp.o subcore.o subcore-asm.o
 obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o
diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c
new file mode 100644
index 0000000..bd5125d
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-irqchip.c
@@ -0,0 +1,253 @@
+/*
+ * This file implements an irqchip for OPAL events. Whenever there is
+ * an interrupt that is handled by OPAL we get passed a list of events
+ * that Linux needs to do something about. These basically look like
+ * interrupts to Linux so we implement an irqchip to handle them.
+ *
+ * Copyright Alistair Popple, IBM Corporation 2014.
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ */
+#include <linux/bitops.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqdomain.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/irq_work.h>
+
+#include <asm/machdep.h>
+#include <asm/opal.h>
+
+#include "powernv.h"
+
+/* Maximum number of events supported by OPAL firmware */
+#define MAX_NUM_EVENTS 64
+
+struct opal_event_irqchip {
+	struct irq_chip irqchip;
+	struct irq_domain *domain;
+	unsigned long mask;
+};
+static struct opal_event_irqchip opal_event_irqchip;
+
+static unsigned int opal_irq_count;
+static unsigned int *opal_irqs;
+
+static void opal_handle_irq_work(struct irq_work *work);
+static __be64 last_outstanding_events;
+static struct irq_work opal_event_irq_work = {
+	.func = opal_handle_irq_work,
+};
+
+static void opal_event_mask(struct irq_data *d)
+{
+	clear_bit(d->hwirq, &opal_event_irqchip.mask);
+}
+
+static void opal_event_unmask(struct irq_data *d)
+{
+	set_bit(d->hwirq, &opal_event_irqchip.mask);
+
+	opal_poll_events(&last_outstanding_events);
+	if (last_outstanding_events & opal_event_irqchip.mask)
+		/* Need to retrigger the interrupt */
+		irq_work_queue(&opal_event_irq_work);
+}
+
+static int opal_event_set_type(struct irq_data *d, unsigned int flow_type)
+{
+	/*
+	 * For now we only support level triggered events. The irq
+	 * handler will be called continuously until the event has
+	 * been cleared in OPAL.
+	 */
+	if (flow_type != IRQ_TYPE_LEVEL_HIGH)
+		return -EINVAL;
+
+	return 0;
+}
+
+static struct opal_event_irqchip opal_event_irqchip = {
+	.irqchip = {
+		.name = "OPAL EVT",
+		.irq_mask = opal_event_mask,
+		.irq_unmask = opal_event_unmask,
+		.irq_set_type = opal_event_set_type,
+	},
+	.mask = 0,
+};
+
+static int opal_event_map(struct irq_domain *d, unsigned int irq,
+			irq_hw_number_t hwirq)
+{
+	irq_set_chip_data(irq, &opal_event_irqchip);
+	irq_set_chip_and_handler(irq, &opal_event_irqchip.irqchip,
+				handle_level_irq);
+
+	return 0;
+}
+
+void opal_handle_events(uint64_t events)
+{
+	int virq, hwirq = 0;
+	u64 mask = opal_event_irqchip.mask;
+	u64 notifier_mask = 0;
+
+	if (!in_irq() && (events & mask)) {
+		last_outstanding_events = events;
+		irq_work_queue(&opal_event_irq_work);
+		return;
+	}
+
+	while (events) {
+		hwirq = fls64(events) - 1;
+		virq = irq_find_mapping(opal_event_irqchip.domain,
+					hwirq);
+		if (virq) {
+			if (BIT_ULL(hwirq) & mask)
+				generic_handle_irq(virq);
+		} else
+			notifier_mask |= BIT_ULL(hwirq);
+		events &= ~BIT_ULL(hwirq);
+	}
+
+	opal_do_notifier(notifier_mask);
+}
+
+static irqreturn_t opal_interrupt(int irq, void *data)
+{
+	__be64 events;
+
+	opal_handle_interrupt(virq_to_hw(irq), &events);
+	opal_handle_events(be64_to_cpu(events));
+
+	return IRQ_HANDLED;
+}
+
+static void opal_handle_irq_work(struct irq_work *work)
+{
+	opal_handle_events(be64_to_cpu(last_outstanding_events));
+}
+
+static int opal_event_match(struct irq_domain *h, struct device_node *node)
+{
+	return h->of_node == node;
+}
+
+static int opal_event_xlate(struct irq_domain *h, struct device_node *np,
+			   const u32 *intspec, unsigned int intsize,
+			   irq_hw_number_t *out_hwirq, unsigned int *out_flags)
+{
+	*out_hwirq = intspec[0];
+	*out_flags = IRQ_TYPE_LEVEL_HIGH;
+
+	return 0;
+}
+
+static const struct irq_domain_ops opal_event_domain_ops = {
+	.match	= opal_event_match,
+	.map	= opal_event_map,
+	.xlate	= opal_event_xlate,
+};
+
+void opal_event_shutdown(void)
+{
+	unsigned int i;
+
+	/* First free interrupts, which will also mask them */
+	for (i = 0; i < opal_irq_count; i++) {
+		if (opal_irqs[i])
+			free_irq(opal_irqs[i], NULL);
+		opal_irqs[i] = 0;
+	}
+}
+
+int __init opal_event_init(void)
+{
+	struct device_node *dn, *opal_node;
+	const __be32 *irqs;
+	int i, irqlen, rc = 0;
+
+	opal_node = of_find_node_by_path("/ibm,opal");
+	if (!opal_node) {
+		pr_warn("opal: Node not found\n");
+		return -ENODEV;
+	}
+
+	/* If dn is NULL it means the domain won't be linked to a DT
+	 * node so therefore irq_of_parse_and_map(...) wont work. But
+	 * that shouldn't be problem because if we're running a
+	 * version of skiboot that doesn't have the dn then the
+	 * devices won't have the correct properties and will have to
+	 * fall back to the legacy method (opal_event_request(...))
+	 * anyway. */
+	dn = of_find_compatible_node(NULL, NULL, "ibm,opal-event");
+	opal_event_irqchip.domain = irq_domain_add_linear(dn, MAX_NUM_EVENTS,
+				&opal_event_domain_ops, &opal_event_irqchip);
+	of_node_put(dn);
+	if (!opal_event_irqchip.domain) {
+		pr_warn("opal: Unable to create irq domain\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* Get interrupt property */
+	irqs = of_get_property(opal_node, "opal-interrupts", &irqlen);
+	opal_irq_count = irqs ? (irqlen / 4) : 0;
+	pr_debug("Found %d interrupts reserved for OPAL\n", opal_irq_count);
+
+	/* Install interrupt handlers */
+	opal_irqs = kcalloc(opal_irq_count, sizeof(*opal_irqs), GFP_KERNEL);
+	for (i = 0; irqs && i < opal_irq_count; i++, irqs++) {
+		unsigned int irq, virq;
+
+		/* Get hardware and virtual IRQ */
+		irq = be32_to_cpup(irqs);
+		virq = irq_create_mapping(NULL, irq);
+		if (virq == NO_IRQ) {
+			pr_warn("Failed to map irq 0x%x\n", irq);
+			continue;
+		}
+
+		/* Install interrupt handler */
+		rc = request_irq(virq, opal_interrupt, 0, "opal", NULL);
+		if (rc) {
+			irq_dispose_mapping(virq);
+			pr_warn("Error %d requesting irq %d (0x%x)\n",
+				 rc, virq, irq);
+			continue;
+		}
+
+		/* Cache IRQ */
+		opal_irqs[i] = virq;
+	}
+
+out:
+	of_node_put(opal_node);
+	return rc;
+}
+
+/**
+ * opal_event_request(unsigned int opal_event_nr) - Request an event
+ * @opal_event_nr: the opal event number to request
+ *
+ * This routine can be used to find the linux virq number which can
+ * then be passed to request_irq to assign a handler for a particular
+ * opal event. This should only be used by legacy devices which don't
+ * have proper device tree bindings. Most devices should use
+ * irq_of_parse_and_map() instead.
+ */
+int opal_event_request(unsigned int opal_event_nr)
+{
+	return irq_create_mapping(opal_event_irqchip.domain, opal_event_nr);
+}
+EXPORT_SYMBOL(opal_event_request);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index eb3decc..3baca71 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -53,8 +53,6 @@ static int mc_recoverable_range_len;
 
 struct device_node *opal_node;
 static DEFINE_SPINLOCK(opal_write_lock);
-static unsigned int *opal_irqs;
-static unsigned int opal_irq_count;
 static ATOMIC_NOTIFIER_HEAD(opal_notifier_head);
 static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX];
 static DEFINE_SPINLOCK(opal_notifier_lock);
@@ -251,7 +249,7 @@ int opal_notifier_unregister(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(opal_notifier_unregister);
 
-static void opal_do_notifier(uint64_t events)
+void opal_do_notifier(uint64_t events)
 {
 	unsigned long flags;
 	uint64_t changed_mask;
@@ -571,8 +569,10 @@ int opal_handle_hmi_exception(struct pt_regs *regs)
 
 	local_paca->hmi_event_available = 0;
 	rc = opal_poll_events(&evt);
-	if (rc == OPAL_SUCCESS && evt)
+	if (rc == OPAL_SUCCESS && evt) {
 		opal_do_notifier(be64_to_cpu(evt));
+		opal_handle_events(be64_to_cpu(evt));
+	}
 
 	return 1;
 }
@@ -609,17 +609,6 @@ out:
 	return !!recover_addr;
 }
 
-static irqreturn_t opal_interrupt(int irq, void *data)
-{
-	__be64 events;
-
-	opal_handle_interrupt(virq_to_hw(irq), &events);
-
-	opal_do_notifier(be64_to_cpu(events));
-
-	return IRQ_HANDLED;
-}
-
 static int opal_sysfs_init(void)
 {
 	opal_kobj = kobject_create_and_add("opal", firmware_kobj);
@@ -718,52 +707,15 @@ static void opal_i2c_create_devs(void)
 		of_platform_device_create(np, NULL, NULL);
 }
 
-static void __init opal_irq_init(struct device_node *dn)
-{
-	const __be32 *irqs;
-	int i, irqlen;
-
-	/* Get interrupt property */
-	irqs = of_get_property(opal_node, "opal-interrupts", &irqlen);
-	opal_irq_count = irqs ? (irqlen / 4) : 0;
-	pr_debug("Found %d interrupts reserved for OPAL\n", opal_irq_count);
-	if (!opal_irq_count)
-		return;
-
-	/* Install interrupt handlers */
-	opal_irqs = kzalloc(opal_irq_count * sizeof(unsigned int), GFP_KERNEL);
-	for (i = 0; irqs && i < opal_irq_count; i++, irqs++) {
-		unsigned int irq, virq;
-		int rc;
-
-		/* Get hardware and virtual IRQ */
-		irq = be32_to_cpup(irqs);
-		virq = irq_create_mapping(NULL, irq);
-		if (virq == NO_IRQ) {
-			pr_warn("Failed to map irq 0x%x\n", irq);
-			continue;
-		}
-
-		/* Install interrupt handler */
-		rc = request_irq(virq, opal_interrupt, 0, "opal", NULL);
-		if (rc) {
-			irq_dispose_mapping(virq);
-			pr_warn("Error %d requesting irq %d (0x%x)\n",
-				 rc, virq, irq);
-			continue;
-		}
-
-		/* Cache IRQ */
-		opal_irqs[i] = virq;
-	}
-}
-
 static int kopald(void *unused)
 {
+	__be64 events;
+
 	set_freezable();
 	do {
 		try_to_freeze();
-		opal_poll_events(NULL);
+		opal_poll_events(&events);
+		opal_handle_events(be64_to_cpu(events));
 		msleep_interruptible(opal_heartbeat);
 	} while (!kthread_should_stop());
 
@@ -792,6 +744,9 @@ static int __init opal_init(void)
 		return -ENODEV;
 	}
 
+	/* Initialise OPAL events */
+	opal_event_init();
+
 	/* Register OPAL consoles if any ports */
 	if (firmware_has_feature(FW_FEATURE_OPALv2))
 		consoles = of_find_node_by_path("/ibm,opal/consoles");
@@ -824,9 +779,6 @@ static int __init opal_init(void)
 	/* Setup a heatbeat thread if requested by OPAL */
 	opal_init_heartbeat();
 
-	/* Find all OPAL interrupts and request them */
-	opal_irq_init(opal_node);
-
 	/* Create "opal" kobject under /sys/firmware */
 	rc = opal_sysfs_init();
 	if (rc == 0) {
@@ -857,15 +809,9 @@ machine_subsys_initcall(powernv, opal_init);
 
 void opal_shutdown(void)
 {
-	unsigned int i;
 	long rc = OPAL_BUSY;
 
-	/* First free interrupts, which will also mask them */
-	for (i = 0; i < opal_irq_count; i++) {
-		if (opal_irqs[i])
-			free_irq(opal_irqs[i], NULL);
-		opal_irqs[i] = 0;
-	}
+	opal_event_shutdown();
 
 	/*
 	 * Then sync with OPAL which ensure anything that can
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index 826d2c9..221d4c8 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -35,6 +35,10 @@ extern u32 pnv_get_supported_cpuidle_states(void);
 
 extern void pnv_lpc_init(void);
 
+extern void opal_do_notifier(uint64_t events);
+extern void opal_handle_events(uint64_t events);
+extern void opal_event_shutdown(void);
+
 bool cpu_core_split_required(void);
 
 #endif /* _POWERNV_H */
-- 
cgit v0.10.2


From dce143c3381c355ef73be3dd97cf3ca1b15359b8 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Fri, 15 May 2015 14:06:38 +1000
Subject: ipmi/powernv: Convert to irq event interface

Convert the opal ipmi driver to use the new irq interface for events.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Acked-by: Corey Minyard <cminyard@mvista.com>
Cc: Corey Minyard <minyard@acm.org>
Cc: openipmi-developer@lists.sourceforge.net
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/char/ipmi/ipmi_powernv.c b/drivers/char/ipmi/ipmi_powernv.c
index 8753b0f..9b409c0 100644
--- a/drivers/char/ipmi/ipmi_powernv.c
+++ b/drivers/char/ipmi/ipmi_powernv.c
@@ -15,6 +15,8 @@
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/of.h>
+#include <linux/of_irq.h>
+#include <linux/interrupt.h>
 
 #include <asm/opal.h>
 
@@ -23,8 +25,7 @@ struct ipmi_smi_powernv {
 	u64			interface_id;
 	struct ipmi_device_id	ipmi_id;
 	ipmi_smi_t		intf;
-	u64			event;
-	struct notifier_block	event_nb;
+	unsigned int		irq;
 
 	/**
 	 * We assume that there can only be one outstanding request, so
@@ -197,15 +198,12 @@ static struct ipmi_smi_handlers ipmi_powernv_smi_handlers = {
 	.poll			= ipmi_powernv_poll,
 };
 
-static int ipmi_opal_event(struct notifier_block *nb,
-			  unsigned long events, void *change)
+static irqreturn_t ipmi_opal_event(int irq, void *data)
 {
-	struct ipmi_smi_powernv *smi = container_of(nb,
-					struct ipmi_smi_powernv, event_nb);
+	struct ipmi_smi_powernv *smi = data;
 
-	if (events & smi->event)
-		ipmi_powernv_recv(smi);
-	return 0;
+	ipmi_powernv_recv(smi);
+	return IRQ_HANDLED;
 }
 
 static int ipmi_powernv_probe(struct platform_device *pdev)
@@ -240,13 +238,16 @@ static int ipmi_powernv_probe(struct platform_device *pdev)
 		goto err_free;
 	}
 
-	ipmi->event = 1ull << prop;
-	ipmi->event_nb.notifier_call = ipmi_opal_event;
+	ipmi->irq = irq_of_parse_and_map(dev->of_node, 0);
+	if (!ipmi->irq) {
+		dev_info(dev, "Unable to map irq from device tree\n");
+		ipmi->irq = opal_event_request(prop);
+	}
 
-	rc = opal_notifier_register(&ipmi->event_nb);
-	if (rc) {
-		dev_warn(dev, "OPAL notifier registration failed (%d)\n", rc);
-		goto err_free;
+	if (request_irq(ipmi->irq, ipmi_opal_event, IRQ_TYPE_LEVEL_HIGH,
+				"opal-ipmi", ipmi)) {
+		dev_warn(dev, "Unable to request irq\n");
+		goto err_dispose;
 	}
 
 	ipmi->opal_msg = devm_kmalloc(dev,
@@ -271,7 +272,9 @@ static int ipmi_powernv_probe(struct platform_device *pdev)
 err_free_msg:
 	devm_kfree(dev, ipmi->opal_msg);
 err_unregister:
-	opal_notifier_unregister(&ipmi->event_nb);
+	free_irq(ipmi->irq, ipmi);
+err_dispose:
+	irq_dispose_mapping(ipmi->irq);
 err_free:
 	devm_kfree(dev, ipmi);
 	return rc;
@@ -282,7 +285,9 @@ static int ipmi_powernv_remove(struct platform_device *pdev)
 	struct ipmi_smi_powernv *smi = dev_get_drvdata(&pdev->dev);
 
 	ipmi_unregister_smi(smi->intf);
-	opal_notifier_unregister(&smi->event_nb);
+	free_irq(smi->irq, smi);
+	irq_dispose_mapping(smi->irq);
+
 	return 0;
 }
 
-- 
cgit v0.10.2


From 2def86a7200ca3a9828926eef927a89adb82ce32 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Fri, 15 May 2015 14:06:39 +1000
Subject: hvc: Convert to using interrupts instead of opal events

Convert the opal hvc driver to use the new irqchip to register for
opal events. As older firmware versions may not have device tree
bindings for the interrupt parent we just use a hardcoded hwirq based
on the event number.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/tty/hvc/hvc_opal.c b/drivers/tty/hvc/hvc_opal.c
index 543b234..47b54c6 100644
--- a/drivers/tty/hvc/hvc_opal.c
+++ b/drivers/tty/hvc/hvc_opal.c
@@ -29,6 +29,7 @@
 #include <linux/of.h>
 #include <linux/of_platform.h>
 #include <linux/export.h>
+#include <linux/interrupt.h>
 
 #include <asm/hvconsole.h>
 #include <asm/prom.h>
@@ -61,7 +62,6 @@ static struct hvc_opal_priv *hvc_opal_privs[MAX_NR_HVC_CONSOLES];
 /* For early boot console */
 static struct hvc_opal_priv hvc_opal_boot_priv;
 static u32 hvc_opal_boot_termno;
-static bool hvc_opal_event_registered;
 
 static const struct hv_ops hvc_opal_raw_ops = {
 	.get_chars = opal_get_chars,
@@ -162,28 +162,15 @@ static const struct hv_ops hvc_opal_hvsi_ops = {
 	.tiocmset = hvc_opal_hvsi_tiocmset,
 };
 
-static int hvc_opal_console_event(struct notifier_block *nb,
-				  unsigned long events, void *change)
-{
-	if (events & OPAL_EVENT_CONSOLE_INPUT)
-		hvc_kick();
-	return 0;
-}
-
-static struct notifier_block hvc_opal_console_nb = {
-	.notifier_call	= hvc_opal_console_event,
-};
-
 static int hvc_opal_probe(struct platform_device *dev)
 {
 	const struct hv_ops *ops;
 	struct hvc_struct *hp;
 	struct hvc_opal_priv *pv;
 	hv_protocol_t proto;
-	unsigned int termno, boot = 0;
+	unsigned int termno, irq, boot = 0;
 	const __be32 *reg;
 
-
 	if (of_device_is_compatible(dev->dev.of_node, "ibm,opal-console-raw")) {
 		proto = HV_PROTOCOL_RAW;
 		ops = &hvc_opal_raw_ops;
@@ -227,18 +214,18 @@ static int hvc_opal_probe(struct platform_device *dev)
 		dev->dev.of_node->full_name,
 		boot ? " (boot console)" : "");
 
-	/* We don't do IRQ ... */
-	hp = hvc_alloc(termno, 0, ops, MAX_VIO_PUT_CHARS);
+	irq = opal_event_request(ilog2(OPAL_EVENT_CONSOLE_INPUT));
+	if (!irq) {
+		pr_err("hvc_opal: Unable to map interrupt for device %s\n",
+			dev->dev.of_node->full_name);
+		return irq;
+	}
+
+	hp = hvc_alloc(termno, irq, ops, MAX_VIO_PUT_CHARS);
 	if (IS_ERR(hp))
 		return PTR_ERR(hp);
 	dev_set_drvdata(&dev->dev, hp);
 
-	/* ...  but we use OPAL event to kick the console */
-	if (!hvc_opal_event_registered) {
-		opal_notifier_register(&hvc_opal_console_nb);
-		hvc_opal_event_registered = true;
-	}
-
 	return 0;
 }
 
-- 
cgit v0.10.2


From 79231448c929cc0870f6e5cc10c485661dfb4a9f Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Fri, 15 May 2015 14:06:40 +1000
Subject: powernv/eeh: Update the EEH code to use the opal irq domain

The eeh code currently uses the old notifier method to get eeh events
from OPAL. It also contains some logic to filter opal events which has
been moved into the virtual irqchip. This patch converts the eeh code
to the new event interface which simplifies event handling.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 622f08c..5cf5e6e 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -16,6 +16,7 @@
 #include <linux/delay.h>
 #include <linux/export.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/list.h>
 #include <linux/msi.h>
 #include <linux/of.h>
@@ -40,6 +41,7 @@
 #include "pci.h"
 
 static bool pnv_eeh_nb_init = false;
+static int eeh_event_irq = -EINVAL;
 
 /**
  * pnv_eeh_init - EEH platform dependent initialization
@@ -88,34 +90,22 @@ static int pnv_eeh_init(void)
 	return 0;
 }
 
-static int pnv_eeh_event(struct notifier_block *nb,
-			 unsigned long events, void *change)
+static irqreturn_t pnv_eeh_event(int irq, void *data)
 {
-	uint64_t changed_evts = (uint64_t)change;
-
 	/*
-	 * We simply send special EEH event if EEH has
-	 * been enabled, or clear pending events in
-	 * case that we enable EEH soon
+	 * We simply send a special EEH event if EEH has been
+	 * enabled. We don't care about EEH events until we've
+	 * finished processing the outstanding ones. Event processing
+	 * gets unmasked in next_error() if EEH is enabled.
 	 */
-	if (!(changed_evts & OPAL_EVENT_PCI_ERROR) ||
-	    !(events & OPAL_EVENT_PCI_ERROR))
-		return 0;
+	disable_irq_nosync(irq);
 
 	if (eeh_enabled())
 		eeh_send_failure_event(NULL);
-	else
-		opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
 
-	return 0;
+	return IRQ_HANDLED;
 }
 
-static struct notifier_block pnv_eeh_nb = {
-	.notifier_call	= pnv_eeh_event,
-	.next		= NULL,
-	.priority	= 0
-};
-
 #ifdef CONFIG_DEBUG_FS
 static ssize_t pnv_eeh_ei_write(struct file *filp,
 				const char __user *user_buf,
@@ -237,16 +227,28 @@ static int pnv_eeh_post_init(void)
 
 	/* Register OPAL event notifier */
 	if (!pnv_eeh_nb_init) {
-		ret = opal_notifier_register(&pnv_eeh_nb);
-		if (ret) {
-			pr_warn("%s: Can't register OPAL event notifier (%d)\n",
-				__func__, ret);
+		eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR));
+		if (eeh_event_irq < 0) {
+			pr_err("%s: Can't register OPAL event interrupt (%d)\n",
+			       __func__, eeh_event_irq);
+			return eeh_event_irq;
+		}
+
+		ret = request_irq(eeh_event_irq, pnv_eeh_event,
+				IRQ_TYPE_LEVEL_HIGH, "opal-eeh", NULL);
+		if (ret < 0) {
+			irq_dispose_mapping(eeh_event_irq);
+			pr_err("%s: Can't request OPAL event interrupt (%d)\n",
+			       __func__, eeh_event_irq);
 			return ret;
 		}
 
 		pnv_eeh_nb_init = true;
 	}
 
+	if (!eeh_enabled())
+		disable_irq(eeh_event_irq);
+
 	list_for_each_entry(hose, &hose_list, list_node) {
 		phb = hose->private_data;
 
@@ -1303,12 +1305,10 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
 	int state, ret = EEH_NEXT_ERR_NONE;
 
 	/*
-	 * While running here, it's safe to purge the event queue.
-	 * And we should keep the cached OPAL notifier event sychronized
-	 * between the kernel and firmware.
+	 * While running here, it's safe to purge the event queue. The
+	 * event should still be masked.
 	 */
 	eeh_remove_event(NULL, false);
-	opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
 
 	list_for_each_entry(hose, &hose_list, list_node) {
 		/*
@@ -1477,6 +1477,10 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
 			break;
 	}
 
+	/* Unmask the event */
+	if (eeh_enabled())
+		enable_irq(eeh_event_irq);
+
 	return ret;
 }
 
-- 
cgit v0.10.2


From a295af24d0d2af238d74a994603407b72de157b3 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Fri, 15 May 2015 14:06:41 +1000
Subject: powernv/opal: Convert opal message events to opal irq domain

This patch converts the opal message event to use the new opal irq
domain.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 3baca71..cd5718b 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -362,33 +362,34 @@ static void opal_handle_message(void)
 	opal_message_do_notify(type, (void *)&msg);
 }
 
-static int opal_message_notify(struct notifier_block *nb,
-			  unsigned long events, void *change)
+static irqreturn_t opal_message_notify(int irq, void *data)
 {
-	if (events & OPAL_EVENT_MSG_PENDING)
-		opal_handle_message();
-	return 0;
+	opal_handle_message();
+	return IRQ_HANDLED;
 }
 
-static struct notifier_block opal_message_nb = {
-	.notifier_call	= opal_message_notify,
-	.next		= NULL,
-	.priority	= 0,
-};
-
 static int __init opal_message_init(void)
 {
-	int ret, i;
+	int ret, i, irq;
 
 	for (i = 0; i < OPAL_MSG_TYPE_MAX; i++)
 		ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]);
 
-	ret = opal_notifier_register(&opal_message_nb);
+	irq = opal_event_request(ilog2(OPAL_EVENT_MSG_PENDING));
+	if (!irq) {
+		pr_err("%s: Can't register OPAL event irq (%d)\n",
+		       __func__, irq);
+		return irq;
+	}
+
+	ret = request_irq(irq, opal_message_notify,
+			IRQ_TYPE_LEVEL_HIGH, "opal-msg", NULL);
 	if (ret) {
-		pr_err("%s: Can't register OPAL event notifier (%d)\n",
+		pr_err("%s: Can't request OPAL event irq (%d)\n",
 		       __func__, ret);
 		return ret;
 	}
+
 	return 0;
 }
 
-- 
cgit v0.10.2


From 74159a70283ba685cd4d856cfc4438596524b1f2 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Fri, 15 May 2015 14:06:42 +1000
Subject: powernv/elog: Convert elog to opal irq domain

This patch converts the elog code to use the opal irq domain instead
of notifier events.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/opal-elog.c b/arch/powerpc/platforms/powernv/opal-elog.c
index 38ce757..4949ef0 100644
--- a/arch/powerpc/platforms/powernv/opal-elog.c
+++ b/arch/powerpc/platforms/powernv/opal-elog.c
@@ -10,6 +10,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/of.h>
 #include <linux/slab.h>
 #include <linux/sysfs.h>
@@ -276,24 +277,15 @@ static void elog_work_fn(struct work_struct *work)
 
 static DECLARE_WORK(elog_work, elog_work_fn);
 
-static int elog_event(struct notifier_block *nb,
-				unsigned long events, void *change)
+static irqreturn_t elog_event(int irq, void *data)
 {
-	/* check for error log event */
-	if (events & OPAL_EVENT_ERROR_LOG_AVAIL)
-		schedule_work(&elog_work);
-	return 0;
+	schedule_work(&elog_work);
+	return IRQ_HANDLED;
 }
 
-static struct notifier_block elog_nb = {
-	.notifier_call  = elog_event,
-	.next           = NULL,
-	.priority       = 0
-};
-
 int __init opal_elog_init(void)
 {
-	int rc = 0;
+	int rc = 0, irq;
 
 	/* ELOG not supported by firmware */
 	if (!opal_check_token(OPAL_ELOG_READ))
@@ -305,10 +297,18 @@ int __init opal_elog_init(void)
 		return -1;
 	}
 
-	rc = opal_notifier_register(&elog_nb);
+	irq = opal_event_request(ilog2(OPAL_EVENT_ERROR_LOG_AVAIL));
+	if (!irq) {
+		pr_err("%s: Can't register OPAL event irq (%d)\n",
+		       __func__, irq);
+		return irq;
+	}
+
+	rc = request_irq(irq, elog_event,
+			IRQ_TYPE_LEVEL_HIGH, "opal-elog", NULL);
 	if (rc) {
-		pr_err("%s: Can't register OPAL event notifier (%d)\n",
-		__func__, rc);
+		pr_err("%s: Can't request OPAL event irq (%d)\n",
+		       __func__, rc);
 		return rc;
 	}
 
-- 
cgit v0.10.2


From 8034f715f0a5fce97b14bacd47eefb1513316b17 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Fri, 15 May 2015 14:06:43 +1000
Subject: powernv/opal-dump: Convert to irq domain

Convert the opal dump driver to the new opal irq domain.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c
index 5aa9c1c..2ee9643 100644
--- a/arch/powerpc/platforms/powernv/opal-dump.c
+++ b/arch/powerpc/platforms/powernv/opal-dump.c
@@ -15,6 +15,7 @@
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
 #include <linux/delay.h>
+#include <linux/interrupt.h>
 
 #include <asm/opal.h>
 
@@ -60,7 +61,7 @@ static ssize_t dump_type_show(struct dump_obj *dump_obj,
 			      struct dump_attribute *attr,
 			      char *buf)
 {
-	
+
 	return sprintf(buf, "0x%x %s\n", dump_obj->type,
 		       dump_type_to_string(dump_obj->type));
 }
@@ -363,7 +364,7 @@ static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
 	return dump;
 }
 
-static int process_dump(void)
+static irqreturn_t process_dump(int irq, void *data)
 {
 	int rc;
 	uint32_t dump_id, dump_size, dump_type;
@@ -387,45 +388,13 @@ static int process_dump(void)
 	if (!dump)
 		return -1;
 
-	return 0;
-}
-
-static void dump_work_fn(struct work_struct *work)
-{
-	process_dump();
+	return IRQ_HANDLED;
 }
 
-static DECLARE_WORK(dump_work, dump_work_fn);
-
-static void schedule_process_dump(void)
-{
-	schedule_work(&dump_work);
-}
-
-/*
- * New dump available notification
- *
- * Once we get notification, we add sysfs entries for it.
- * We only fetch the dump on demand, and create sysfs asynchronously.
- */
-static int dump_event(struct notifier_block *nb,
-		      unsigned long events, void *change)
-{
-	if (events & OPAL_EVENT_DUMP_AVAIL)
-		schedule_process_dump();
-
-	return 0;
-}
-
-static struct notifier_block dump_nb = {
-	.notifier_call  = dump_event,
-	.next           = NULL,
-	.priority       = 0
-};
-
 void __init opal_platform_dump_init(void)
 {
 	int rc;
+	int dump_irq;
 
 	/* ELOG not supported by firmware */
 	if (!opal_check_token(OPAL_DUMP_READ))
@@ -445,10 +414,19 @@ void __init opal_platform_dump_init(void)
 		return;
 	}
 
-	rc = opal_notifier_register(&dump_nb);
+	dump_irq = opal_event_request(ilog2(OPAL_EVENT_DUMP_AVAIL));
+	if (!dump_irq) {
+		pr_err("%s: Can't register OPAL event irq (%d)\n",
+		       __func__, dump_irq);
+		return;
+	}
+
+	rc = request_threaded_irq(dump_irq, NULL, process_dump,
+				IRQF_TRIGGER_HIGH | IRQF_ONESHOT,
+				"opal-dump", NULL);
 	if (rc) {
-		pr_warn("%s: Can't register OPAL event notifier (%d)\n",
-			__func__, rc);
+		pr_err("%s: Can't request OPAL event irq (%d)\n",
+		       __func__, rc);
 		return;
 	}
 
-- 
cgit v0.10.2


From 81f2f7ce4c5bb688ad691cb3ee37e81ca26a8a3b Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Fri, 15 May 2015 14:06:44 +1000
Subject: opal: Remove events notifier

All users of the old opal events notifier have been converted over to
the irq domain so remove the event notifier functions.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c
index bd5125d..841135f 100644
--- a/arch/powerpc/platforms/powernv/opal-irqchip.c
+++ b/arch/powerpc/platforms/powernv/opal-irqchip.c
@@ -100,7 +100,6 @@ void opal_handle_events(uint64_t events)
 {
 	int virq, hwirq = 0;
 	u64 mask = opal_event_irqchip.mask;
-	u64 notifier_mask = 0;
 
 	if (!in_irq() && (events & mask)) {
 		last_outstanding_events = events;
@@ -108,19 +107,16 @@ void opal_handle_events(uint64_t events)
 		return;
 	}
 
-	while (events) {
+	while (events & mask) {
 		hwirq = fls64(events) - 1;
-		virq = irq_find_mapping(opal_event_irqchip.domain,
-					hwirq);
-		if (virq) {
-			if (BIT_ULL(hwirq) & mask)
+		if (BIT_ULL(hwirq) & mask) {
+			virq = irq_find_mapping(opal_event_irqchip.domain,
+						hwirq);
+			if (virq)
 				generic_handle_irq(virq);
-		} else
-			notifier_mask |= BIT_ULL(hwirq);
+		}
 		events &= ~BIT_ULL(hwirq);
 	}
-
-	opal_do_notifier(notifier_mask);
 }
 
 static irqreturn_t opal_interrupt(int irq, void *data)
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index cd5718b..8403307 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -53,11 +53,7 @@ static int mc_recoverable_range_len;
 
 struct device_node *opal_node;
 static DEFINE_SPINLOCK(opal_write_lock);
-static ATOMIC_NOTIFIER_HEAD(opal_notifier_head);
 static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX];
-static DEFINE_SPINLOCK(opal_notifier_lock);
-static uint64_t last_notified_mask = 0x0ul;
-static atomic_t opal_notifier_hold = ATOMIC_INIT(0);
 static uint32_t opal_heartbeat;
 
 static void opal_reinit_cores(void)
@@ -223,82 +219,6 @@ static int __init opal_register_exception_handlers(void)
 }
 machine_early_initcall(powernv, opal_register_exception_handlers);
 
-int opal_notifier_register(struct notifier_block *nb)
-{
-	if (!nb) {
-		pr_warning("%s: Invalid argument (%p)\n",
-			   __func__, nb);
-		return -EINVAL;
-	}
-
-	atomic_notifier_chain_register(&opal_notifier_head, nb);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(opal_notifier_register);
-
-int opal_notifier_unregister(struct notifier_block *nb)
-{
-	if (!nb) {
-		pr_warning("%s: Invalid argument (%p)\n",
-			   __func__, nb);
-		return -EINVAL;
-	}
-
-	atomic_notifier_chain_unregister(&opal_notifier_head, nb);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(opal_notifier_unregister);
-
-void opal_do_notifier(uint64_t events)
-{
-	unsigned long flags;
-	uint64_t changed_mask;
-
-	if (atomic_read(&opal_notifier_hold))
-		return;
-
-	spin_lock_irqsave(&opal_notifier_lock, flags);
-	changed_mask = last_notified_mask ^ events;
-	last_notified_mask = events;
-	spin_unlock_irqrestore(&opal_notifier_lock, flags);
-
-	/*
-	 * We feed with the event bits and changed bits for
-	 * enough information to the callback.
-	 */
-	atomic_notifier_call_chain(&opal_notifier_head,
-				   events, (void *)changed_mask);
-}
-
-void opal_notifier_update_evt(uint64_t evt_mask,
-			      uint64_t evt_val)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&opal_notifier_lock, flags);
-	last_notified_mask &= ~evt_mask;
-	last_notified_mask |= evt_val;
-	spin_unlock_irqrestore(&opal_notifier_lock, flags);
-}
-
-void opal_notifier_enable(void)
-{
-	int64_t rc;
-	__be64 evt = 0;
-
-	atomic_set(&opal_notifier_hold, 0);
-
-	/* Process pending events */
-	rc = opal_poll_events(&evt);
-	if (rc == OPAL_SUCCESS && evt)
-		opal_do_notifier(be64_to_cpu(evt));
-}
-
-void opal_notifier_disable(void)
-{
-	atomic_set(&opal_notifier_hold, 1);
-}
-
 /*
  * Opal message notifier based on message type. Allow subscribers to get
  * notified for specific messgae type.
@@ -570,10 +490,8 @@ int opal_handle_hmi_exception(struct pt_regs *regs)
 
 	local_paca->hmi_event_available = 0;
 	rc = opal_poll_events(&evt);
-	if (rc == OPAL_SUCCESS && evt) {
-		opal_do_notifier(be64_to_cpu(evt));
+	if (rc == OPAL_SUCCESS && evt)
 		opal_handle_events(be64_to_cpu(evt));
-	}
 
 	return 1;
 }
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index 221d4c8..f907f0a 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -35,7 +35,6 @@ extern u32 pnv_get_supported_cpuidle_states(void);
 
 extern void pnv_lpc_init(void);
 
-extern void opal_do_notifier(uint64_t events);
 extern void opal_handle_events(uint64_t events);
 extern void opal_event_shutdown(void);
 
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 509cdd2..15e9337 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -107,7 +107,7 @@ static void pnv_prepare_going_down(void)
 	 * Disable all notifiers from OPAL, we can't
 	 * service interrupts anymore anyway
 	 */
-	opal_notifier_disable();
+	opal_event_shutdown();
 
 	/* Soft disable interrupts */
 	local_irq_disable();
-- 
cgit v0.10.2


From e059b105d157d0231e2f0a7fba996724d856114b Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Tue, 14 Apr 2015 14:27:54 +1000
Subject: powerpc: Add MSI operations to pci_controller_ops struct

Add MSI setup and teardown functions to pci_controller_ops.

Patch the callsites (arch_{setup,teardown}_msi_irqs) to prefer the
controller ops version if it's available.

Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 1811c44..a3b6252 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -30,6 +30,12 @@ struct pci_controller_ops {
 	/* Called during PCI resource reassignment */
 	resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type);
 	void		(*reset_secondary_bus)(struct pci_dev *dev);
+
+#ifdef CONFIG_PCI_MSI
+	int		(*setup_msi_irqs)(struct pci_dev *dev,
+					  int nvec, int type);
+	void		(*teardown_msi_irqs)(struct pci_dev *dev);
+#endif
 };
 
 /*
diff --git a/arch/powerpc/kernel/msi.c b/arch/powerpc/kernel/msi.c
index 71bd161..3d452f7 100644
--- a/arch/powerpc/kernel/msi.c
+++ b/arch/powerpc/kernel/msi.c
@@ -15,7 +15,11 @@
 
 int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
-	if (!ppc_md.setup_msi_irqs || !ppc_md.teardown_msi_irqs) {
+	struct pci_controller *phb = pci_bus_to_host(dev->bus);
+
+	if ((!phb->controller_ops.setup_msi_irqs ||
+	     !phb->controller_ops.teardown_msi_irqs) &&
+	    (!ppc_md.setup_msi_irqs || !ppc_md.teardown_msi_irqs)) {
 		pr_debug("msi: Platform doesn't provide MSI callbacks.\n");
 		return -ENOSYS;
 	}
@@ -24,10 +28,18 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	if (type == PCI_CAP_ID_MSI && nvec > 1)
 		return 1;
 
-	return ppc_md.setup_msi_irqs(dev, nvec, type);
+	if (phb->controller_ops.setup_msi_irqs)
+		return phb->controller_ops.setup_msi_irqs(dev, nvec, type);
+	else
+		return ppc_md.setup_msi_irqs(dev, nvec, type);
 }
 
 void arch_teardown_msi_irqs(struct pci_dev *dev)
 {
-	ppc_md.teardown_msi_irqs(dev);
+	struct pci_controller *phb = pci_bus_to_host(dev->bus);
+
+	if (phb->controller_ops.teardown_msi_irqs)
+		phb->controller_ops.teardown_msi_irqs(dev);
+	else
+		ppc_md.teardown_msi_irqs(dev);
 }
-- 
cgit v0.10.2


From d6381119a415316c5602710cf7fbf388d15d6d3c Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Tue, 14 Apr 2015 14:27:55 +1000
Subject: powerpc/powernv: Move MSI-related ops to pci_controller_ops

Move the PowerNV/BML platform to use the pci_controller_ops structure
rather than ppc_md for MSI related PCI controller operations.

Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index bca2aeb..0f04940 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -768,16 +768,14 @@ void __init pnv_pci_init(void)
 	ppc_md.tce_free_rm = pnv_tce_free_rm;
 	ppc_md.tce_get = pnv_tce_get;
 	set_pci_dma_ops(&dma_iommu_ops);
-
-	/* Configure MSIs */
-#ifdef CONFIG_PCI_MSI
-	ppc_md.setup_msi_irqs = pnv_setup_msi_irqs;
-	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
-#endif
 }
 
 machine_subsys_initcall_sync(powernv, tce_iommu_bus_notifier_init);
 
 struct pci_controller_ops pnv_pci_controller_ops = {
 	.dma_dev_setup = pnv_pci_dma_dev_setup,
+#ifdef CONFIG_PCI_MSI
+	.setup_msi_irqs = pnv_setup_msi_irqs,
+	.teardown_msi_irqs = pnv_teardown_msi_irqs,
+#endif
 };
-- 
cgit v0.10.2


From 7e3d6c5a4be45769a9d439a4b62ad85cfe9e6754 Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Tue, 14 Apr 2015 14:27:56 +1000
Subject: powerpc/cell: Move MSI-related ops to pci_controller_ops

Move the Cell platform to use the pci_controller_ops structure rather
than ppc_md for MSI related PCI controller operations.

We can be confident that the functions will be added to the platform's
ops struct before any PCI controller's ops struct is populated
because:

1) These ops are added to the struct in a subsys initcall.

We populate the ops in axon_msi_probe, which is the probe call for the
axon-msi driver. However the driver is registered in axon_msi_init,
which is a subsys initcall, so this will happen at the subsys level.

2) The controller recieves the struct later, in a device initcall.

Cell populates the controller in cell_setup_phb, which is hooked up to
ppc_md.pci_setup_phb. ppc_md.pci_setup_phb is only ever called in
of_platform.c, as part of the OpenFirmware PCI driver's probe
routine. That driver is registered in a device initcall, so it will
occur *after* the struct is properly populated.

Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c
index 623bd96..d9b8a44 100644
--- a/arch/powerpc/platforms/cell/axon_msi.c
+++ b/arch/powerpc/platforms/cell/axon_msi.c
@@ -22,6 +22,7 @@
 #include <asm/machdep.h>
 #include <asm/prom.h>
 
+#include "cell.h"
 
 /*
  * MSIC registers, specified as offsets from dcr_base
@@ -406,8 +407,8 @@ static int axon_msi_probe(struct platform_device *device)
 
 	dev_set_drvdata(&device->dev, msic);
 
-	ppc_md.setup_msi_irqs = axon_msi_setup_msi_irqs;
-	ppc_md.teardown_msi_irqs = axon_msi_teardown_msi_irqs;
+	cell_pci_controller_ops.setup_msi_irqs = axon_msi_setup_msi_irqs;
+	cell_pci_controller_ops.teardown_msi_irqs = axon_msi_teardown_msi_irqs;
 
 	axon_msi_debug_setup(dn, msic);
 
-- 
cgit v0.10.2


From 1d14b8755f0a7d8110f0bdc5b74987f8cc96c18e Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Tue, 14 Apr 2015 14:27:57 +1000
Subject: powerpc/pseries: Move MSI-related ops to pci_controller_ops

Move the pseries platform to use the pci_controller_ops structure
rather than ppc_md for MSI related PCI controller operations

We need to iterate all PHBs because the MSI setup happens later than
find_and_init_phbs() - mpe.

Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index c8d24f9..c22bb64 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -18,6 +18,8 @@
 #include <asm/ppc-pci.h>
 #include <asm/machdep.h>
 
+#include "pseries.h"
+
 static int query_token, change_token;
 
 #define RTAS_QUERY_FN		0
@@ -505,6 +507,8 @@ static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev)
 
 static int rtas_msi_init(void)
 {
+	struct pci_controller *phb;
+
 	query_token  = rtas_token("ibm,query-interrupt-source-number");
 	change_token = rtas_token("ibm,change-msi");
 
@@ -516,9 +520,15 @@ static int rtas_msi_init(void)
 
 	pr_debug("rtas_msi: Registering RTAS MSI callbacks.\n");
 
-	WARN_ON(ppc_md.setup_msi_irqs);
-	ppc_md.setup_msi_irqs = rtas_setup_msi_irqs;
-	ppc_md.teardown_msi_irqs = rtas_teardown_msi_irqs;
+	WARN_ON(pseries_pci_controller_ops.setup_msi_irqs);
+	pseries_pci_controller_ops.setup_msi_irqs = rtas_setup_msi_irqs;
+	pseries_pci_controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs;
+
+	list_for_each_entry(phb, &hose_list, list_node) {
+		WARN_ON(phb->controller_ops.setup_msi_irqs);
+		phb->controller_ops.setup_msi_irqs = rtas_setup_msi_irqs;
+		phb->controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs;
+	}
 
 	WARN_ON(ppc_md.pci_irq_fixup);
 	ppc_md.pci_irq_fixup = rtas_msi_pci_irq_fixup;
-- 
cgit v0.10.2


From 00e25397032f590d0a4d0ee89e236a4d1f8c0580 Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Tue, 14 Apr 2015 14:27:58 +1000
Subject: powerpc/fsl_msi: Move MSI-related ops to pci_controller_ops

Move the fsl_msi subsystem to use the pci_controller_ops structure
rather than ppc_md for MSI related PCI controller operations.

Previously, MSI ops were added to ppc_md at the subsys level. However,
in fsl_pci.c, PCI controllers are created at the at arch level. So,
unlike in e.g. PowerNV/pSeries/Cell, we can't simply populate a
platform-level controller ops structure and have it copied into the
controllers when they are created.

Instead, walk every phb, and attempt to populate it with the MSI ops.

Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
index f086c6f..5236e54 100644
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -405,6 +405,7 @@ static int fsl_of_msi_probe(struct platform_device *dev)
 	const struct fsl_msi_feature *features;
 	int len;
 	u32 offset;
+	struct pci_controller *phb;
 
 	match = of_match_device(fsl_of_msi_ids, &dev->dev);
 	if (!match)
@@ -541,14 +542,20 @@ static int fsl_of_msi_probe(struct platform_device *dev)
 
 	list_add_tail(&msi->list, &msi_head);
 
-	/* The multiple setting ppc_md.setup_msi_irqs will not harm things */
-	if (!ppc_md.setup_msi_irqs) {
-		ppc_md.setup_msi_irqs = fsl_setup_msi_irqs;
-		ppc_md.teardown_msi_irqs = fsl_teardown_msi_irqs;
-	} else if (ppc_md.setup_msi_irqs != fsl_setup_msi_irqs) {
-		dev_err(&dev->dev, "Different MSI driver already installed!\n");
-		err = -ENODEV;
-		goto error_out;
+	/*
+	 * Apply the MSI ops to all the controllers.
+	 * It doesn't hurt to reassign the same ops,
+	 * but bail out if we find another MSI driver.
+	 */
+	list_for_each_entry(phb, &hose_list, list_node) {
+		if (!phb->controller_ops.setup_msi_irqs) {
+			phb->controller_ops.setup_msi_irqs = fsl_setup_msi_irqs;
+			phb->controller_ops.teardown_msi_irqs = fsl_teardown_msi_irqs;
+		} else if (phb->controller_ops.setup_msi_irqs != fsl_setup_msi_irqs) {
+			dev_err(&dev->dev, "Different MSI driver already installed!\n");
+			err = -ENODEV;
+			goto error_out;
+		}
 	}
 	return 0;
 error_out:
-- 
cgit v0.10.2


From b7eba2ffccb51d7676a4ddc26658687331feb6a3 Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Tue, 14 Apr 2015 14:27:59 +1000
Subject: powerpc/ppc4xx_msi: Move MSI-related ops to pci_controller_ops

Move the ppc4xx msi subsystem to use the pci_controller_ops structure
rather than ppc_md for MSI related PCI controller operations.

As with fsl_msi, operations are plugged in at the subsys level, after
controller creation. Again, we iterate over all controllers and
populate them with the MSI ops.

Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/sysdev/ppc4xx_msi.c b/arch/powerpc/sysdev/ppc4xx_msi.c
index 6e2e6aa..6eb21f2 100644
--- a/arch/powerpc/sysdev/ppc4xx_msi.c
+++ b/arch/powerpc/sysdev/ppc4xx_msi.c
@@ -218,6 +218,7 @@ static int ppc4xx_msi_probe(struct platform_device *dev)
 	struct ppc4xx_msi *msi;
 	struct resource res;
 	int err = 0;
+	struct pci_controller *phb;
 
 	dev_dbg(&dev->dev, "PCIE-MSI: Setting up MSI support...\n");
 
@@ -250,8 +251,10 @@ static int ppc4xx_msi_probe(struct platform_device *dev)
 	}
 	ppc4xx_msi = *msi;
 
-	ppc_md.setup_msi_irqs = ppc4xx_setup_msi_irqs;
-	ppc_md.teardown_msi_irqs = ppc4xx_teardown_msi_irqs;
+	list_for_each_entry(phb, &hose_list, list_node) {
+		phb->controller_ops.setup_msi_irqs = ppc4xx_setup_msi_irqs;
+		phb->controller_ops.teardown_msi_irqs = ppc4xx_teardown_msi_irqs;
+	}
 	return err;
 
 error_out:
-- 
cgit v0.10.2


From f2c800aaceb6f26fe78590f971169b1d3d6fe322 Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Tue, 14 Apr 2015 14:28:00 +1000
Subject: powerpc/ppc4xx_hsta_msi: Move MSI-related ops to pci_controller_ops

Move the ppc4xx hsta msi subsystem to use the pci_controller_ops
structure rather than ppc_md for MSI related PCI controller
operations.

As with fsl_msi, operations are plugged in at the subsys level, after
controller creation. Again, we iterate over all controllers and
populate them with the MSI ops.

Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/sysdev/ppc4xx_hsta_msi.c b/arch/powerpc/sysdev/ppc4xx_hsta_msi.c
index f366d2d..2bc3367 100644
--- a/arch/powerpc/sysdev/ppc4xx_hsta_msi.c
+++ b/arch/powerpc/sysdev/ppc4xx_hsta_msi.c
@@ -128,6 +128,7 @@ static int hsta_msi_probe(struct platform_device *pdev)
 	struct device *dev = &pdev->dev;
 	struct resource *mem;
 	int irq, ret, irq_count;
+	struct pci_controller *phb;
 
 	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (IS_ERR(mem)) {
@@ -171,8 +172,10 @@ static int hsta_msi_probe(struct platform_device *pdev)
 		}
 	}
 
-	ppc_md.setup_msi_irqs = hsta_setup_msi_irqs;
-	ppc_md.teardown_msi_irqs = hsta_teardown_msi_irqs;
+	list_for_each_entry(phb, &hose_list, list_node) {
+		phb->controller_ops.setup_msi_irqs = hsta_setup_msi_irqs;
+		phb->controller_ops.teardown_msi_irqs = hsta_teardown_msi_irqs;
+	}
 	return 0;
 
 out2:
-- 
cgit v0.10.2


From 83922966973e19a48b6e59f9fa1259aa790a33c1 Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Tue, 14 Apr 2015 14:28:01 +1000
Subject: powerpc/pasemi: Move MSI-related ops to pci_controller_ops

Move the PaSemi MPIC msi subsystem to use the pci_controller_ops
structure rather than ppc_md for MSI related PCI controller
operations.

As with fsl_msi, operations are plugged in at the subsys level, after
controller creation. Again, we iterate over all controllers and
populate them with the MSI ops.

Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/pasemi/msi.c b/arch/powerpc/platforms/pasemi/msi.c
index 0b37066..27f2b18 100644
--- a/arch/powerpc/platforms/pasemi/msi.c
+++ b/arch/powerpc/platforms/pasemi/msi.c
@@ -142,6 +142,7 @@ static int pasemi_msi_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 int mpic_pasemi_msi_init(struct mpic *mpic)
 {
 	int rc;
+	struct pci_controller *phb;
 
 	if (!mpic->irqhost->of_node ||
 	    !of_device_is_compatible(mpic->irqhost->of_node,
@@ -157,9 +158,11 @@ int mpic_pasemi_msi_init(struct mpic *mpic)
 	pr_debug("pasemi_msi: Registering PA Semi MPIC MSI callbacks\n");
 
 	msi_mpic = mpic;
-	WARN_ON(ppc_md.setup_msi_irqs);
-	ppc_md.setup_msi_irqs = pasemi_msi_setup_msi_irqs;
-	ppc_md.teardown_msi_irqs = pasemi_msi_teardown_msi_irqs;
+	list_for_each_entry(phb, &hose_list, list_node) {
+		WARN_ON(phb->controller_ops.setup_msi_irqs);
+		phb->controller_ops.setup_msi_irqs = pasemi_msi_setup_msi_irqs;
+		phb->controller_ops.teardown_msi_irqs = pasemi_msi_teardown_msi_irqs;
+	}
 
 	return 0;
 }
-- 
cgit v0.10.2


From 14f95acda2f3238f3470beea0ddbf78081b3268d Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Tue, 14 Apr 2015 14:28:02 +1000
Subject: powerpc/mpic_u3msi: Move MSI-related ops to pci_controller_ops

Move the u3 MPIC msi subsystem to use the pci_controller_ops structure
rather than ppc_md for MSI related PCI controller operations.

As with fsl_msi, operations are plugged in at the subsys level, after
controller creation. Again, we iterate over all controllers and
populate them with the MSI ops.

Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/sysdev/mpic_u3msi.c b/arch/powerpc/sysdev/mpic_u3msi.c
index b2cef18..fc46ef3 100644
--- a/arch/powerpc/sysdev/mpic_u3msi.c
+++ b/arch/powerpc/sysdev/mpic_u3msi.c
@@ -181,6 +181,7 @@ static int u3msi_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 int mpic_u3msi_init(struct mpic *mpic)
 {
 	int rc;
+	struct pci_controller *phb;
 
 	rc = mpic_msi_init_allocator(mpic);
 	if (rc) {
@@ -193,9 +194,11 @@ int mpic_u3msi_init(struct mpic *mpic)
 	BUG_ON(msi_mpic);
 	msi_mpic = mpic;
 
-	WARN_ON(ppc_md.setup_msi_irqs);
-	ppc_md.setup_msi_irqs = u3msi_setup_msi_irqs;
-	ppc_md.teardown_msi_irqs = u3msi_teardown_msi_irqs;
+	list_for_each_entry(phb, &hose_list, list_node) {
+		WARN_ON(phb->controller_ops.setup_msi_irqs);
+		phb->controller_ops.setup_msi_irqs = u3msi_setup_msi_irqs;
+		phb->controller_ops.teardown_msi_irqs = u3msi_teardown_msi_irqs;
+	}
 
 	return 0;
 }
-- 
cgit v0.10.2


From 1f88d5860e0b8244b28e21b63a521915e5c15313 Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Tue, 14 Apr 2015 14:28:03 +1000
Subject: powerpc: Remove MSI-related PCI controller ops from ppc_md

Remove unneeded ppc_md functions. Patch callsites to use pci_controller_ops
functions exclusively.

Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index ef889943..0d387d0 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -131,12 +131,6 @@ struct machdep_calls {
 	/* To setup PHBs when using automatic OF platform driver for PCI */
 	int		(*pci_setup_phb)(struct pci_controller *host);
 
-#ifdef CONFIG_PCI_MSI
-	int		(*setup_msi_irqs)(struct pci_dev *dev,
-					  int nvec, int type);
-	void		(*teardown_msi_irqs)(struct pci_dev *dev);
-#endif
-
 	void		(*restart)(char *cmd);
 	void		(*halt)(void);
 	void		(*panic)(char *str);
diff --git a/arch/powerpc/kernel/msi.c b/arch/powerpc/kernel/msi.c
index 3d452f7..dab616a 100644
--- a/arch/powerpc/kernel/msi.c
+++ b/arch/powerpc/kernel/msi.c
@@ -17,9 +17,8 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
 	struct pci_controller *phb = pci_bus_to_host(dev->bus);
 
-	if ((!phb->controller_ops.setup_msi_irqs ||
-	     !phb->controller_ops.teardown_msi_irqs) &&
-	    (!ppc_md.setup_msi_irqs || !ppc_md.teardown_msi_irqs)) {
+	if (!phb->controller_ops.setup_msi_irqs ||
+	    !phb->controller_ops.teardown_msi_irqs) {
 		pr_debug("msi: Platform doesn't provide MSI callbacks.\n");
 		return -ENOSYS;
 	}
@@ -28,18 +27,12 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	if (type == PCI_CAP_ID_MSI && nvec > 1)
 		return 1;
 
-	if (phb->controller_ops.setup_msi_irqs)
-		return phb->controller_ops.setup_msi_irqs(dev, nvec, type);
-	else
-		return ppc_md.setup_msi_irqs(dev, nvec, type);
+	return phb->controller_ops.setup_msi_irqs(dev, nvec, type);
 }
 
 void arch_teardown_msi_irqs(struct pci_dev *dev)
 {
 	struct pci_controller *phb = pci_bus_to_host(dev->bus);
 
-	if (phb->controller_ops.teardown_msi_irqs)
-		phb->controller_ops.teardown_msi_irqs(dev);
-	else
-		ppc_md.teardown_msi_irqs(dev);
+	phb->controller_ops.teardown_msi_irqs(dev);
 }
-- 
cgit v0.10.2


From 92ae03532619dc24fdb7a5ae8ea63785fbd39f86 Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Tue, 28 Apr 2015 15:12:05 +1000
Subject: powerpc/powernv: Specialise pci_controller_ops for each controller
 type

Remove powernv generic PCI controller operations. Replace it with
controller ops for each of the two supported PHBs.

As an added bonus, make the two new structs const, which will help
guard against bugs such as the one introduced in 65ebf4b63
("powerpc/powernv: Move controller ops from ppc_md to controller_ops")

Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index f8bc950..23125f4 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2648,6 +2648,17 @@ static void pnv_pci_ioda_shutdown(struct pnv_phb *phb)
 		       OPAL_ASSERT_RESET);
 }
 
+static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
+       .dma_dev_setup = pnv_pci_dma_dev_setup,
+#ifdef CONFIG_PCI_MSI
+       .setup_msi_irqs = pnv_setup_msi_irqs,
+       .teardown_msi_irqs = pnv_teardown_msi_irqs,
+#endif
+       .enable_device_hook = pnv_pci_enable_device_hook,
+       .window_alignment = pnv_pci_window_alignment,
+       .reset_secondary_bus = pnv_pci_reset_secondary_bus,
+};
+
 static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 					 u64 hub_id, int ioda_type)
 {
@@ -2808,10 +2819,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 	 * the child P2P bridges) can form individual PE.
 	 */
 	ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
-	pnv_pci_controller_ops.enable_device_hook = pnv_pci_enable_device_hook;
-	pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment;
-	pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus;
-	hose->controller_ops = pnv_pci_controller_ops;
+	hose->controller_ops = pnv_pci_ioda_controller_ops;
 
 #ifdef CONFIG_PCI_IOV
 	ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
index 4729ca7..7f826e8 100644
--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -95,6 +95,14 @@ static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
 	set_iommu_table_base_and_group(&pdev->dev, &phb->p5ioc2.iommu_table);
 }
 
+static const struct pci_controller_ops pnv_pci_p5ioc2_controller_ops = {
+	.dma_dev_setup = pnv_pci_dma_dev_setup,
+#ifdef CONFIG_PCI_MSI
+       .setup_msi_irqs = pnv_setup_msi_irqs,
+       .teardown_msi_irqs = pnv_teardown_msi_irqs,
+#endif
+};
+
 static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
 					   void *tce_mem, u64 tce_size)
 {
@@ -133,7 +141,7 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
 	phb->hose->first_busno = 0;
 	phb->hose->last_busno = 0xff;
 	phb->hose->private_data = phb;
-	phb->hose->controller_ops = pnv_pci_controller_ops;
+	phb->hose->controller_ops = pnv_pci_p5ioc2_controller_ops;
 	phb->hub_id = hub_id;
 	phb->opal_id = phb_id;
 	phb->type = PNV_PHB_P5IOC2;
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 0f04940..dfad2ba 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -45,7 +45,7 @@
 //#define cfg_dbg(fmt...)	printk(fmt)
 
 #ifdef CONFIG_PCI_MSI
-static int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
+int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 {
 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
 	struct pnv_phb *phb = hose->private_data;
@@ -94,7 +94,7 @@ static int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 	return 0;
 }
 
-static void pnv_teardown_msi_irqs(struct pci_dev *pdev)
+void pnv_teardown_msi_irqs(struct pci_dev *pdev)
 {
 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
 	struct pnv_phb *phb = hose->private_data;
@@ -662,7 +662,7 @@ void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 	tbl->it_type = TCE_PCI;
 }
 
-static void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
+void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
 {
 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
 	struct pnv_phb *phb = hose->private_data;
@@ -771,11 +771,3 @@ void __init pnv_pci_init(void)
 }
 
 machine_subsys_initcall_sync(powernv, tce_iommu_bus_notifier_init);
-
-struct pci_controller_ops pnv_pci_controller_ops = {
-	.dma_dev_setup = pnv_pci_dma_dev_setup,
-#ifdef CONFIG_PCI_MSI
-	.setup_msi_irqs = pnv_setup_msi_irqs,
-	.teardown_msi_irqs = pnv_teardown_msi_irqs,
-#endif
-};
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 070ee88..15fde52 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -218,4 +218,8 @@ extern void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
 extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
 extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
 
+extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev);
+extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
+extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
+
 #endif /* __POWERNV_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index f907f0a..addd15c 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -29,8 +29,6 @@ static inline u64 pnv_pci_dma_get_required_mask(struct pci_dev *pdev)
 }
 #endif
 
-extern struct pci_controller_ops pnv_pci_controller_ops;
-
 extern u32 pnv_get_supported_cpuidle_states(void);
 
 extern void pnv_lpc_init(void);
-- 
cgit v0.10.2


From 3405c2570fd68fc5ccc703c8de9c23abf5e95819 Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Tue, 28 Apr 2015 15:12:06 +1000
Subject: powerpc/pci: add dma_set_mask to pci_controller_ops

Some systems only need to deal with DMA masks for PCI devices.
For these systems, we can avoid the need for a platform hook and
instead use a pci controller based hook.

Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index a3b6252..6d17bb8 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -36,6 +36,8 @@ struct pci_controller_ops {
 					  int nvec, int type);
 	void		(*teardown_msi_irqs)(struct pci_dev *dev);
 #endif
+
+	int             (*dma_set_mask)(struct pci_dev *dev, u64 dma_mask);
 };
 
 /*
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index 484b2d4..35e4dcc 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -248,6 +248,14 @@ int dma_set_mask(struct device *dev, u64 dma_mask)
 {
 	if (ppc_md.dma_set_mask)
 		return ppc_md.dma_set_mask(dev, dma_mask);
+
+	if (dev_is_pci(dev)) {
+		struct pci_dev *pdev = to_pci_dev(dev);
+		struct pci_controller *phb = pci_bus_to_host(pdev->bus);
+		if (phb->controller_ops.dma_set_mask)
+			return phb->controller_ops.dma_set_mask(pdev, dma_mask);
+	}
+
 	return __dma_set_mask(dev, dma_mask);
 }
 EXPORT_SYMBOL(dma_set_mask);
-- 
cgit v0.10.2


From 763d2d8df1ee2b92ff09cd58f6034021e2cabf6d Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Tue, 28 Apr 2015 15:12:07 +1000
Subject: powerpc/powernv: Move dma_set_mask() from pnv_phb to
 pci_controller_ops

Previously, dma_set_mask() on powernv was convoluted:
 0) Call dma_set_mask() (a/p/kernel/dma.c)
 1) In dma_set_mask(), ppc_md.dma_set_mask() exists, so call it.
 2) On powernv, that function pointer is pnv_dma_set_mask().
    In pnv_dma_set_mask(), the device is pci, so call pnv_pci_dma_set_mask().
 3) In pnv_pci_dma_set_mask(), call pnv_phb->set_dma_mask() if it exists.
 4) It only exists in the ioda case, where it points to
    pnv_pci_ioda_dma_set_mask(), which is the final function.

So the call chain is:
 dma_set_mask() ->
  pnv_dma_set_mask() ->
   pnv_pci_dma_set_mask() ->
    pnv_pci_ioda_dma_set_mask()

Both ppc_md and pnv_phb function pointers are used.

Rip out the ppc_md call, pnv_dma_set_mask() and pnv_pci_dma_set_mask().

Instead:
 0) Call dma_set_mask() (a/p/kernel/dma.c)
 1) In dma_set_mask(), the device is pci, and pci_controller_ops.dma_set_mask()
    exists, so call pci_controller_ops.dma_set_mask()
 2) In the ioda case, that points to pnv_pci_ioda_dma_set_mask().

The new call chain is
 dma_set_mask() ->
  pnv_pci_ioda_dma_set_mask()

Now only the pci_controller_ops function pointer is used.

The fallback paths for p5ioc2 are the same.

Previously, pnv_pci_dma_set_mask() would find no pnv_phb->set_dma_mask()
function, to it would call __set_dma_mask().

Now, dma_set_mask() finds no ppc_md call or pci_controller_ops call,
so it calls __set_dma_mask().

Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 23125f4..508f530 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1601,9 +1601,10 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
 	set_iommu_table_base_and_group(&pdev->dev, pe->tce32_table);
 }
 
-static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
-				     struct pci_dev *pdev, u64 dma_mask)
+static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
 {
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
 	struct pci_dn *pdn = pci_get_pdn(pdev);
 	struct pnv_ioda_pe *pe;
 	uint64_t top;
@@ -2657,6 +2658,7 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
        .enable_device_hook = pnv_pci_enable_device_hook,
        .window_alignment = pnv_pci_window_alignment,
        .reset_secondary_bus = pnv_pci_reset_secondary_bus,
+       .dma_set_mask = pnv_pci_ioda_dma_set_mask,
 };
 
 static void __init pnv_pci_init_ioda_phb(struct device_node *np,
@@ -2802,7 +2804,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 
 	/* Setup TCEs */
 	phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
-	phb->dma_set_mask = pnv_pci_ioda_dma_set_mask;
 	phb->dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask;
 
 	/* Setup shutdown function for kexec */
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index dfad2ba..8a557b5 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -689,16 +689,6 @@ void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
 		phb->dma_dev_setup(phb, pdev);
 }
 
-int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
-{
-	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
-	struct pnv_phb *phb = hose->private_data;
-
-	if (phb && phb->dma_set_mask)
-		return phb->dma_set_mask(phb, pdev, dma_mask);
-	return __dma_set_mask(&pdev->dev, dma_mask);
-}
-
 u64 pnv_pci_dma_get_required_mask(struct pci_dev *pdev)
 {
 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 15fde52..ac8686c 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -106,8 +106,6 @@ struct pnv_phb {
 			 unsigned int hwirq, unsigned int virq,
 			 unsigned int is_64, struct msi_msg *msg);
 	void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev);
-	int (*dma_set_mask)(struct pnv_phb *phb, struct pci_dev *pdev,
-			    u64 dma_mask);
 	u64 (*dma_get_required_mask)(struct pnv_phb *phb,
 				     struct pci_dev *pdev);
 	void (*fixup_phb)(struct pci_controller *hose);
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index addd15c..9269e30 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -12,17 +12,11 @@ struct pci_dev;
 #ifdef CONFIG_PCI
 extern void pnv_pci_init(void);
 extern void pnv_pci_shutdown(void);
-extern int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask);
 extern u64 pnv_pci_dma_get_required_mask(struct pci_dev *pdev);
 #else
 static inline void pnv_pci_init(void) { }
 static inline void pnv_pci_shutdown(void) { }
 
-static inline int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
-{
-	return -ENODEV;
-}
-
 static inline u64 pnv_pci_dma_get_required_mask(struct pci_dev *pdev)
 {
 	return 0;
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 15e9337..53737e0 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -165,13 +165,6 @@ static void pnv_progress(char *s, unsigned short hex)
 {
 }
 
-static int pnv_dma_set_mask(struct device *dev, u64 dma_mask)
-{
-	if (dev_is_pci(dev))
-		return pnv_pci_dma_set_mask(to_pci_dev(dev), dma_mask);
-	return __dma_set_mask(dev, dma_mask);
-}
-
 static u64 pnv_dma_get_required_mask(struct device *dev)
 {
 	if (dev_is_pci(dev))
@@ -321,7 +314,6 @@ define_machine(powernv) {
 	.machine_shutdown	= pnv_shutdown,
 	.power_save             = power7_idle,
 	.calibrate_decr		= generic_calibrate_decr,
-	.dma_set_mask		= pnv_dma_set_mask,
 	.dma_get_required_mask	= pnv_dma_get_required_mask,
 #ifdef CONFIG_KEXEC
 	.kexec_cpu_down		= pnv_kexec_cpu_down,
-- 
cgit v0.10.2


From 09f3f326cda6b6e2dfc6471c7f8ac77696c87419 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 1 Jun 2015 21:11:35 +1000
Subject: powerpc/mm: Fix build break with STRICT_MM_TYPECHECKS &&
 DEBUG_PAGEALLOC

If both STRICT_MM_TYPECHECKS and DEBUG_PAGEALLOC are enabled, the code
in kernel_map_linear_page() is built, and so we fail with:

  arch/powerpc/mm/hash_utils_64.c:1478:2:
  error: incompatible type for argument 1 of 'htab_convert_pte_flags'

Fix it by using pgprot_val().

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index fda236f..a04ca92 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1475,7 +1475,7 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
 	unsigned long hash;
 	unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
 	unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
-	unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL);
+	unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL));
 	long ret;
 
 	hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
-- 
cgit v0.10.2


From 72e349f1124a114435e599479c9b8d14bfd1ebcd Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Tue, 26 May 2015 15:10:24 +1000
Subject: powerpc/perf: Fix book3s kernel to userspace backtraces

When we take a PMU exception or a software event we call
perf_read_regs(). This overloads regs->result with a boolean that
describes if we should use the sampled instruction address register
(SIAR) or the regs.

If the exception is in kernel, we start with the kernel regs and
backtrace through the kernel stack. At this point we switch to the
userspace regs and backtrace the user stack with perf_callchain_user().

Unfortunately these regs have not got the perf_read_regs() treatment,
so regs->result could be anything. If it is non zero,
perf_instruction_pointer() decides to use the SIAR, and we get issues
like this:

0.11%  qemu-system-ppc  [kernel.kallsyms]        [k] _raw_spin_lock_irqsave
       |
       ---_raw_spin_lock_irqsave
          |
          |--52.35%-- 0
          |          |
          |          |--46.39%-- __hrtimer_start_range_ns
          |          |          kvmppc_run_core
          |          |          kvmppc_vcpu_run_hv
          |          |          kvmppc_vcpu_run
          |          |          kvm_arch_vcpu_ioctl_run
          |          |          kvm_vcpu_ioctl
          |          |          do_vfs_ioctl
          |          |          sys_ioctl
          |          |          system_call
          |          |          |
          |          |          |--67.08%-- _raw_spin_lock_irqsave <--- hi mum
          |          |          |          |
          |          |          |           --100.00%-- 0x7e714
          |          |          |                     0x7e714

Notice the bogus _raw_spin_irqsave when we transition from kernel
(system_call) to userspace (0x7e714). We inserted what was in the SIAR.

Add a check in regs_use_siar() to check that the regs in question
are from a PMU exception. With this fix the backtrace makes sense:

     0.47%  qemu-system-ppc  [kernel.vmlinux]         [k] _raw_spin_lock_irqsave
            |
            ---_raw_spin_lock_irqsave
               |
               |--53.83%-- 0
               |          |
               |          |--44.73%-- hrtimer_try_to_cancel
               |          |          kvmppc_start_thread
               |          |          kvmppc_run_core
               |          |          kvmppc_vcpu_run_hv
               |          |          kvmppc_vcpu_run
               |          |          kvm_arch_vcpu_ioctl_run
               |          |          kvm_vcpu_ioctl
               |          |          do_vfs_ioctl
               |          |          sys_ioctl
               |          |          system_call
               |          |          __ioctl
               |          |          0x7e714
               |          |          0x7e714

Cc: stable@vger.kernel.org
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 12b6384..d90893b 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -131,7 +131,16 @@ static void pmao_restore_workaround(bool ebb) { }
 
 static bool regs_use_siar(struct pt_regs *regs)
 {
-	return !!regs->result;
+	/*
+	 * When we take a performance monitor exception the regs are setup
+	 * using perf_read_regs() which overloads some fields, in particular
+	 * regs->result to tell us whether to use SIAR.
+	 *
+	 * However if the regs are from another exception, eg. a syscall, then
+	 * they have not been setup using perf_read_regs() and so regs->result
+	 * is something random.
+	 */
+	return ((TRAP(regs) == 0xf00) && regs->result);
 }
 
 /*
-- 
cgit v0.10.2


From 05b05f28fb3835087b3d4e741fd561b9826fe461 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Tue, 26 May 2015 15:46:54 +1000
Subject: powerpc: Relocatable system call no longer uses the LR

We had some code to restore the LR in the relocatable system call path
back when we used the LR to do an indirect branch.

Commit 6a404806dfce ("powerpc: Avoid link stack corruption in MMU
on syscall entry path") changed this to use the CTR which is volatile
across system calls so does not need restoring.

Remove the stale comment and the restore of the LR.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 9519e6b..cf8c1b5 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -59,14 +59,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
 
 #if defined(CONFIG_RELOCATABLE)
 	/*
-	 * We can't branch directly; in the direct case we use LR
-	 * and system_call_entry restores LR.  (We thus need to move
-	 * LR to r10 in the RFID case too.)
+	 * We can't branch directly so we do it via the CTR which
+	 * is volatile across system calls.
 	 */
 #define SYSCALL_PSERIES_2_DIRECT				\
 	mflr	r10 ;						\
 	ld	r12,PACAKBASE(r13) ; 				\
-	LOAD_HANDLER(r12, system_call_entry_direct) ;		\
+	LOAD_HANDLER(r12, system_call_entry) ;			\
 	mtctr	r12 ;						\
 	mfspr	r12,SPRN_SRR1 ;					\
 	/* Re-use of r13... No spare regs to do this */	\
@@ -80,7 +79,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
 	mfspr	r12,SPRN_SRR1 ;					\
 	li	r10,MSR_RI ;					\
 	mtmsrd 	r10,1 ;			/* Set RI (EE=0) */	\
-	b	system_call_entry_direct ;
+	b	system_call_entry ;
 #endif
 
 /*
@@ -969,13 +968,6 @@ hv_facility_unavailable_relon_trampoline:
 __end_interrupts:
 
 	.align	7
-system_call_entry_direct:
-#if defined(CONFIG_RELOCATABLE)
-	/* The first level prologue may have used LR to get here, saving
-	 * orig in r10.  To save hacking/ifdeffing common code, restore here.
-	 */
-	mtlr	r10
-#endif
 system_call_entry:
 	b	system_call_common
 
-- 
cgit v0.10.2


From d20be433e6a8892ecf59ef62636cd1333975347d Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Tue, 26 May 2015 15:46:55 +1000
Subject: powerpc: Non relocatable system call doesn't need a trampoline

We need to use a trampoline when using LOAD_HANDLER(), because the
destination needs to be in the first 64kB. An absolute branch has
no such limitations, so just jump there.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index cf8c1b5..0a0399c2 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -79,7 +79,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
 	mfspr	r12,SPRN_SRR1 ;					\
 	li	r10,MSR_RI ;					\
 	mtmsrd 	r10,1 ;			/* Set RI (EE=0) */	\
-	b	system_call_entry ;
+	b	system_call_common ;
 #endif
 
 /*
-- 
cgit v0.10.2


From c1231a784acc25ec01d35a019533da9ab8593e2e Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Wed, 20 May 2015 17:59:52 +0800
Subject: powerpc: Use irq_desc_get_xxx() to avoid redundant lookup of irq_desc

Use irq_desc_get_xxx() to avoid redundant lookup of irq_desc while we
already have a pointer to corresponding irq_desc.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
index c949ca0..6301662 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
@@ -193,7 +193,7 @@ static struct irq_chip mpc52xx_gpt_irq_chip = {
 
 void mpc52xx_gpt_irq_cascade(unsigned int virq, struct irq_desc *desc)
 {
-	struct mpc52xx_gpt_priv *gpt = irq_get_handler_data(virq);
+	struct mpc52xx_gpt_priv *gpt = irq_desc_get_handler_data(desc);
 	int sub_virq;
 	u32 status;
 
diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c
index d9b8a44..fe51de4 100644
--- a/arch/powerpc/platforms/cell/axon_msi.c
+++ b/arch/powerpc/platforms/cell/axon_msi.c
@@ -96,7 +96,7 @@ static void msic_dcr_write(struct axon_msic *msic, unsigned int dcr_n, u32 val)
 static void axon_msi_cascade(unsigned int irq, struct irq_desc *desc)
 {
 	struct irq_chip *chip = irq_desc_get_chip(desc);
-	struct axon_msic *msic = irq_get_handler_data(irq);
+	struct axon_msic *msic = irq_desc_get_handler_data(desc);
 	u32 write_offset, msi;
 	int idx;
 	int retry = 0;
diff --git a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c
index c269cae..9dd154d 100644
--- a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c
+++ b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c
@@ -124,7 +124,7 @@ static void hlwd_pic_irq_cascade(unsigned int cascade_virq,
 				      struct irq_desc *desc)
 {
 	struct irq_chip *chip = irq_desc_get_chip(desc);
-	struct irq_domain *irq_domain = irq_get_handler_data(cascade_virq);
+	struct irq_domain *irq_domain = irq_desc_get_handler_data(desc);
 	unsigned int virq;
 
 	raw_spin_lock(&desc->lock);
diff --git a/arch/powerpc/sysdev/uic.c b/arch/powerpc/sysdev/uic.c
index 1cd057f..d773453 100644
--- a/arch/powerpc/sysdev/uic.c
+++ b/arch/powerpc/sysdev/uic.c
@@ -198,7 +198,7 @@ void uic_irq_cascade(unsigned int virq, struct irq_desc *desc)
 {
 	struct irq_chip *chip = irq_desc_get_chip(desc);
 	struct irq_data *idata = irq_desc_get_irq_data(desc);
-	struct uic *uic = irq_get_handler_data(virq);
+	struct uic *uic = irq_desc_get_handler_data(desc);
 	u32 msr;
 	int src;
 	int subvirq;
diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c
index 5bc5889..08c248e 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -227,7 +227,7 @@ void xics_migrate_irqs_away(void)
 
 		/* Locate interrupt server */
 		server = -1;
-		ics = irq_get_chip_data(virq);
+		ics = irq_desc_get_chip_data(desc);
 		if (ics)
 			server = ics->get_server(ics, irq);
 		if (server < 0) {
-- 
cgit v0.10.2


From a1c97df2787f6542257f1fd85e6a9b055e1125be Mon Sep 17 00:00:00 2001
From: Cyril Bur <cyrilbur@gmail.com>
Date: Tue, 26 May 2015 11:36:56 +1000
Subject: powerpc/configs: Merge pseries_defconfig and pseries_le_defconfig

These two configs should be identical with the exception of big or little
endian.

The big endian version has XMON_DEFAULT turned on while the little endian
has XMON_DEFAULT not set. It makes the most sense for defconfigs not to use
xmon by default, production systems should get back up as quickly as
possible, not sit in xmon.

In the event debugging is required, the option can be enabled or xmon=on
can be specified on commandline.

Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index c2e39f6..4da8260 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -297,7 +297,6 @@ CONFIG_CODE_PATCHING_SELFTEST=y
 CONFIG_FTR_FIXUP_SELFTEST=y
 CONFIG_MSI_BITMAP_SELFTEST=y
 CONFIG_XMON=y
-CONFIG_XMON_DEFAULT=y
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
-- 
cgit v0.10.2


From ea4d1a87e6dee541a525c822b0b5da160940fc67 Mon Sep 17 00:00:00 2001
From: Cyril Bur <cyrilbur@gmail.com>
Date: Tue, 26 May 2015 11:36:57 +1000
Subject: powerpc/configs: Replace pseries_le_defconfig with a Makefile target
 using merge_config

Rather than continuing to maintain a copy of pseries_defconfig with
CONFIG_CPU_LITTLE_ENDIAN enabled, use the generic merge_config script
and use an le.config to enable little endian on top of pseries_defconfig
without the need for a duplicated _defconfig file.

This method will require less maintenance in the future and will ensure
that both 'defconfigs' are always in sync.

It is worth noting that the seemingly more simple approach of:

  pseries_le_defconfig: pseries_defconfig
  	$(Q)$(MAKE) le.config

Will not work when building using O=builddir.

The obvious fix to that:

  pseries_le_defconfig:
  	$(Q)$(MAKE) -f $(srctree)/Makefile pseries_defconfig le.config

Also does not work. This is because if we have for example:

config FOO
	depends on CPU_BIG_ENDIAN
	select BAR

Then BAR will be enabled by the first call to kconfig (via
pseries_defconfig), and then will remain enabled after we merge
le.config, even though FOO will have been turned off.

The solution is to ensure to only invoke the kconfig logic once, after
we have merged all the config fragments. This ensures nothing is
select'ed on that should then be disabled by the later merged configs.
This is done through the explicit call to make olddefconfig

Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
Reviewed-by: Samuel Mendoza-Jonas <sam.mj@au1.ibm.com>
[mpe: Massage change log, fix white space and use ARCH not SRCARCH]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index a314cb0..edf39ce 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -269,6 +269,21 @@ bootwrapper_install:
 %.dtb: scripts
 	$(Q)$(MAKE) ARCH=ppc64 $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
 
+# Used to create 'merged defconfigs'
+# To use it $(call) it with the first argument as the base defconfig
+# and the second argument as a space separated list of .config files to merge,
+# without the .config suffix.
+define merge_into_defconfig
+	$(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh \
+		-m -O $(objtree) $(srctree)/arch/$(ARCH)/configs/$(1) \
+		$(foreach config,$(2),$(srctree)/arch/$(ARCH)/configs/$(config).config)
+	+$(Q)$(MAKE) -f $(srctree)/Makefile olddefconfig
+endef
+
+PHONY += pseries_le_defconfig
+pseries_le_defconfig:
+	$(call merge_into_defconfig,pseries_defconfig,le)
+
 define archhelp
   @echo '* zImage          - Build default images selected by kernel config'
   @echo '  zImage.*        - Compressed kernel image (arch/$(ARCH)/boot/zImage.*)'
diff --git a/arch/powerpc/configs/le.config b/arch/powerpc/configs/le.config
new file mode 100644
index 0000000..ee43fdb
--- /dev/null
+++ b/arch/powerpc/configs/le.config
@@ -0,0 +1 @@
+CONFIG_CPU_LITTLE_ENDIAN=y
diff --git a/arch/powerpc/configs/pseries_le_defconfig b/arch/powerpc/configs/pseries_le_defconfig
deleted file mode 100644
index 09bc96e..0000000
--- a/arch/powerpc/configs/pseries_le_defconfig
+++ /dev/null
@@ -1,319 +0,0 @@
-CONFIG_PPC64=y
-CONFIG_SMP=y
-CONFIG_NR_CPUS=2048
-CONFIG_CPU_LITTLE_ENDIAN=y
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_FHANDLE=y
-CONFIG_AUDIT=y
-CONFIG_AUDITSYSCALL=y
-CONFIG_IRQ_DOMAIN_DEBUG=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_TASKSTATS=y
-CONFIG_TASK_DELAY_ACCT=y
-CONFIG_TASK_XACCT=y
-CONFIG_TASK_IO_ACCOUNTING=y
-CONFIG_IKCONFIG=y
-CONFIG_IKCONFIG_PROC=y
-CONFIG_NUMA_BALANCING=y
-CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y
-CONFIG_CGROUPS=y
-CONFIG_CGROUP_FREEZER=y
-CONFIG_CGROUP_DEVICE=y
-CONFIG_CPUSETS=y
-CONFIG_CGROUP_CPUACCT=y
-CONFIG_MEMCG=y
-CONFIG_MEMCG_SWAP=y
-CONFIG_CGROUP_PERF=y
-CONFIG_CGROUP_SCHED=y
-CONFIG_USER_NS=y
-CONFIG_BLK_DEV_INITRD=y
-# CONFIG_COMPAT_BRK is not set
-CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
-CONFIG_KPROBES=y
-CONFIG_JUMP_LABEL=y
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODVERSIONS=y
-CONFIG_MODULE_SRCVERSION_ALL=y
-CONFIG_PARTITION_ADVANCED=y
-CONFIG_PPC_SPLPAR=y
-CONFIG_SCANLOG=m
-CONFIG_PPC_SMLPAR=y
-CONFIG_DTL=y
-# CONFIG_PPC_PMAC is not set
-CONFIG_RTAS_FLASH=m
-CONFIG_IBMEBUS=y
-CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
-CONFIG_HZ_100=y
-CONFIG_BINFMT_MISC=m
-CONFIG_PPC_TRANSACTIONAL_MEM=y
-CONFIG_KEXEC=y
-CONFIG_IRQ_ALL_CPUS=y
-CONFIG_MEMORY_HOTPLUG=y
-CONFIG_MEMORY_HOTREMOVE=y
-CONFIG_KSM=y
-CONFIG_TRANSPARENT_HUGEPAGE=y
-CONFIG_PPC_64K_PAGES=y
-CONFIG_PPC_SUBPAGE_PROT=y
-CONFIG_SCHED_SMT=y
-CONFIG_HOTPLUG_PCI=y
-CONFIG_HOTPLUG_PCI_RPA=m
-CONFIG_HOTPLUG_PCI_RPA_DLPAR=m
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_XFRM_USER=m
-CONFIG_NET_KEY=m
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_NET_IPIP=y
-CONFIG_SYN_COOKIES=y
-CONFIG_INET_AH=m
-CONFIG_INET_ESP=m
-CONFIG_INET_IPCOMP=m
-# CONFIG_IPV6 is not set
-CONFIG_NETFILTER=y
-# CONFIG_NETFILTER_ADVANCED is not set
-CONFIG_BRIDGE=m
-CONFIG_VLAN_8021Q=m
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-CONFIG_DEVTMPFS=y
-CONFIG_DEVTMPFS_MOUNT=y
-CONFIG_PARPORT=m
-CONFIG_PARPORT_PC=m
-CONFIG_BLK_DEV_FD=m
-CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=y
-CONFIG_BLK_DEV_RAM_SIZE=65536
-CONFIG_VIRTIO_BLK=m
-CONFIG_IDE=y
-CONFIG_BLK_DEV_IDECD=y
-CONFIG_BLK_DEV_GENERIC=y
-CONFIG_BLK_DEV_AMD74XX=y
-CONFIG_BLK_DEV_SD=y
-CONFIG_CHR_DEV_ST=y
-CONFIG_BLK_DEV_SR=y
-CONFIG_BLK_DEV_SR_VENDOR=y
-CONFIG_CHR_DEV_SG=y
-CONFIG_SCSI_CONSTANTS=y
-CONFIG_SCSI_FC_ATTRS=y
-CONFIG_SCSI_CXGB3_ISCSI=m
-CONFIG_SCSI_CXGB4_ISCSI=m
-CONFIG_SCSI_BNX2_ISCSI=m
-CONFIG_BE2ISCSI=m
-CONFIG_SCSI_MPT2SAS=m
-CONFIG_SCSI_IBMVSCSI=y
-CONFIG_SCSI_IBMVFC=m
-CONFIG_SCSI_SYM53C8XX_2=y
-CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0
-CONFIG_SCSI_IPR=y
-CONFIG_SCSI_QLA_FC=m
-CONFIG_SCSI_QLA_ISCSI=m
-CONFIG_SCSI_LPFC=m
-CONFIG_SCSI_VIRTIO=m
-CONFIG_SCSI_DH=m
-CONFIG_SCSI_DH_RDAC=m
-CONFIG_SCSI_DH_ALUA=m
-CONFIG_ATA=y
-CONFIG_SATA_AHCI=y
-# CONFIG_ATA_SFF is not set
-CONFIG_MD=y
-CONFIG_BLK_DEV_MD=y
-CONFIG_MD_LINEAR=y
-CONFIG_MD_RAID0=y
-CONFIG_MD_RAID1=y
-CONFIG_MD_RAID10=m
-CONFIG_MD_RAID456=m
-CONFIG_MD_MULTIPATH=m
-CONFIG_MD_FAULTY=m
-CONFIG_BLK_DEV_DM=y
-CONFIG_DM_CRYPT=m
-CONFIG_DM_SNAPSHOT=m
-CONFIG_DM_THIN_PROVISIONING=m
-CONFIG_DM_MIRROR=m
-CONFIG_DM_ZERO=m
-CONFIG_DM_MULTIPATH=m
-CONFIG_DM_MULTIPATH_QL=m
-CONFIG_DM_MULTIPATH_ST=m
-CONFIG_DM_UEVENT=y
-CONFIG_BONDING=m
-CONFIG_DUMMY=m
-CONFIG_MACVLAN=m
-CONFIG_MACVTAP=m
-CONFIG_VXLAN=m
-CONFIG_NETCONSOLE=y
-CONFIG_TUN=m
-CONFIG_VETH=m
-CONFIG_VIRTIO_NET=m
-CONFIG_VHOST_NET=m
-CONFIG_VORTEX=y
-CONFIG_ACENIC=m
-CONFIG_ACENIC_OMIT_TIGON_I=y
-CONFIG_PCNET32=y
-CONFIG_TIGON3=y
-CONFIG_CHELSIO_T1=m
-CONFIG_BE2NET=m
-CONFIG_S2IO=m
-CONFIG_IBMVETH=y
-CONFIG_EHEA=y
-CONFIG_E100=y
-CONFIG_E1000=y
-CONFIG_E1000E=y
-CONFIG_IXGB=m
-CONFIG_IXGBE=m
-CONFIG_MLX4_EN=m
-CONFIG_MYRI10GE=m
-CONFIG_QLGE=m
-CONFIG_NETXEN_NIC=m
-CONFIG_PPP=m
-CONFIG_PPP_BSDCOMP=m
-CONFIG_PPP_DEFLATE=m
-CONFIG_PPPOE=m
-CONFIG_PPP_ASYNC=m
-CONFIG_PPP_SYNC_TTY=m
-# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
-CONFIG_INPUT_EVDEV=m
-CONFIG_INPUT_MISC=y
-CONFIG_INPUT_PCSPKR=m
-# CONFIG_SERIO_SERPORT is not set
-CONFIG_DEVPTS_MULTIPLE_INSTANCES=y
-CONFIG_SERIAL_8250=y
-CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_SERIAL_ICOM=m
-CONFIG_SERIAL_JSM=m
-CONFIG_HVC_CONSOLE=y
-CONFIG_HVC_RTAS=y
-CONFIG_HVCS=m
-CONFIG_VIRTIO_CONSOLE=m
-CONFIG_IBM_BSR=m
-CONFIG_GEN_RTC=y
-CONFIG_RAW_DRIVER=y
-CONFIG_MAX_RAW_DEVS=1024
-CONFIG_FB=y
-CONFIG_FIRMWARE_EDID=y
-CONFIG_FB_OF=y
-CONFIG_FB_MATROX=y
-CONFIG_FB_MATROX_MILLENIUM=y
-CONFIG_FB_MATROX_MYSTIQUE=y
-CONFIG_FB_MATROX_G=y
-CONFIG_FB_RADEON=y
-CONFIG_FB_IBM_GXT4500=y
-CONFIG_LCD_PLATFORM=m
-# CONFIG_VGA_CONSOLE is not set
-CONFIG_FRAMEBUFFER_CONSOLE=y
-CONFIG_LOGO=y
-CONFIG_HID_GYRATION=y
-CONFIG_HID_PANTHERLORD=y
-CONFIG_HID_PETALYNX=y
-CONFIG_HID_SAMSUNG=y
-CONFIG_HID_SUNPLUS=y
-CONFIG_USB_HIDDEV=y
-CONFIG_USB=y
-CONFIG_USB_MON=m
-CONFIG_USB_EHCI_HCD=y
-# CONFIG_USB_EHCI_HCD_PPC_OF is not set
-CONFIG_USB_OHCI_HCD=y
-CONFIG_USB_STORAGE=m
-CONFIG_INFINIBAND=m
-CONFIG_INFINIBAND_USER_MAD=m
-CONFIG_INFINIBAND_USER_ACCESS=m
-CONFIG_INFINIBAND_MTHCA=m
-CONFIG_INFINIBAND_EHCA=m
-CONFIG_INFINIBAND_CXGB3=m
-CONFIG_INFINIBAND_CXGB4=m
-CONFIG_MLX4_INFINIBAND=m
-CONFIG_INFINIBAND_IPOIB=m
-CONFIG_INFINIBAND_IPOIB_CM=y
-CONFIG_INFINIBAND_SRP=m
-CONFIG_INFINIBAND_ISER=m
-CONFIG_VIRTIO_PCI=m
-CONFIG_VIRTIO_BALLOON=m
-CONFIG_EXT2_FS=y
-CONFIG_EXT2_FS_XATTR=y
-CONFIG_EXT2_FS_POSIX_ACL=y
-CONFIG_EXT2_FS_SECURITY=y
-CONFIG_EXT2_FS_XIP=y
-CONFIG_EXT3_FS=y
-CONFIG_EXT3_FS_POSIX_ACL=y
-CONFIG_EXT3_FS_SECURITY=y
-CONFIG_EXT4_FS=y
-CONFIG_EXT4_FS_POSIX_ACL=y
-CONFIG_EXT4_FS_SECURITY=y
-CONFIG_REISERFS_FS=y
-CONFIG_REISERFS_FS_XATTR=y
-CONFIG_REISERFS_FS_POSIX_ACL=y
-CONFIG_REISERFS_FS_SECURITY=y
-CONFIG_JFS_FS=m
-CONFIG_JFS_POSIX_ACL=y
-CONFIG_JFS_SECURITY=y
-CONFIG_XFS_FS=m
-CONFIG_XFS_POSIX_ACL=y
-CONFIG_BTRFS_FS=m
-CONFIG_BTRFS_FS_POSIX_ACL=y
-CONFIG_NILFS2_FS=m
-CONFIG_AUTOFS4_FS=m
-CONFIG_FUSE_FS=m
-CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=y
-CONFIG_UDF_FS=m
-CONFIG_MSDOS_FS=y
-CONFIG_VFAT_FS=y
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_TMPFS_POSIX_ACL=y
-CONFIG_HUGETLBFS=y
-CONFIG_CRAMFS=m
-CONFIG_SQUASHFS=m
-CONFIG_SQUASHFS_XATTR=y
-CONFIG_SQUASHFS_LZO=y
-CONFIG_SQUASHFS_XZ=y
-CONFIG_PSTORE=y
-CONFIG_NFS_FS=y
-CONFIG_NFS_V3_ACL=y
-CONFIG_NFS_V4=y
-CONFIG_NFSD=m
-CONFIG_NFSD_V3_ACL=y
-CONFIG_NFSD_V4=y
-CONFIG_CIFS=m
-CONFIG_CIFS_XATTR=y
-CONFIG_CIFS_POSIX=y
-CONFIG_NLS_DEFAULT="utf8"
-CONFIG_NLS_CODEPAGE_437=y
-CONFIG_NLS_ASCII=y
-CONFIG_NLS_ISO8859_1=y
-CONFIG_NLS_UTF8=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_KERNEL=y
-CONFIG_DEBUG_STACK_USAGE=y
-CONFIG_DEBUG_STACKOVERFLOW=y
-CONFIG_LOCKUP_DETECTOR=y
-CONFIG_LATENCYTOP=y
-CONFIG_SCHED_TRACER=y
-CONFIG_BLK_DEV_IO_TRACE=y
-CONFIG_CODE_PATCHING_SELFTEST=y
-CONFIG_FTR_FIXUP_SELFTEST=y
-CONFIG_MSI_BITMAP_SELFTEST=y
-CONFIG_XMON=y
-CONFIG_CRYPTO_TEST=m
-CONFIG_CRYPTO_PCBC=m
-CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_TGR192=m
-CONFIG_CRYPTO_WP512=m
-CONFIG_CRYPTO_ANUBIS=m
-CONFIG_CRYPTO_BLOWFISH=m
-CONFIG_CRYPTO_CAST6=m
-CONFIG_CRYPTO_KHAZAD=m
-CONFIG_CRYPTO_SALSA20=m
-CONFIG_CRYPTO_SERPENT=m
-CONFIG_CRYPTO_TEA=m
-CONFIG_CRYPTO_TWOFISH=m
-CONFIG_CRYPTO_LZO=m
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
-CONFIG_VIRTUALIZATION=y
-CONFIG_KVM_BOOK3S_64=m
-CONFIG_KVM_BOOK3S_64_HV=m
-- 
cgit v0.10.2


From 989898b707f74716ec27a4ae2b57e404ca54cbd2 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 1 Jun 2015 19:44:17 +1000
Subject: selftests/powerpc: Add install support to more powerpc tests

These tests were merged in parallel to the install support, update them
now to use it.

This also adds cross compile support for the VPHN test which was missing
it.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/tools/testing/selftests/powerpc/switch_endian/Makefile b/tools/testing/selftests/powerpc/switch_endian/Makefile
index 081473db..e21d106 100644
--- a/tools/testing/selftests/powerpc/switch_endian/Makefile
+++ b/tools/testing/selftests/powerpc/switch_endian/Makefile
@@ -1,9 +1,8 @@
-CC := $(CROSS_COMPILE)gcc
-PROGS := switch_endian_test
+TEST_PROGS := switch_endian_test
 
 ASFLAGS += -O2 -Wall -g -nostdlib -m64
 
-all: $(PROGS)
+all: $(TEST_PROGS)
 
 switch_endian_test: check-reversed.S
 
@@ -13,12 +12,7 @@ check-reversed.o: check.o
 check-reversed.S: check-reversed.o
 	hexdump -v -e '/1 ".byte 0x%02X\n"' $< > $@
 
-run_tests: all
-	@-for PROG in $(PROGS); do \
-		./$$PROG; \
-	done;
+include ../../lib.mk
 
 clean:
-	rm -f $(PROGS) *.o check-reversed.S
-
-.PHONY: all run_tests clean
+	rm -f $(TEST_PROGS) *.o check-reversed.S
diff --git a/tools/testing/selftests/powerpc/vphn/Makefile b/tools/testing/selftests/powerpc/vphn/Makefile
index e539f77..a485f2e 100644
--- a/tools/testing/selftests/powerpc/vphn/Makefile
+++ b/tools/testing/selftests/powerpc/vphn/Makefile
@@ -1,15 +1,12 @@
-PROG := test-vphn
+TEST_PROGS := test-vphn
 
 CFLAGS += -m64
 
-all: $(PROG)
+all: $(TEST_PROGS)
 
-$(PROG): ../harness.c
+$(TEST_PROGS): ../harness.c
 
-run_tests: all
-	./$(PROG)
+include ../../lib.mk
 
 clean:
-	rm -f $(PROG)
-
-.PHONY: all run_tests clean
+	rm -f $(TEST_PROGS)
-- 
cgit v0.10.2


From fb326e9841b2000a8d1b115ccd7f5b3b61a4d123 Mon Sep 17 00:00:00 2001
From: Igal Liberman <Igal.Liberman@freescale.com>
Date: Mon, 12 Jan 2015 08:03:57 +0200
Subject: powerpc/dts: Unify B4 mux nodes

Signed-off-by: Igal Liberman <Igal.Liberman@freescale.com>
Change-Id: Ic5f28f7b492b708f00a5ff74dda723ce5e1da0ba
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/boot/dts/fsl/b4420si-post.dtsi b/arch/powerpc/boot/dts/fsl/b4420si-post.dtsi
index 86161ae..1ea8602 100644
--- a/arch/powerpc/boot/dts/fsl/b4420si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/b4420si-post.dtsi
@@ -80,20 +80,9 @@
 		compatible = "fsl,b4420-device-config", "fsl,qoriq-device-config-2.0";
 	};
 
-/include/ "qoriq-clockgen2.dtsi"
 	global-utilities@e1000 {
-		compatible = "fsl,b4420-clockgen", "fsl,qoriq-clockgen-2.0";
-
-		mux0: mux0@0 {
-			#clock-cells = <0>;
-			reg = <0x0 0x4>;
-			compatible = "fsl,qoriq-core-mux-2.0";
-			clocks = <&pll0 0>, <&pll0 1>, <&pll0 2>,
-				<&pll1 0>, <&pll1 1>, <&pll1 2>;
-			clock-names = "pll0", "pll0-div2", "pll0-div4",
-				"pll1", "pll1-div2", "pll1-div4";
-			clock-output-names = "cmux0";
-		};
+		compatible = "fsl,b4420-clockgen", "fsl,b4-clockgen",
+			      "fsl,qoriq-clockgen-2.0";
 	};
 
 	rcpm: global-utilities@e2000 {
diff --git a/arch/powerpc/boot/dts/fsl/b4860si-post.dtsi b/arch/powerpc/boot/dts/fsl/b4860si-post.dtsi
index f35e9e0..68b9a05 100644
--- a/arch/powerpc/boot/dts/fsl/b4860si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/b4860si-post.dtsi
@@ -182,20 +182,9 @@
 		compatible = "fsl,b4860-device-config", "fsl,qoriq-device-config-2.0";
 	};
 
-/include/ "qoriq-clockgen2.dtsi"
 	global-utilities@e1000 {
-		compatible = "fsl,b4860-clockgen", "fsl,qoriq-clockgen-2.0";
-
-		mux0: mux0@0 {
-			#clock-cells = <0>;
-			reg = <0x0 0x4>;
-			compatible = "fsl,qoriq-core-mux-2.0";
-			clocks = <&pll0 0>, <&pll0 1>, <&pll0 2>,
-				<&pll1 0>, <&pll1 1>, <&pll1 2>;
-			clock-names = "pll0", "pll0-div2", "pll0-div4",
-				"pll1", "pll1-div2", "pll1-div4";
-			clock-output-names = "cmux0";
-		};
+		compatible = "fsl,b4860-clockgen", "fsl,b4-clockgen",
+			      "fsl,qoriq-clockgen-2.0";
 	};
 
 	rcpm: global-utilities@e2000 {
diff --git a/arch/powerpc/boot/dts/fsl/b4si-post.dtsi b/arch/powerpc/boot/dts/fsl/b4si-post.dtsi
index 73136c0..c6bab55 100644
--- a/arch/powerpc/boot/dts/fsl/b4si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/b4si-post.dtsi
@@ -296,9 +296,21 @@
 		fsl,liodn-bits = <12>;
 	};
 
+/include/ "qoriq-clockgen2.dtsi"
 	clockgen: global-utilities@e1000 {
 		compatible = "fsl,b4-clockgen", "fsl,qoriq-clockgen-2.0";
 		reg = <0xe1000 0x1000>;
+
+		mux0: mux0@0 {
+			#clock-cells = <0>;
+			reg = <0x0 0x4>;
+			compatible = "fsl,qoriq-core-mux-2.0";
+			clocks = <&pll0 0>, <&pll0 1>, <&pll0 2>,
+				<&pll1 0>, <&pll1 1>, <&pll1 2>;
+			clock-names = "pll0", "pll0-div2", "pll0-div4",
+				"pll1", "pll1-div2", "pll1-div4";
+			clock-output-names = "cmux0";
+		};
 	};
 
 	rcpm: global-utilities@e2000 {
-- 
cgit v0.10.2


From c89ca8ab7457d891abdffca7e6f5e74cef670d6d Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Thu, 2 Apr 2015 22:14:11 -0500
Subject: powerpc/e6500: Optimize hugepage TLB misses

Some workloads take a lot of TLB misses despite using traditional
hugepages.  Handle these TLB misses in the asm fastpath rather than
going through a bunch of C code.  With this patch I measured around a
5x speedup in handling hugepage TLB misses.

Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index 89bf95b..765b419 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -398,18 +398,18 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT)
 	rldicl	r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3
 	clrrdi	r15,r15,3
 	cmpdi	cr0,r14,0
-	bge	tlb_miss_fault_e6500	/* Bad pgd entry or hugepage; bail */
+	bge	tlb_miss_huge_e6500	/* Bad pgd entry or hugepage; bail */
 	ldx	r14,r14,r15		/* grab pud entry */
 
 	rldicl	r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3
 	clrrdi	r15,r15,3
 	cmpdi	cr0,r14,0
-	bge	tlb_miss_fault_e6500
+	bge	tlb_miss_huge_e6500
 	ldx	r14,r14,r15		/* Grab pmd entry */
 
 	mfspr	r10,SPRN_MAS0
 	cmpdi	cr0,r14,0
-	bge	tlb_miss_fault_e6500
+	bge	tlb_miss_huge_e6500
 
 	/* Now we build the MAS for a 2M indirect page:
 	 *
@@ -428,6 +428,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT)
 	clrrdi	r15,r16,21		/* make EA 2M-aligned */
 	mtspr	SPRN_MAS2,r15
 
+tlb_miss_huge_done_e6500:
 	lbz	r15,TCD_ESEL_NEXT(r11)
 	lbz	r16,TCD_ESEL_MAX(r11)
 	lbz	r14,TCD_ESEL_FIRST(r11)
@@ -456,6 +457,50 @@ END_FTR_SECTION_IFSET(CPU_FTR_SMT)
 	tlb_epilog_bolted
 	rfi
 
+tlb_miss_huge_e6500:
+	beq	tlb_miss_fault_e6500
+	li	r10,1
+	andi.	r15,r14,HUGEPD_SHIFT_MASK@l /* r15 = psize */
+	rldimi	r14,r10,63,0		/* Set PD_HUGE */
+	xor	r14,r14,r15		/* Clear size bits */
+	ldx	r14,0,r14
+
+	/*
+	 * Now we build the MAS for a huge page.
+	 *
+	 * MAS 0   :	ESEL needs to be filled by software round-robin
+	 *		 - can be handled by indirect code
+	 * MAS 1   :	Need to clear IND and set TSIZE
+	 * MAS 2,3+7:	Needs to be redone similar to non-tablewalk handler
+	 */
+
+	subi	r15,r15,10		/* Convert psize to tsize */
+	mfspr	r10,SPRN_MAS1
+	rlwinm	r10,r10,0,~MAS1_IND
+	rlwimi	r10,r15,MAS1_TSIZE_SHIFT,MAS1_TSIZE_MASK
+	mtspr	SPRN_MAS1,r10
+
+	li	r10,-0x400
+	sld	r15,r10,r15		/* Generate mask based on size */
+	and	r10,r16,r15
+	rldicr	r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
+	rlwimi	r10,r14,32-19,27,31	/* Insert WIMGE */
+	clrldi	r15,r15,PAGE_SHIFT	/* Clear crap at the top */
+	rlwimi	r15,r14,32-8,22,25	/* Move in U bits */
+	mtspr	SPRN_MAS2,r10
+	andi.	r10,r14,_PAGE_DIRTY
+	rlwimi	r15,r14,32-2,26,31	/* Move in BAP bits */
+
+	/* Mask out SW and UW if !DIRTY (XXX optimize this !) */
+	bne	1f
+	li	r10,MAS3_SW|MAS3_UW
+	andc	r15,r15,r10
+1:
+	mtspr	SPRN_MAS7_MAS3,r15
+
+	mfspr	r10,SPRN_MAS0
+	b	tlb_miss_huge_done_e6500
+
 tlb_miss_kernel_e6500:
 	ld	r14,PACA_KERNELPGD(r13)
 	cmpldi	cr1,r15,8		/* Check for vmalloc region */
-- 
cgit v0.10.2


From 86d63363defc0b2f22288b287f75837d48b561fc Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Mon, 6 Apr 2015 22:00:38 -0500
Subject: powerpc/e500mc: Remove dead L2 flushing code in idle_e500.S

This code can never be executed as it is only built when
CONFIG_PPC_E500MC is unset, but the only CPUs that have CPU_FTR_L2CSR
require CONFIG_PPC_E500MC and do not have the MSR/HID0-based nap
mechanism that this file uses.

Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/kernel/idle_e500.S b/arch/powerpc/kernel/idle_e500.S
index 1544866..b9b6ef5 100644
--- a/arch/powerpc/kernel/idle_e500.S
+++ b/arch/powerpc/kernel/idle_e500.S
@@ -58,15 +58,6 @@ BEGIN_FTR_SECTION
 	mtlr	r0
 	lis	r3,HID0_NAP@h
 END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
-BEGIN_FTR_SECTION
-	msync
-	li	r7,L2CSR0_L2FL@l
-	mtspr	SPRN_L2CSR0,r7
-2:
-	mfspr	r7,SPRN_L2CSR0
-	andi.	r4,r7,L2CSR0_L2FL@l
-	bne	2b
-END_FTR_SECTION_IFSET(CPU_FTR_L2CSR|CPU_FTR_CAN_NAP)
 1:
 	/* Go to NAP or DOZE now */
 	mfspr	r4,SPRN_HID0
-- 
cgit v0.10.2


From ec66a97d153306cd35250da419f7f8ead0bc142f Mon Sep 17 00:00:00 2001
From: Shengzhou Liu <Shengzhou.Liu@freescale.com>
Date: Wed, 15 Apr 2015 11:48:08 +0800
Subject: powerpc/fsl-booke: Add device tree support for T1024/T1023 SoC

The T1024 SoC includes the following function and features:
- Two 64-bit Power architecture e5500 cores, up to 1.4GHz
- private 256KB L2 cache each core and shared 256KB CoreNet platform cache (CPC)
- 32-/64-bit DDR3L/DDR4 SDRAM memory controller with ECC and interleaving support
- Data Path Acceleration Architecture (DPAA) incorporating acceleration
- Four MAC for 1G/2.5G/10G network interfaces (RGMII, SGMII, QSGMII, XFI)
- High-speed peripheral interfaces
  - Three PCI Express 2.0 controllers
- Additional peripheral interfaces
  - One SATA 2.0 controller
  - Two USB 2.0 controllers with integrated PHY
  - Enhanced secure digital host controller (SD/eSDHC/eMMC)
  - Enhanced serial peripheral interface (eSPI)
  - Four I2C controllers
  - Four 2-pin UARTs or two 4-pin UARTs
  - Integrated Flash Controller supporting NAND and NOR flash
- Two 8-channel DMA engines
- Multicore programmable interrupt controller (PIC)
- LCD interface (DIU) with 12 bit dual data rate
- QUICC Engine block supporting TDM, HDLC, and UART
- Deep Sleep power implementaion (wakeup from GPIO/Timer/Ethernet/USB)
- Support for hardware virtualization and partitioning enforcement
- QorIQ Platform's Trust Architecture 2.0

Signed-off-by: Shengzhou Liu <Shengzhou.Liu@freescale.com>
[scottwood@freescale.com: whitespace fixes]
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi
new file mode 100644
index 0000000..dbe6578
--- /dev/null
+++ b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi
@@ -0,0 +1,330 @@
+/*
+ * T1023 Silicon/SoC Device Tree Source (post include)
+ *
+ * Copyright 2014 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *	 notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *	 notice, this list of conditions and the following disclaimer in the
+ *	 documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *	 names of its contributors may be used to endorse or promote products
+ *	 derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+&ifc {
+	#address-cells = <2>;
+	#size-cells = <1>;
+	compatible = "fsl,ifc", "simple-bus";
+	interrupts = <25 2 0 0>;
+};
+
+&pci0 {
+	compatible = "fsl,t1023-pcie", "fsl,qoriq-pcie-v2.4", "fsl,qoriq-pcie";
+	device_type = "pci";
+	#size-cells = <2>;
+	#address-cells = <3>;
+	bus-range = <0x0 0xff>;
+	interrupts = <20 2 0 0>;
+	fsl,iommu-parent = <&pamu0>;
+	pcie@0 {
+		reg = <0 0 0 0 0>;
+		#interrupt-cells = <1>;
+		#size-cells = <2>;
+		#address-cells = <3>;
+		device_type = "pci";
+		interrupts = <20 2 0 0>;
+		interrupt-map-mask = <0xf800 0 0 7>;
+		interrupt-map = <
+			/* IDSEL 0x0 */
+			0000 0 0 1 &mpic 40 1 0 0
+			0000 0 0 2 &mpic 1 1 0 0
+			0000 0 0 3 &mpic 2 1 0 0
+			0000 0 0 4 &mpic 3 1 0 0
+			>;
+	};
+};
+
+&pci1 {
+	compatible = "fsl,t1023-pcie", "fsl,qoriq-pcie-v2.4", "fsl,qoriq-pcie";
+	device_type = "pci";
+	#size-cells = <2>;
+	#address-cells = <3>;
+	bus-range = <0 0xff>;
+	interrupts = <21 2 0 0>;
+	fsl,iommu-parent = <&pamu0>;
+	pcie@0 {
+		reg = <0 0 0 0 0>;
+		#interrupt-cells = <1>;
+		#size-cells = <2>;
+		#address-cells = <3>;
+		device_type = "pci";
+		interrupts = <21 2 0 0>;
+		interrupt-map-mask = <0xf800 0 0 7>;
+		interrupt-map = <
+			/* IDSEL 0x0 */
+			0000 0 0 1 &mpic 41 1 0 0
+			0000 0 0 2 &mpic 5 1 0 0
+			0000 0 0 3 &mpic 6 1 0 0
+			0000 0 0 4 &mpic 7 1 0 0
+			>;
+	};
+};
+
+&pci2 {
+	compatible = "fsl,t1023-pcie", "fsl,qoriq-pcie-v2.4", "fsl,qoriq-pcie";
+	device_type = "pci";
+	#size-cells = <2>;
+	#address-cells = <3>;
+	bus-range = <0x0 0xff>;
+	interrupts = <22 2 0 0>;
+	fsl,iommu-parent = <&pamu0>;
+	pcie@0 {
+		reg = <0 0 0 0 0>;
+		#interrupt-cells = <1>;
+		#size-cells = <2>;
+		#address-cells = <3>;
+		device_type = "pci";
+		interrupts = <22 2 0 0>;
+		interrupt-map-mask = <0xf800 0 0 7>;
+		interrupt-map = <
+			/* IDSEL 0x0 */
+			0000 0 0 1 &mpic 42 1 0 0
+			0000 0 0 2 &mpic 9 1 0 0
+			0000 0 0 3 &mpic 10 1 0 0
+			0000 0 0 4 &mpic 11 1 0 0
+			>;
+	};
+};
+
+&dcsr {
+	#address-cells = <1>;
+	#size-cells = <1>;
+	compatible = "fsl,dcsr", "simple-bus";
+
+	dcsr-epu@0 {
+		compatible = "fsl,t1023-dcsr-epu", "fsl,dcsr-epu";
+		interrupts = <52 2 0 0
+			      84 2 0 0
+			      85 2 0 0>;
+		reg = <0x0 0x1000>;
+	};
+	dcsr-npc {
+		compatible = "fsl,t1023-dcsr-cnpc", "fsl,dcsr-cnpc";
+		reg = <0x1000 0x1000 0x1002000 0x10000>;
+	};
+	dcsr-nxc@2000 {
+		compatible = "fsl,dcsr-nxc";
+		reg = <0x2000 0x1000>;
+	};
+	dcsr-corenet {
+		compatible = "fsl,dcsr-corenet";
+		reg = <0x8000 0x1000 0x1A000 0x1000>;
+	};
+	dcsr-ocn@11000 {
+		compatible = "fsl,t1023-dcsr-ocn", "fsl,dcsr-ocn";
+		reg = <0x11000 0x1000>;
+	};
+	dcsr-ddr@12000 {
+		compatible = "fsl,dcsr-ddr";
+		dev-handle = <&ddr1>;
+		reg = <0x12000 0x1000>;
+	};
+	dcsr-nal@18000 {
+		compatible = "fsl,t1023-dcsr-nal", "fsl,dcsr-nal";
+		reg = <0x18000 0x1000>;
+	};
+	dcsr-rcpm@22000 {
+		compatible = "fsl,t1023-dcsr-rcpm", "fsl,dcsr-rcpm";
+		reg = <0x22000 0x1000>;
+	};
+	dcsr-snpc@30000 {
+		compatible = "fsl,t1023-dcsr-snpc", "fsl,dcsr-snpc";
+		reg = <0x30000 0x1000 0x1022000 0x10000>;
+	};
+	dcsr-snpc@31000 {
+		compatible = "fsl,t1023-dcsr-snpc", "fsl,dcsr-snpc";
+		reg = <0x31000 0x1000 0x1042000 0x10000>;
+	};
+	dcsr-cpu-sb-proxy@100000 {
+		compatible = "fsl,dcsr-e5500-sb-proxy", "fsl,dcsr-cpu-sb-proxy";
+		cpu-handle = <&cpu0>;
+		reg = <0x100000 0x1000 0x101000 0x1000>;
+	};
+	dcsr-cpu-sb-proxy@108000 {
+		compatible = "fsl,dcsr-e5500-sb-proxy", "fsl,dcsr-cpu-sb-proxy";
+		cpu-handle = <&cpu1>;
+		reg = <0x108000 0x1000 0x109000 0x1000>;
+	};
+};
+
+&soc {
+	#address-cells = <1>;
+	#size-cells = <1>;
+	device_type = "soc";
+	compatible = "simple-bus";
+
+	soc-sram-error {
+		compatible = "fsl,soc-sram-error";
+		interrupts = <16 2 1 29>;
+	};
+
+	corenet-law@0 {
+		compatible = "fsl,corenet-law";
+		reg = <0x0 0x1000>;
+		fsl,num-laws = <16>;
+	};
+
+	ddr1: memory-controller@8000 {
+		compatible = "fsl,qoriq-memory-controller-v5.0",
+				"fsl,qoriq-memory-controller";
+		reg = <0x8000 0x1000>;
+		interrupts = <16 2 1 23>;
+	};
+
+	cpc: l3-cache-controller@10000 {
+		compatible = "fsl,t1023-l3-cache-controller", "cache";
+		reg = <0x10000 0x1000>;
+		interrupts = <16 2 1 27>;
+	};
+
+	corenet-cf@18000 {
+		compatible = "fsl,corenet2-cf";
+		reg = <0x18000 0x1000>;
+		interrupts = <16 2 1 31>;
+	};
+
+	iommu@20000 {
+		compatible = "fsl,pamu-v1.0", "fsl,pamu";
+		reg = <0x20000 0x1000>;
+		ranges = <0 0x20000 0x1000>;
+		#address-cells = <1>;
+		#size-cells = <1>;
+		interrupts = <
+			24 2 0 0
+			16 2 1 30>;
+		pamu0: pamu@0 {
+			reg = <0 0x1000>;
+			fsl,primary-cache-geometry = <128 1>;
+			fsl,secondary-cache-geometry = <32 2>;
+		};
+	};
+
+/include/ "qoriq-mpic.dtsi"
+
+	guts: global-utilities@e0000 {
+		compatible = "fsl,t1023-device-config", "fsl,qoriq-device-config-2.0";
+		reg = <0xe0000 0xe00>;
+		fsl,has-rstcr;
+		fsl,liodn-bits = <12>;
+	};
+
+/include/ "qoriq-clockgen2.dtsi"
+	global-utilities@e1000 {
+		compatible = "fsl,t1023-clockgen", "fsl,qoriq-clockgen-2.0";
+		mux0: mux0@0 {
+			#clock-cells = <0>;
+			reg = <0x0 4>;
+			compatible = "fsl,core-mux-clock";
+			clocks = <&pll0 0>, <&pll0 1>;
+			clock-names = "pll0_0", "pll0_1";
+			clock-output-names = "cmux0";
+		};
+		mux1: mux1@20 {
+			#clock-cells = <0>;
+			reg = <0x20 4>;
+			compatible = "fsl,core-mux-clock";
+			clocks = <&pll0 0>, <&pll0 1>;
+			clock-names = "pll0_0", "pll0_1";
+			clock-output-names = "cmux1";
+		};
+	};
+
+	rcpm: global-utilities@e2000 {
+		compatible = "fsl,t1023-rcpm", "fsl,qoriq-rcpm-2.0";
+		reg = <0xe2000 0x1000>;
+	};
+
+	sfp: sfp@e8000 {
+		compatible = "fsl,t1023-sfp";
+		reg = <0xe8000 0x1000>;
+	};
+
+	serdes: serdes@ea000 {
+		compatible = "fsl,t1023-serdes";
+		reg = <0xea000 0x4000>;
+	};
+
+	scfg: global-utilities@fc000 {
+		compatible = "fsl,t1023-scfg";
+		reg = <0xfc000 0x1000>;
+	};
+
+/include/ "elo3-dma-0.dtsi"
+/include/ "elo3-dma-1.dtsi"
+
+/include/ "qoriq-espi-0.dtsi"
+	spi@110000 {
+		fsl,espi-num-chipselects = <4>;
+	};
+
+/include/ "qoriq-esdhc-0.dtsi"
+	sdhc@114000 {
+		compatible = "fsl,t1023-esdhc", "fsl,esdhc";
+		fsl,iommu-parent = <&pamu0>;
+		fsl,liodn-reg = <&guts 0x530>; /* eSDHCLIODNR */
+		sdhci,auto-cmd12;
+		no-1-8-v;
+	};
+/include/ "qoriq-i2c-0.dtsi"
+/include/ "qoriq-i2c-1.dtsi"
+/include/ "qoriq-duart-0.dtsi"
+/include/ "qoriq-duart-1.dtsi"
+/include/ "qoriq-gpio-0.dtsi"
+/include/ "qoriq-gpio-1.dtsi"
+/include/ "qoriq-gpio-2.dtsi"
+/include/ "qoriq-gpio-3.dtsi"
+/include/ "qoriq-usb2-mph-0.dtsi"
+		usb0: usb@210000 {
+			compatible = "fsl-usb2-mph-v2.5", "fsl-usb2-mph";
+			fsl,iommu-parent = <&pamu0>;
+			fsl,liodn-reg = <&guts 0x520>; /* USB1LIODNR */
+			phy_type = "utmi";
+			port0;
+		};
+/include/ "qoriq-usb2-dr-0.dtsi"
+		usb1: usb@211000 {
+			compatible = "fsl-usb2-dr-v2.5", "fsl-usb2-dr";
+			fsl,iommu-parent = <&pamu0>;
+			fsl,liodn-reg = <&guts 0x524>; /* USB2LIODNR */
+			dr_mode = "host";
+			phy_type = "utmi";
+		};
+/include/ "qoriq-sata2-0.dtsi"
+sata@220000 {
+			fsl,iommu-parent = <&pamu0>;
+			fsl,liodn-reg = <&guts 0x550>; /* SATA1LIODNR */
+};
+
+/include/ "qoriq-sec5.0-0.dtsi"
+};
diff --git a/arch/powerpc/boot/dts/fsl/t1024si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1024si-post.dtsi
new file mode 100644
index 0000000..95e3af8
--- /dev/null
+++ b/arch/powerpc/boot/dts/fsl/t1024si-post.dtsi
@@ -0,0 +1,100 @@
+/*
+ * T1024 Silicon/SoC Device Tree Source (post include)
+ *
+ * Copyright 2014 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/include/ "t1023si-post.dtsi"
+
+/ {
+	aliases {
+		vga = &display;
+		display = &display;
+	};
+
+	qe:qe@ffe140000 {
+		#address-cells = <1>;
+		#size-cells = <1>;
+		device_type = "qe";
+		compatible = "fsl,qe";
+		ranges = <0x0 0xf 0xfe140000 0x40000>;
+		reg = <0xf 0xfe140000 0 0x480>;
+		fsl,qe-num-riscs = <1>;
+		fsl,qe-num-snums = <28>;
+		brg-frequency = <0>;
+		bus-frequency = <0>;
+	};
+};
+
+&soc {
+	display:display@180000 {
+		compatible = "fsl,t1024-diu", "fsl,diu";
+		reg = <0x180000 1000>;
+		interrupts = <74 2 0 0>;
+	};
+};
+
+&qe {
+	qeic: interrupt-controller@80 {
+		interrupt-controller;
+		compatible = "fsl,qe-ic";
+		#address-cells = <0>;
+		#interrupt-cells = <1>;
+		reg = <0x80 0x80>;
+		interrupts = <95 2 0 0  94 2 0 0>; //high:79 low:78
+	};
+
+	ucc@2000 {
+		cell-index = <1>;
+		reg = <0x2000 0x200>;
+		interrupts = <32>;
+		interrupt-parent = <&qeic>;
+	};
+
+	ucc@2200 {
+		cell-index = <3>;
+		reg = <0x2200 0x200>;
+		interrupts = <34>;
+		interrupt-parent = <&qeic>;
+	};
+
+	muram@10000 {
+		#address-cells = <1>;
+		#size-cells = <1>;
+		compatible = "fsl,qe-muram", "fsl,cpm-muram";
+		ranges = <0x0 0x10000 0x6000>;
+
+		data-only@0 {
+			compatible = "fsl,qe-muram-data", "fsl,cpm-muram-data";
+			reg = <0x0 0x6000>;
+		};
+	};
+};
diff --git a/arch/powerpc/boot/dts/fsl/t102xsi-pre.dtsi b/arch/powerpc/boot/dts/fsl/t102xsi-pre.dtsi
new file mode 100644
index 0000000..1f1a9f8
--- /dev/null
+++ b/arch/powerpc/boot/dts/fsl/t102xsi-pre.dtsi
@@ -0,0 +1,87 @@
+/*
+ * T1024/T1023 Silicon/SoC Device Tree Source (pre include)
+ *
+ * Copyright 2014 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *	 notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *	 notice, this list of conditions and the following disclaimer in the
+ *	 documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *	 names of its contributors may be used to endorse or promote products
+ *	 derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/dts-v1/;
+
+/include/ "e5500_power_isa.dtsi"
+
+/ {
+	#address-cells = <2>;
+	#size-cells = <2>;
+	interrupt-parent = <&mpic>;
+
+	aliases {
+		ccsr = &soc;
+		dcsr = &dcsr;
+
+		dma0 = &dma0;
+		dma1 = &dma1;
+		serial0 = &serial0;
+		serial1 = &serial1;
+		serial2 = &serial2;
+		serial3 = &serial3;
+		pci0 = &pci0;
+		pci1 = &pci1;
+		pci2 = &pci2;
+		usb0 = &usb0;
+		usb1 = &usb1;
+		sdhc = &sdhc;
+
+		crypto = &crypto;
+	};
+
+	cpus {
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		cpu0: PowerPC,e5500@0 {
+			device_type = "cpu";
+			reg = <0>;
+			clocks = <&mux0>;
+			next-level-cache = <&L2_1>;
+			L2_1: l2-cache {
+				next-level-cache = <&cpc>;
+			};
+		};
+		cpu1: PowerPC,e5500@1 {
+			device_type = "cpu";
+			reg = <1>;
+			clocks = <&mux1>;
+			next-level-cache = <&L2_2>;
+			L2_2: l2-cache {
+				next-level-cache = <&cpc>;
+			};
+		};
+	};
+};
-- 
cgit v0.10.2


From 2b6029e2e026771fb4e2add9b38e91f34725813c Mon Sep 17 00:00:00 2001
From: Shengzhou Liu <Shengzhou.Liu@freescale.com>
Date: Thu, 9 Apr 2015 16:07:43 +0800
Subject: powerpc/fsl-booke: Add T1024 QDS board support

Add support for Freescale T1024/T1023 QorIQ Development System Board.

T1024QDS is a high-performance computing evaluation, development and
test platform for T1024 QorIQ Power Architecture processor.

Signed-off-by: Shengzhou Liu <Shengzhou.Liu@freescale.com>
[scottwood: vendor prefix: s/at24/atmel/ and trimmed detailed
 board description with too-long lines]
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi
index dbe6578..df1f068 100644
--- a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi
@@ -305,26 +305,26 @@
 /include/ "qoriq-gpio-2.dtsi"
 /include/ "qoriq-gpio-3.dtsi"
 /include/ "qoriq-usb2-mph-0.dtsi"
-		usb0: usb@210000 {
-			compatible = "fsl-usb2-mph-v2.5", "fsl-usb2-mph";
-			fsl,iommu-parent = <&pamu0>;
-			fsl,liodn-reg = <&guts 0x520>; /* USB1LIODNR */
-			phy_type = "utmi";
-			port0;
-		};
+	usb0: usb@210000 {
+		compatible = "fsl-usb2-mph-v2.5", "fsl-usb2-mph";
+		fsl,iommu-parent = <&pamu0>;
+		fsl,liodn-reg = <&guts 0x520>; /* USB1LIODNR */
+		phy_type = "utmi";
+		port0;
+	};
 /include/ "qoriq-usb2-dr-0.dtsi"
-		usb1: usb@211000 {
-			compatible = "fsl-usb2-dr-v2.5", "fsl-usb2-dr";
-			fsl,iommu-parent = <&pamu0>;
-			fsl,liodn-reg = <&guts 0x524>; /* USB2LIODNR */
-			dr_mode = "host";
-			phy_type = "utmi";
-		};
+	usb1: usb@211000 {
+		compatible = "fsl-usb2-dr-v2.5", "fsl-usb2-dr";
+		fsl,iommu-parent = <&pamu0>;
+		fsl,liodn-reg = <&guts 0x524>; /* USB2LIODNR */
+		dr_mode = "host";
+		phy_type = "utmi";
+	};
 /include/ "qoriq-sata2-0.dtsi"
-sata@220000 {
-			fsl,iommu-parent = <&pamu0>;
-			fsl,liodn-reg = <&guts 0x550>; /* SATA1LIODNR */
-};
+	sata@220000 {
+		fsl,iommu-parent = <&pamu0>;
+		fsl,liodn-reg = <&guts 0x550>; /* SATA1LIODNR */
+	};
 
 /include/ "qoriq-sec5.0-0.dtsi"
 };
diff --git a/arch/powerpc/boot/dts/t1024qds.dts b/arch/powerpc/boot/dts/t1024qds.dts
new file mode 100644
index 0000000..f31fabb
--- /dev/null
+++ b/arch/powerpc/boot/dts/t1024qds.dts
@@ -0,0 +1,251 @@
+/*
+ * T1024 QDS Device Tree Source
+ *
+ * Copyright 2014 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *	 notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *	 notice, this list of conditions and the following disclaimer in the
+ *	 documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *	 names of its contributors may be used to endorse or promote products
+ *	 derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/include/ "fsl/t102xsi-pre.dtsi"
+
+/ {
+	model = "fsl,T1024QDS";
+	compatible = "fsl,T1024QDS";
+	#address-cells = <2>;
+	#size-cells = <2>;
+	interrupt-parent = <&mpic>;
+
+	ifc: localbus@ffe124000 {
+		reg = <0xf 0xfe124000 0 0x2000>;
+		ranges = <0 0 0xf 0xe8000000 0x08000000
+			  2 0 0xf 0xff800000 0x00010000
+			  3 0 0xf 0xffdf0000 0x00008000>;
+
+		nor@0,0 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			compatible = "cfi-flash";
+			reg = <0x0 0x0 0x8000000>;
+			bank-width = <2>;
+			device-width = <1>;
+		};
+
+		nand@2,0 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			compatible = "fsl,ifc-nand";
+			reg = <0x2 0x0 0x10000>;
+		};
+
+		board-control@3,0 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			compatible = "fsl,tetra-fpga", "fsl,fpga-qixis";
+			reg = <3 0 0x300>;
+			ranges = <0 3 0 0x300>;
+		};
+	};
+
+	memory {
+		device_type = "memory";
+	};
+
+	dcsr: dcsr@f00000000 {
+		ranges = <0x00000000 0xf 0x00000000 0x01072000>;
+	};
+
+	soc: soc@ffe000000 {
+		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
+		reg = <0xf 0xfe000000 0 0x00001000>;
+		spi@110000 {
+			flash@0 {
+				#address-cells = <1>;
+				#size-cells = <1>;
+				compatible = "micron,n25q128a11";  /* 16MB */
+				reg = <0>;
+				spi-max-frequency = <10000000>;
+			};
+
+			flash@1 {
+				#address-cells = <1>;
+				#size-cells = <1>;
+				compatible = "sst,sst25wf040";  /* 512KB */
+				reg = <1>;
+				spi-max-frequency = <10000000>;
+			};
+
+			flash@2 {
+				#address-cells = <1>;
+				#size-cells = <1>;
+				compatible = "eon,en25s64";   /* 8MB */
+				reg = <2>;
+				spi-max-frequency = <10000000>;
+			};
+
+			slic@2 {
+				compatible = "maxim,ds26522";
+				reg = <2>;
+				spi-max-frequency = <2000000>;
+			};
+
+			slic@3 {
+				compatible = "maxim,ds26522";
+				reg = <3>;
+				spi-max-frequency = <2000000>;
+			};
+		};
+
+		i2c@118000 {
+			pca9547@77 {
+				compatible = "nxp,pca9547";
+				reg = <0x77>;
+				#address-cells = <1>;
+				#size-cells = <0>;
+
+				i2c@0 {
+					#address-cells = <1>;
+					#size-cells = <0>;
+					reg = <0x0>;
+
+					eeprom@50 {
+						compatible = "atmel,24c512";
+						reg = <0x50>;
+					};
+
+					eeprom@51 {
+						compatible = "atmel,24c02";
+						reg = <0x51>;
+					};
+
+					eeprom@57 {
+						compatible = "atmel,24c02";
+						reg = <0x57>;
+					};
+				};
+
+				i2c@2 {
+					#address-cells = <1>;
+					#size-cells = <0>;
+					reg = <0x2>;
+
+					ina220@40 {
+						compatible = "ti,ina220";
+						reg = <0x40>;
+						shunt-resistor = <1000>;
+					};
+
+					ina220@41 {
+						compatible = "ti,ina220";
+						reg = <0x41>;
+						shunt-resistor = <1000>;
+					};
+				};
+
+				i2c@3 {
+					#address-cells = <1>;
+					#size-cells = <0>;
+					reg = <0x3>;
+
+					adt7461@4c {
+						/* Thermal Monitor */
+						compatible = "adi,adt7461";
+						reg = <0x4c>;
+					};
+
+					eeprom@55 {
+						compatible = "atmel,24c02";
+						reg = <0x55>;
+					};
+
+					eeprom@56 {
+						compatible = "atmel,24c512";
+						reg = <0x56>;
+					};
+
+					eeprom@57 {
+						compatible = "atmel,24c512";
+						reg = <0x57>;
+					};
+				};
+			};
+			rtc@68 {
+				compatible = "dallas,ds3232";
+				reg = <0x68>;
+				interrupts = <0x5 0x1 0 0>;
+			};
+		};
+	};
+
+	pci0: pcie@ffe240000 {
+		reg = <0xf 0xfe240000 0 0x10000>;
+		ranges = <0x02000000 0 0xe0000000 0xc 0x00000000 0 0x10000000
+			  0x01000000 0 0x00000000 0xf 0xf8000000 0 0x00010000>;
+		pcie@0 {
+			ranges = <0x02000000 0 0xe0000000
+				  0x02000000 0 0xe0000000
+				  0 0x10000000
+
+				  0x01000000 0 0x00000000
+				  0x01000000 0 0x00000000
+				  0 0x00010000>;
+		};
+	};
+
+	pci1: pcie@ffe250000 {
+		reg = <0xf 0xfe250000 0 0x10000>;
+		ranges = <0x02000000 0 0xe0000000 0xc 0x10000000 0 0x10000000
+			  0x01000000 0 0x00000000 0xf 0xf8010000 0 0x00010000>;
+		pcie@0 {
+			ranges = <0x02000000 0 0xe0000000
+				  0x02000000 0 0xe0000000
+				  0 0x10000000
+
+				  0x01000000 0 0x00000000
+				  0x01000000 0 0x00000000
+				  0 0x00010000>;
+		};
+	};
+
+	pci2: pcie@ffe260000 {
+		reg = <0xf 0xfe260000 0 0x10000>;
+		ranges = <0x02000000 0 0xe0000000 0xc 0x20000000 0 0x10000000
+			  0x01000000 0 0x00000000 0xf 0xf8020000 0 0x00010000>;
+		pcie@0 {
+			ranges = <0x02000000 0 0xe0000000
+				  0x02000000 0 0xe0000000
+				  0 0x10000000
+
+				  0x01000000 0 0x00000000
+				  0x01000000 0 0x00000000
+				  0 0x00010000>;
+		};
+	};
+};
+
+/include/ "fsl/t1024si-post.dtsi"
diff --git a/arch/powerpc/platforms/85xx/Kconfig b/arch/powerpc/platforms/85xx/Kconfig
index 2fb4b24..e628d40 100644
--- a/arch/powerpc/platforms/85xx/Kconfig
+++ b/arch/powerpc/platforms/85xx/Kconfig
@@ -282,7 +282,7 @@ config CORENET_GENERIC
 	  For 64bit kernel, the following boards are supported:
 	    T208x QDS/RDB, T4240 QDS/RDB and B4 QDS
 	  The following boards are supported for both 32bit and 64bit kernel:
-	    P5020 DS, P5040 DS and T104xQDS/RDB
+	    P5020 DS, P5040 DS, T102x QDS, T104x QDS/RDB
 
 endif # FSL_SOC_BOOKE
 
diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c
index 9824d2c..f61f477 100644
--- a/arch/powerpc/platforms/85xx/corenet_generic.c
+++ b/arch/powerpc/platforms/85xx/corenet_generic.c
@@ -150,6 +150,7 @@ static const char * const boards[] __initconst = {
 	"fsl,B4860QDS",
 	"fsl,B4420QDS",
 	"fsl,B4220QDS",
+	"fsl,T1024QDS",
 	"fsl,T1040QDS",
 	"fsl,T1042QDS",
 	"fsl,T1040RDB",
-- 
cgit v0.10.2


From 5afe13fd48c72be61342edbc6ede6e60f6153227 Mon Sep 17 00:00:00 2001
From: Shengzhou Liu <Shengzhou.Liu@freescale.com>
Date: Thu, 9 Apr 2015 16:07:44 +0800
Subject: powerpc/fsl-booke: Add T1024 RDB board support

T1024RDB is a Freescale Reference Design Board that hosts the T1024 SoC.

Signed-off-by: Shengzhou Liu <Shengzhou.Liu@freescale.com>
[scottwood:  vendor prefix: s/at24/atmel/ and trimmed detailed
 board description with too-long lines]
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/boot/dts/t1024rdb.dts b/arch/powerpc/boot/dts/t1024rdb.dts
new file mode 100644
index 0000000..733e723
--- /dev/null
+++ b/arch/powerpc/boot/dts/t1024rdb.dts
@@ -0,0 +1,185 @@
+/*
+ * T1024 RDB Device Tree Source
+ *
+ * Copyright 2014 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *	 notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *	 notice, this list of conditions and the following disclaimer in the
+ *	 documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *	 names of its contributors may be used to endorse or promote products
+ *	 derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/include/ "fsl/t102xsi-pre.dtsi"
+
+/ {
+	model = "fsl,T1024RDB";
+	compatible = "fsl,T1024RDB";
+	#address-cells = <2>;
+	#size-cells = <2>;
+	interrupt-parent = <&mpic>;
+
+	ifc: localbus@ffe124000 {
+		reg = <0xf 0xfe124000 0 0x2000>;
+		ranges = <0 0 0xf 0xe8000000 0x08000000
+			  2 0 0xf 0xff800000 0x00010000
+			  3 0 0xf 0xffdf0000 0x00008000>;
+
+		nor@0,0 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			compatible = "cfi-flash";
+			reg = <0x0 0x0 0x8000000>;
+			bank-width = <2>;
+			device-width = <1>;
+		};
+
+		nand@1,0 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			compatible = "fsl,ifc-nand";
+			reg = <0x2 0x0 0x10000>;
+		};
+
+		board-control@2,0 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			compatible = "fsl,t1024-cpld";
+			reg = <3 0 0x300>;
+			ranges = <0 3 0 0x300>;
+			bank-width = <1>;
+			device-width = <1>;
+		};
+	};
+
+	memory {
+		device_type = "memory";
+	};
+
+	dcsr: dcsr@f00000000 {
+		ranges = <0x00000000 0xf 0x00000000 0x01072000>;
+	};
+
+	soc: soc@ffe000000 {
+		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
+		reg = <0xf 0xfe000000 0 0x00001000>;
+		spi@110000 {
+			flash@0 {
+				#address-cells = <1>;
+				#size-cells = <1>;
+				compatible = "micron,n25q512ax3";
+				reg = <0>;
+				spi-max-frequency = <10000000>; /* input clk */
+			};
+
+			slic@1 {
+				compatible = "maxim,ds26522";
+				reg = <1>;
+				spi-max-frequency = <2000000>;
+			};
+
+			slic@2 {
+				compatible = "maxim,ds26522";
+				reg = <2>;
+				spi-max-frequency = <2000000>;
+			};
+		};
+
+		i2c@118000 {
+			adt7461@4c {
+				/* Thermal Monitor */
+				compatible = "adi,adt7461";
+				reg = <0x4c>;
+			};
+
+			eeprom@50 {
+				compatible = "atmel,24c256";
+				reg = <0x50>;
+			};
+
+			rtc@68 {
+				compatible = "dallas,ds1339";
+				reg = <0x68>;
+				interrupts = <0x1 0x1 0 0>;
+			};
+		};
+
+		i2c@118100 {
+			pca9546@77 {
+				compatible = "nxp,pca9546";
+				reg = <0x77>;
+				#address-cells = <1>;
+				#size-cells = <0>;
+			};
+		};
+	};
+
+	pci0: pcie@ffe240000 {
+		reg = <0xf 0xfe240000 0 0x10000>;
+		ranges = <0x02000000 0 0xe0000000 0xc 0x00000000 0 0x10000000
+			  0x01000000 0 0x00000000 0xf 0xf8000000 0 0x00010000>;
+		pcie@0 {
+			ranges = <0x02000000 0 0xe0000000
+				  0x02000000 0 0xe0000000
+				  0 0x10000000
+
+				  0x01000000 0 0x00000000
+				  0x01000000 0 0x00000000
+				  0 0x00010000>;
+		};
+	};
+
+	pci1: pcie@ffe250000 {
+		reg = <0xf 0xfe250000 0 0x10000>;
+		ranges = <0x02000000 0 0xe0000000 0xc 0x10000000 0 0x10000000
+			  0x01000000 0 0x00000000 0xf 0xf8010000 0 0x00010000>;
+		pcie@0 {
+			ranges = <0x02000000 0 0xe0000000
+				  0x02000000 0 0xe0000000
+				  0 0x10000000
+
+				  0x01000000 0 0x00000000
+				  0x01000000 0 0x00000000
+				  0 0x00010000>;
+		};
+	};
+
+	pci2: pcie@ffe260000 {
+		reg = <0xf 0xfe260000 0 0x10000>;
+		ranges = <0x02000000 0 0xe0000000 0xc 0x20000000 0 0x10000000
+			  0x01000000 0 0x00000000 0xf 0xf8020000 0 0x00010000>;
+		pcie@0 {
+			ranges = <0x02000000 0 0xe0000000
+				  0x02000000 0 0xe0000000
+				  0 0x10000000
+
+				  0x01000000 0 0x00000000
+				  0x01000000 0 0x00000000
+				  0 0x00010000>;
+		};
+	};
+};
+
+/include/ "fsl/t1024si-post.dtsi"
diff --git a/arch/powerpc/platforms/85xx/Kconfig b/arch/powerpc/platforms/85xx/Kconfig
index e628d40..97915fe 100644
--- a/arch/powerpc/platforms/85xx/Kconfig
+++ b/arch/powerpc/platforms/85xx/Kconfig
@@ -282,7 +282,7 @@ config CORENET_GENERIC
 	  For 64bit kernel, the following boards are supported:
 	    T208x QDS/RDB, T4240 QDS/RDB and B4 QDS
 	  The following boards are supported for both 32bit and 64bit kernel:
-	    P5020 DS, P5040 DS, T102x QDS, T104x QDS/RDB
+	    P5020 DS, P5040 DS, T102x QDS/RDB, T104x QDS/RDB
 
 endif # FSL_SOC_BOOKE
 
diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c
index f61f477..d7c1b4d 100644
--- a/arch/powerpc/platforms/85xx/corenet_generic.c
+++ b/arch/powerpc/platforms/85xx/corenet_generic.c
@@ -151,6 +151,7 @@ static const char * const boards[] __initconst = {
 	"fsl,B4420QDS",
 	"fsl,B4220QDS",
 	"fsl,T1024QDS",
+	"fsl,T1024RDB",
 	"fsl,T1040QDS",
 	"fsl,T1042QDS",
 	"fsl,T1040RDB",
-- 
cgit v0.10.2


From 65bf2a057021b009ff9924b59203993bdff82ecc Mon Sep 17 00:00:00 2001
From: Shengzhou Liu <Shengzhou.Liu@freescale.com>
Date: Thu, 9 Apr 2015 16:07:45 +0800
Subject: powerpc/fsl-booke: Add T1023 RDB board support

T1023RDB is a Freescale Reference Design Board that hosts T1023 SoC.

T1023RDB board Overview
-----------------------
- T1023 SoC integrating two 64-bit e5500 cores up to 1.4GHz
- CoreNet fabric supporting coherent and noncoherent transactions with
  prioritization and bandwidth allocation
- Memory: 2GB Micron MT40A512M8HX unbuffered 32-bit fixed DDR4 without ECC
- Accelerator: DPAA components consist of FMan, BMan, QMan, DCE and SEC
- Ethernet interfaces:
  - one 1G RGMII port on-board(RTL8211F PHY)
  - one 1G SGMII port on-board(RTL8211F PHY)
  - one 2.5G SGMII port on-board(AQR105 PHY)
- PCIe: Two Mini-PCIe connectors on-board.
- SerDes: 4 lanes up to 10.3125GHz
- NOR:  128MB S29GL01GS110TFIV10 Spansion NOR Flash
- NAND: 512MB S34MS04G200BFI000 Spansion NAND Flash
- eSPI: 64MB S25FL512SAGMFI010 Spansion SPI flash
- USB: one Type-A USB 2.0 port with internal PHY
- eSDHC: support SD/MMC card and eMMC flash on-board
- 256Kbit M24256 I2C EEPROM
- RTC: Real-time clock DS1339 on I2C bus
- UART: one serial port on-board with RJ45 connector
- Debugging: JTAG/COP for T1023 debugging

Signed-off-by: Shengzhou Liu <Shengzhou.Liu@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/boot/dts/t1023rdb.dts b/arch/powerpc/boot/dts/t1023rdb.dts
new file mode 100644
index 0000000..06b090a
--- /dev/null
+++ b/arch/powerpc/boot/dts/t1023rdb.dts
@@ -0,0 +1,151 @@
+/*
+ * T1023 RDB Device Tree Source
+ *
+ * Copyright 2014 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *	 notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *	 notice, this list of conditions and the following disclaimer in the
+ *	 documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *	 names of its contributors may be used to endorse or promote products
+ *	 derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/include/ "fsl/t102xsi-pre.dtsi"
+
+/ {
+	model = "fsl,T1023RDB";
+	compatible = "fsl,T1023RDB";
+	#address-cells = <2>;
+	#size-cells = <2>;
+	interrupt-parent = <&mpic>;
+
+	ifc: localbus@ffe124000 {
+		reg = <0xf 0xfe124000 0 0x2000>;
+		ranges = <0 0 0xf 0xe8000000 0x08000000
+			  1 0 0xf 0xff800000 0x00010000>;
+
+		nor@0,0 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			status = "disabled";
+			compatible = "cfi-flash";
+			reg = <0x0 0x0 0x8000000>;
+			bank-width = <2>;
+			device-width = <1>;
+		};
+
+		nand@1,0 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			compatible = "fsl,ifc-nand";
+			reg = <0x2 0x0 0x10000>;
+		};
+	};
+
+	memory {
+		device_type = "memory";
+	};
+
+	dcsr: dcsr@f00000000 {
+		ranges = <0x00000000 0xf 0x00000000 0x01072000>;
+	};
+
+	soc: soc@ffe000000 {
+		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
+		reg = <0xf 0xfe000000 0 0x00001000>;
+		spi@110000 {
+			flash@0 {
+				#address-cells = <1>;
+				#size-cells = <1>;
+				compatible = "spansion,s25fl512s";
+				reg = <0>;
+				spi-max-frequency = <10000000>; /* input clk */
+			};
+		};
+
+		i2c@118000 {
+			eeprom@50 {
+				compatible = "st,m24256";
+				reg = <0x50>;
+			};
+
+			rtc@68 {
+				compatible = "dallas,ds1339";
+				reg = <0x68>;
+				interrupts = <0x5 0x1 0 0>;
+			};
+		};
+
+		i2c@118100 {
+		};
+	};
+
+	pci0: pcie@ffe240000 {
+		reg = <0xf 0xfe240000 0 0x10000>;
+		ranges = <0x02000000 0 0xe0000000 0xc 0x00000000 0 0x10000000
+			  0x01000000 0 0x00000000 0xf 0xf8000000 0 0x00010000>;
+		pcie@0 {
+			ranges = <0x02000000 0 0xe0000000
+				  0x02000000 0 0xe0000000
+				  0 0x10000000
+
+				  0x01000000 0 0x00000000
+				  0x01000000 0 0x00000000
+				  0 0x00010000>;
+		};
+	};
+
+	pci1: pcie@ffe250000 {
+		reg = <0xf 0xfe250000 0 0x10000>;
+		ranges = <0x02000000 0 0xe0000000 0xc 0x10000000 0 0x10000000
+			  0x01000000 0 0x00000000 0xf 0xf8010000 0 0x00010000>;
+		pcie@0 {
+			ranges = <0x02000000 0 0xe0000000
+				  0x02000000 0 0xe0000000
+				  0 0x10000000
+
+				  0x01000000 0 0x00000000
+				  0x01000000 0 0x00000000
+				  0 0x00010000>;
+		};
+	};
+
+	pci2: pcie@ffe260000 {
+		reg = <0xf 0xfe260000 0 0x10000>;
+		ranges = <0x02000000 0 0xe0000000 0xc 0x20000000 0 0x10000000
+			  0x01000000 0 0x00000000 0xf 0xf8020000 0 0x00010000>;
+		pcie@0 {
+			ranges = <0x02000000 0 0xe0000000
+				  0x02000000 0 0xe0000000
+				  0 0x10000000
+
+				  0x01000000 0 0x00000000
+				  0x01000000 0 0x00000000
+				  0 0x00010000>;
+		};
+	};
+};
+
+/include/ "fsl/t1023si-post.dtsi"
diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c
index d7c1b4d..bd839dc 100644
--- a/arch/powerpc/platforms/85xx/corenet_generic.c
+++ b/arch/powerpc/platforms/85xx/corenet_generic.c
@@ -150,6 +150,7 @@ static const char * const boards[] __initconst = {
 	"fsl,B4860QDS",
 	"fsl,B4420QDS",
 	"fsl,B4220QDS",
+	"fsl,T1023RDB",
 	"fsl,T1024QDS",
 	"fsl,T1024RDB",
 	"fsl,T1040QDS",
-- 
cgit v0.10.2


From 791b0bfae8408a5b7b9db621ce81bc7b3568981a Mon Sep 17 00:00:00 2001
From: Igal Liberman <Igal.Liberman@freescale.com>
Date: Thu, 16 Apr 2015 14:41:11 +0300
Subject: dt/bindings: fsl/guts: Added global-utilities compatibles

v3 - Addressed Scott's feedback:
	Added "fsl,<chip>-guts"

v2 - Addressed Scott's feedback

Signed-off-by: Igal Liberman <Igal.Liberman@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/Documentation/devicetree/bindings/powerpc/fsl/guts.txt b/Documentation/devicetree/bindings/powerpc/fsl/guts.txt
index 7f150b5..b71b203 100644
--- a/Documentation/devicetree/bindings/powerpc/fsl/guts.txt
+++ b/Documentation/devicetree/bindings/powerpc/fsl/guts.txt
@@ -9,6 +9,11 @@ Required properties:
 
  - compatible : Should define the compatible device type for
    global-utilities.
+   Possible compatibles:
+	"fsl,qoriq-device-config-1.0"
+	"fsl,qoriq-device-config-2.0"
+	"fsl,<chip>-device-config"
+	"fsl,<chip>-guts"
  - reg : Offset and length of the register set for the device.
 
 Recommended properties:
-- 
cgit v0.10.2


From 9b6179dc1e1a481026ce7b9222275727ec76aef4 Mon Sep 17 00:00:00 2001
From: Igal Liberman <Igal.Liberman@freescale.com>
Date: Thu, 16 Apr 2015 14:53:36 +0300
Subject: powerpc/dts: Fix incorrect clock-names property

Signed-off-by: Igal Liberman <Igal.Liberman@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi b/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi
index 86bdaf6..c9ad929 100644
--- a/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi
@@ -417,7 +417,7 @@
 			compatible = "fsl,qoriq-core-mux-2.0";
 			clocks = <&pll0 0>, <&pll0 1>, <&pll0 2>,
 				 <&pll1 0>, <&pll1 1>, <&pll1 2>;
-			clock-names = "pll0", "pll0-div2", "pll1-div4",
+			clock-names = "pll0", "pll0-div2", "pll0-div4",
 				"pll1", "pll1-div2", "pll1-div4";
 			clock-output-names = "cmux0";
 		};
@@ -428,7 +428,7 @@
 			compatible = "fsl,qoriq-core-mux-2.0";
 			clocks = <&pll0 0>, <&pll0 1>, <&pll0 2>,
 				 <&pll1 0>, <&pll1 1>, <&pll1 2>;
-			clock-names = "pll0", "pll0-div2", "pll1-div4",
+			clock-names = "pll0", "pll0-div2", "pll0-div4",
 				"pll1", "pll1-div2", "pll1-div4";
 			clock-output-names = "cmux1";
 		};
-- 
cgit v0.10.2


From 86c3b16e9f330c94062c40302ab266f0d932057e Mon Sep 17 00:00:00 2001
From: LEROY Christophe <christophe.leroy@c-s.fr>
Date: Fri, 17 Apr 2015 18:37:17 +0200
Subject: powerpc/8xx: mmu_virtual_psize incorrect for 16k pages

mmu_virtual_psize shall be set to MMU_PAGE_16K when 16k pages have
been selected

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/include/asm/mmu-8xx.h b/arch/powerpc/include/asm/mmu-8xx.h
index 986b9e1..d41200c 100644
--- a/arch/powerpc/include/asm/mmu-8xx.h
+++ b/arch/powerpc/include/asm/mmu-8xx.h
@@ -145,7 +145,14 @@ typedef struct {
 } mm_context_t;
 #endif /* !__ASSEMBLY__ */
 
+#if (PAGE_SHIFT == 12)
 #define mmu_virtual_psize	MMU_PAGE_4K
+#elif (PAGE_SHIFT == 14)
+#define mmu_virtual_psize	MMU_PAGE_16K
+#else
+#error "Unsupported PAGE_SIZE"
+#endif
+
 #define mmu_linear_psize	MMU_PAGE_8M
 
 #endif /* _ASM_POWERPC_MMU_8XX_H_ */
-- 
cgit v0.10.2


From 6c0cc62715bfd0b26f7d1a79e6e8143085950ca7 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Fri, 17 Apr 2015 16:17:14 -0500
Subject: powerpc/mm: Use PFN_PHYS() in devmem_is_allowed()

This function can run on systems where physical addresses don't
fit in unsigned long, so make sure to use the macro that contains the
proper cast.

Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 45fda71..0f11819 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -560,7 +560,7 @@ subsys_initcall(add_system_ram_resources);
  */
 int devmem_is_allowed(unsigned long pfn)
 {
-	if (iomem_is_exclusive(pfn << PAGE_SHIFT))
+	if (iomem_is_exclusive(PFN_PHYS(pfn)))
 		return 0;
 	if (!page_is_ram(pfn))
 		return 1;
-- 
cgit v0.10.2


From e9326dea3fe76f38493d1b74999c45707fdc906d Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Fri, 17 Apr 2015 17:53:06 -0500
Subject: powerpc/qman: Change fsl,qman-channel-id to cell-index

It turns out that existing U-Boots will dereference NULL pointers
if the device tree does not have cell-index in the portal nodes.

No patch has yet been merged adding device tree nodes for this binding
(except a dtsi that has not yet been referenced), nor has any driver
yet been merged making use of the binding, so it's not too late to
change the binding in order to keep compatibility with existing
U-Boots.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Cc: Madalin-Cristian Bucur <madalin.bucur@freescale.com>

diff --git a/Documentation/devicetree/bindings/soc/fsl/qman-portals.txt b/Documentation/devicetree/bindings/soc/fsl/qman-portals.txt
index 48c4dae..47e46cc 100644
--- a/Documentation/devicetree/bindings/soc/fsl/qman-portals.txt
+++ b/Documentation/devicetree/bindings/soc/fsl/qman-portals.txt
@@ -47,7 +47,7 @@ PROPERTIES
 
 	For additional details about the PAMU/LIODN binding(s) see pamu.txt
 
-- fsl,qman-channel-id
+- cell-index
 	Usage:		Required
 	Value type:	<u32>
 	Definition:	The hardware index of the channel. This can also be
@@ -136,7 +136,7 @@ The example below shows a (P4080) QMan portals container/bus node with two porta
 			reg = <0x4000 0x4000>, <0x101000 0x1000>;
 			interrupts = <106 2 0 0>;
 			fsl,liodn = <3 4>;
-			fsl,qman-channel-id = <1>;
+			cell-index = <1>;
 
 			fman0 {
 				fsl,liodn = <0x22>;
-- 
cgit v0.10.2


From 7f6972a0d018daacef3b31090057771bc657f18a Mon Sep 17 00:00:00 2001
From: Kumar Gala <galak@kernel.crashing.org>
Date: Fri, 17 Apr 2015 17:53:07 -0500
Subject: powerpc/mpc85xx: Add FSL QorIQ DPAA QMan support to device tree(s)

Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Signed-off-by: Geoff Thorpe <Geoff.Thorpe@freescale.com>
Signed-off-by: Hai-Ying Wang <Haiying.Wang@freescale.com>
Signed-off-by: Chunhe Lan <Chunhe.Lan@freescale.com>
Signed-off-by: Poonam Aggrwal <poonam.aggrwal@freescale.com>
[Emil Medve: Sync with the upstream binding]
Signed-off-by: Emil Medve <Emilian.Medve@Freescale.com>
[Scott Wood: s/fsl,qman-channel-id/cell-index]
Signed-off-by: Scott Wood <scottwood@freescale.com>
Cc: Madalin-Cristian Bucur <madalin.bucur@freescale.com>

diff --git a/arch/powerpc/boot/dts/b4qds.dtsi b/arch/powerpc/boot/dts/b4qds.dtsi
index 24ed80d..559d006 100644
--- a/arch/powerpc/boot/dts/b4qds.dtsi
+++ b/arch/powerpc/boot/dts/b4qds.dtsi
@@ -106,6 +106,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -116,6 +124,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x2000000>;
 	};
 
+	qportals: qman-portals@ff6000000 {
+		ranges = <0x0 0xf 0xf6000000 0x2000000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/fsl/b4860si-post.dtsi b/arch/powerpc/boot/dts/fsl/b4860si-post.dtsi
index 68b9a05..9ba904b 100644
--- a/arch/powerpc/boot/dts/fsl/b4860si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/b4860si-post.dtsi
@@ -167,6 +167,75 @@
 	};
 };
 
+&qportals {
+	qportal14: qman-portal@38000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x38000 0x4000>, <0x100e000 0x1000>;
+		interrupts = <132 0x2 0 0>;
+		cell-index = <0xe>;
+	};
+	qportal15: qman-portal@3c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x3c000 0x4000>, <0x100f000 0x1000>;
+		interrupts = <134 0x2 0 0>;
+		cell-index = <0xf>;
+	};
+	qportal16: qman-portal@40000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x40000 0x4000>, <0x1010000 0x1000>;
+		interrupts = <136 0x2 0 0>;
+		cell-index = <0x10>;
+	};
+	qportal17: qman-portal@44000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x44000 0x4000>, <0x1011000 0x1000>;
+		interrupts = <138 0x2 0 0>;
+		cell-index = <0x11>;
+	};
+	qportal18: qman-portal@48000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x48000 0x4000>, <0x1012000 0x1000>;
+		interrupts = <140 0x2 0 0>;
+		cell-index = <0x12>;
+	};
+	qportal19: qman-portal@4c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x4c000 0x4000>, <0x1013000 0x1000>;
+		interrupts = <142 0x2 0 0>;
+		cell-index = <0x13>;
+	};
+	qportal20: qman-portal@50000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x50000 0x4000>, <0x1014000 0x1000>;
+		interrupts = <144 0x2 0 0>;
+		cell-index = <0x14>;
+	};
+	qportal21: qman-portal@54000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x54000 0x4000>, <0x1015000 0x1000>;
+		interrupts = <146 0x2 0 0>;
+		cell-index = <0x15>;
+	};
+	qportal22: qman-portal@58000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x58000 0x4000>, <0x1016000 0x1000>;
+		interrupts = <148 0x2 0 0>;
+		cell-index = <0x16>;
+	};
+	qportal23: qman-portal@5c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x5c000 0x4000>, <0x1017000 0x1000>;
+		interrupts = <150 0x2 0 0>;
+		cell-index = <0x17>;
+	};
+	qportal24: qman-portal@60000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x60000 0x4000>, <0x1018000 0x1000>;
+		interrupts = <152 0x2 0 0>;
+		cell-index = <0x18>;
+	};
+};
+
 &soc {
 	ddr2: memory-controller@9000 {
 		compatible = "fsl,qoriq-memory-controller-v4.5", "fsl,qoriq-memory-controller";
diff --git a/arch/powerpc/boot/dts/fsl/b4si-post.dtsi b/arch/powerpc/boot/dts/fsl/b4si-post.dtsi
index c6bab55..603910a 100644
--- a/arch/powerpc/boot/dts/fsl/b4si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/b4si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10000 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
 &ifc {
 	#address-cells = <2>;
 	#size-cells = <1>;
@@ -210,6 +220,97 @@
 	};
 };
 
+&qportals {
+	#address-cells = <0x1>;
+	#size-cells = <0x1>;
+	compatible = "simple-bus";
+
+	qportal0: qman-portal@0 {
+		compatible = "fsl,qman-portal";
+		reg = <0x0 0x4000>, <0x1000000 0x1000>;
+		interrupts = <104 0x2 0 0>;
+		cell-index = <0x0>;
+	};
+	qportal1: qman-portal@4000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x4000 0x4000>, <0x1001000 0x1000>;
+		interrupts = <106 0x2 0 0>;
+		cell-index = <0x1>;
+	};
+	qportal2: qman-portal@8000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x8000 0x4000>, <0x1002000 0x1000>;
+		interrupts = <108 0x2 0 0>;
+		cell-index = <0x2>;
+	};
+	qportal3: qman-portal@c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xc000 0x4000>, <0x1003000 0x1000>;
+		interrupts = <110 0x2 0 0>;
+		cell-index = <0x3>;
+	};
+	qportal4: qman-portal@10000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x10000 0x4000>, <0x1004000 0x1000>;
+		interrupts = <112 0x2 0 0>;
+		cell-index = <0x4>;
+	};
+	qportal5: qman-portal@14000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x14000 0x4000>, <0x1005000 0x1000>;
+		interrupts = <114 0x2 0 0>;
+		cell-index = <0x5>;
+	};
+	qportal6: qman-portal@18000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x18000 0x4000>, <0x1006000 0x1000>;
+		interrupts = <116 0x2 0 0>;
+		cell-index = <0x6>;
+	};
+	qportal7: qman-portal@1c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x1c000 0x4000>, <0x1007000 0x1000>;
+		interrupts = <118 0x2 0 0>;
+		cell-index = <0x7>;
+	};
+	qportal8: qman-portal@20000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x20000 0x4000>, <0x1008000 0x1000>;
+		interrupts = <120 0x2 0 0>;
+		cell-index = <0x8>;
+	};
+	qportal9: qman-portal@24000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x24000 0x4000>, <0x1009000 0x1000>;
+		interrupts = <122 0x2 0 0>;
+		cell-index = <0x9>;
+	};
+	qportal10: qman-portal@28000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x28000 0x4000>, <0x100a000 0x1000>;
+		interrupts = <124 0x2 0 0>;
+		cell-index = <0xa>;
+	};
+	qportal11: qman-portal@2c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x2c000 0x4000>, <0x100b000 0x1000>;
+		interrupts = <126 0x2 0 0>;
+		cell-index = <0xb>;
+	};
+	qportal12: qman-portal@30000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x30000 0x4000>, <0x100c000 0x1000>;
+		interrupts = <128 0x2 0 0>;
+		cell-index = <0xc>;
+	};
+	qportal13: qman-portal@34000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x34000 0x4000>, <0x100d000 0x1000>;
+		interrupts = <130 0x2 0 0>;
+		cell-index = <0xd>;
+	};
+};
+
 &soc {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -355,6 +456,11 @@
 /include/ "qoriq-duart-1.dtsi"
 /include/ "qoriq-sec5.3-0.dtsi"
 
+/include/ "qoriq-qman3.dtsi"
+	qman: qman@318000 {
+		interrupts = <16 2 1 28>;
+	};
+
 /include/ "qoriq-bman1.dtsi"
 	bman: bman@31a000 {
 		interrupts = <16 2 1 29>;
diff --git a/arch/powerpc/boot/dts/fsl/p1023si-post.dtsi b/arch/powerpc/boot/dts/fsl/p1023si-post.dtsi
index 7780f21..da6d3fc 100644
--- a/arch/powerpc/boot/dts/fsl/p1023si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/p1023si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10 0>;
+};
+
 &lbc {
 	#address-cells = <2>;
 	#size-cells = <1>;
@@ -102,6 +112,31 @@
 	};
 };
 
+&qportals {
+	#address-cells = <1>;
+	#size-cells = <1>;
+	compatible = "simple-bus";
+
+	qportal0: qman-portal@0 {
+		compatible = "fsl,qman-portal";
+		reg = <0x0 0x4000>, <0x100000 0x1000>;
+		interrupts = <29 2 0 0>;
+		cell-index = <0>;
+	};
+	qportal1: qman-portal@4000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x4000 0x4000>, <0x101000 0x1000>;
+		interrupts = <31 2 0 0>;
+		cell-index = <1>;
+	};
+	qportal2: qman-portal@8000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x8000 0x4000>, <0x102000 0x1000>;
+		interrupts = <33 2 0 0>;
+		cell-index = <2>;
+	};
+};
+
 &bportals {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -248,6 +283,14 @@
 /include/ "pq3-mpic.dtsi"
 /include/ "pq3-mpic-timer-B.dtsi"
 
+	qman: qman@88000 {
+		compatible = "fsl,qman";
+		reg = <0x88000 0x1000>;
+		interrupts = <16 2 0 0>;
+		fsl,qman-portals = <&qportals>;
+		memory-region = <&qman_fqd &qman_pfdr>;
+	};
+
 	bman: bman@8a000 {
 		compatible = "fsl,bman";
 		reg = <0x8a000 0x1000>;
diff --git a/arch/powerpc/boot/dts/fsl/p2041si-post.dtsi b/arch/powerpc/boot/dts/fsl/p2041si-post.dtsi
index f2feacf..1f18b8b 100644
--- a/arch/powerpc/boot/dts/fsl/p2041si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/p2041si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10 0>;
+};
+
 &lbc {
 	compatible = "fsl,p2041-elbc", "fsl,elbc", "simple-bus";
 	interrupts = <25 2 0 0>;
@@ -223,6 +233,8 @@
 
 /include/ "qoriq-bman1-portals.dtsi"
 
+/include/ "qoriq-qman1-portals.dtsi"
+
 &soc {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -415,5 +427,6 @@ crypto: crypto@300000 {
 		fsl,iommu-parent = <&pamu1>;
 	};
 
+/include/ "qoriq-qman1.dtsi"
 /include/ "qoriq-bman1.dtsi"
 };
diff --git a/arch/powerpc/boot/dts/fsl/p3041si-post.dtsi b/arch/powerpc/boot/dts/fsl/p3041si-post.dtsi
index d6fea37..a555d24 100644
--- a/arch/powerpc/boot/dts/fsl/p3041si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/p3041si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10 0>;
+};
+
 &lbc {
 	compatible = "fsl,p3041-elbc", "fsl,elbc", "simple-bus";
 	interrupts = <25 2 0 0>;
@@ -250,6 +260,8 @@
 
 /include/ "qoriq-bman1-portals.dtsi"
 
+/include/ "qoriq-qman1-portals.dtsi"
+
 &soc {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -442,5 +454,6 @@ crypto: crypto@300000 {
 		fsl,iommu-parent = <&pamu1>;
 	};
 
+/include/ "qoriq-qman1.dtsi"
 /include/ "qoriq-bman1.dtsi"
 };
diff --git a/arch/powerpc/boot/dts/fsl/p4080si-post.dtsi b/arch/powerpc/boot/dts/fsl/p4080si-post.dtsi
index 89482c9..0fe7281 100644
--- a/arch/powerpc/boot/dts/fsl/p4080si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/p4080si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10 0>;
+};
+
 &lbc {
 	compatible = "fsl,p4080-elbc", "fsl,elbc", "simple-bus";
 	interrupts = <25 2 0 0>;
@@ -250,6 +260,8 @@
 
 /include/ "qoriq-bman1-portals.dtsi"
 
+/include/ "qoriq-qman1-portals.dtsi"
+
 &soc {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -498,5 +510,6 @@ crypto: crypto@300000 {
 		fsl,iommu-parent = <&pamu1>;
 	};
 
+/include/ "qoriq-qman1.dtsi"
 /include/ "qoriq-bman1.dtsi"
 };
diff --git a/arch/powerpc/boot/dts/fsl/p5020si-post.dtsi b/arch/powerpc/boot/dts/fsl/p5020si-post.dtsi
index 6e04851..a34ca20 100644
--- a/arch/powerpc/boot/dts/fsl/p5020si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/p5020si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10000 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
 &lbc {
 	compatible = "fsl,p5020-elbc", "fsl,elbc", "simple-bus";
 	interrupts = <25 2 0 0>;
@@ -247,6 +257,8 @@
 
 /include/ "qoriq-bman1-portals.dtsi"
 
+/include/ "qoriq-qman1-portals.dtsi"
+
 &soc {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -428,6 +440,7 @@
 		fsl,iommu-parent = <&pamu1>;
 	};
 
+/include/ "qoriq-qman1.dtsi"
 /include/ "qoriq-bman1.dtsi"
 
 /include/ "qoriq-raid1.0-0.dtsi"
diff --git a/arch/powerpc/boot/dts/fsl/p5040si-post.dtsi b/arch/powerpc/boot/dts/fsl/p5040si-post.dtsi
index 5e44dfa..bf57513 100644
--- a/arch/powerpc/boot/dts/fsl/p5040si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/p5040si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10000 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
 &lbc {
 	compatible = "fsl,p5040-elbc", "fsl,elbc", "simple-bus";
 	interrupts = <25 2 0 0>;
@@ -202,6 +212,8 @@
 
 /include/ "qoriq-bman1-portals.dtsi"
 
+/include/ "qoriq-qman1-portals.dtsi"
+
 &soc {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -407,5 +419,6 @@
 		fsl,iommu-parent = <&pamu4>;
 	};
 
+/include/ "qoriq-qman1.dtsi"
 /include/ "qoriq-bman1.dtsi"
 };
diff --git a/arch/powerpc/boot/dts/fsl/qoriq-qman1-portals.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-qman1-portals.dtsi
index 05d51ac..e77e4b4 100644
--- a/arch/powerpc/boot/dts/fsl/qoriq-qman1-portals.dtsi
+++ b/arch/powerpc/boot/dts/fsl/qoriq-qman1-portals.dtsi
@@ -41,61 +41,61 @@
 		compatible = "fsl,qman-portal";
 		reg = <0x0 0x4000>, <0x100000 0x1000>;
 		interrupts = <104 2 0 0>;
-		fsl,qman-channel-id = <0x0>;
+		cell-index = <0x0>;
 	};
 	qportal1: qman-portal@4000 {
 		compatible = "fsl,qman-portal";
 		reg = <0x4000 0x4000>, <0x101000 0x1000>;
 		interrupts = <106 2 0 0>;
-		fsl,qman-channel-id = <1>;
+		cell-index = <1>;
 	};
 	qportal2: qman-portal@8000 {
 		compatible = "fsl,qman-portal";
 		reg = <0x8000 0x4000>, <0x102000 0x1000>;
 		interrupts = <108 2 0 0>;
-		fsl,qman-channel-id = <2>;
+		cell-index = <2>;
 	};
 	qportal3: qman-portal@c000 {
 		compatible = "fsl,qman-portal";
 		reg = <0xc000 0x4000>, <0x103000 0x1000>;
 		interrupts = <110 2 0 0>;
-		fsl,qman-channel-id = <3>;
+		cell-index = <3>;
 	};
 	qportal4: qman-portal@10000 {
 		compatible = "fsl,qman-portal";
 		reg = <0x10000 0x4000>, <0x104000 0x1000>;
 		interrupts = <112 2 0 0>;
-		fsl,qman-channel-id = <4>;
+		cell-index = <4>;
 	};
 	qportal5: qman-portal@14000 {
 		compatible = "fsl,qman-portal";
 		reg = <0x14000 0x4000>, <0x105000 0x1000>;
 		interrupts = <114 2 0 0>;
-		fsl,qman-channel-id = <5>;
+		cell-index = <5>;
 	};
 	qportal6: qman-portal@18000 {
 		compatible = "fsl,qman-portal";
 		reg = <0x18000 0x4000>, <0x106000 0x1000>;
 		interrupts = <116 2 0 0>;
-		fsl,qman-channel-id = <6>;
+		cell-index = <6>;
 	};
 
 	qportal7: qman-portal@1c000 {
 		compatible = "fsl,qman-portal";
 		reg = <0x1c000 0x4000>, <0x107000 0x1000>;
 		interrupts = <118 2 0 0>;
-		fsl,qman-channel-id = <7>;
+		cell-index = <7>;
 	};
 	qportal8: qman-portal@20000 {
 		compatible = "fsl,qman-portal";
 		reg = <0x20000 0x4000>, <0x108000 0x1000>;
 		interrupts = <120 2 0 0>;
-		fsl,qman-channel-id = <8>;
+		cell-index = <8>;
 	};
 	qportal9: qman-portal@24000 {
 		compatible = "fsl,qman-portal";
 		reg = <0x24000 0x4000>, <0x109000 0x1000>;
 		interrupts = <122 2 0 0>;
-		fsl,qman-channel-id = <9>;
+		cell-index = <9>;
 	};
 };
diff --git a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi
index 5cc01be..9e9f7e2 100644
--- a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10000 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
 &ifc {
 	#address-cells = <2>;
 	#size-cells = <1>;
@@ -280,6 +290,73 @@
 	};
 };
 
+&qportals {
+	#address-cells = <0x1>;
+	#size-cells = <0x1>;
+	compatible = "simple-bus";
+
+	qportal0: qman-portal@0 {
+		compatible = "fsl,qman-portal";
+		reg = <0x0 0x4000>, <0x1000000 0x1000>;
+		interrupts = <104 0x2 0 0>;
+		cell-index = <0x0>;
+	};
+	qportal1: qman-portal@4000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x4000 0x4000>, <0x1001000 0x1000>;
+		interrupts = <106 0x2 0 0>;
+		cell-index = <0x1>;
+	};
+	qportal2: qman-portal@8000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x8000 0x4000>, <0x1002000 0x1000>;
+		interrupts = <108 0x2 0 0>;
+		cell-index = <0x2>;
+	};
+	qportal3: qman-portal@c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xc000 0x4000>, <0x1003000 0x1000>;
+		interrupts = <110 0x2 0 0>;
+		cell-index = <0x3>;
+	};
+	qportal4: qman-portal@10000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x10000 0x4000>, <0x1004000 0x1000>;
+		interrupts = <112 0x2 0 0>;
+		cell-index = <0x4>;
+	};
+	qportal5: qman-portal@14000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x14000 0x4000>, <0x1005000 0x1000>;
+		interrupts = <114 0x2 0 0>;
+		cell-index = <0x5>;
+	};
+	qportal6: qman-portal@18000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x18000 0x4000>, <0x1006000 0x1000>;
+		interrupts = <116 0x2 0 0>;
+		cell-index = <0x6>;
+	};
+	qportal7: qman-portal@1c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x1c000 0x4000>, <0x1007000 0x1000>;
+		interrupts = <118 0x2 0 0>;
+		cell-index = <0x7>;
+	};
+	qportal8: qman-portal@20000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x20000 0x4000>, <0x1008000 0x1000>;
+		interrupts = <120 0x2 0 0>;
+		cell-index = <0x8>;
+	};
+	qportal9: qman-portal@24000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x24000 0x4000>, <0x1009000 0x1000>;
+		interrupts = <122 0x2 0 0>;
+		cell-index = <0x9>;
+	};
+};
+
 &soc {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -463,5 +540,6 @@
 		fsl,liodn-reg = <&guts 0x554>; /* SATA2LIODNR */
 	};
 /include/ "qoriq-sec5.0-0.dtsi"
+/include/ "qoriq-qman3.dtsi"
 /include/ "qoriq-bman1.dtsi"
 };
diff --git a/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi b/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi
index c9ad929..32c790a 100644
--- a/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10000 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
 &ifc {
 	#address-cells = <2>;
 	#size-cells = <1>;
@@ -326,6 +336,121 @@
 	};
 };
 
+&qportals {
+	#address-cells = <0x1>;
+	#size-cells = <0x1>;
+	compatible = "simple-bus";
+
+	qportal0: qman-portal@0 {
+		compatible = "fsl,qman-portal";
+		reg = <0x0 0x4000>, <0x1000000 0x1000>;
+		interrupts = <104 0x2 0 0>;
+		cell-index = <0x0>;
+	};
+	qportal1: qman-portal@4000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x4000 0x4000>, <0x1001000 0x1000>;
+		interrupts = <106 0x2 0 0>;
+		cell-index = <0x1>;
+	};
+	qportal2: qman-portal@8000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x8000 0x4000>, <0x1002000 0x1000>;
+		interrupts = <108 0x2 0 0>;
+		cell-index = <0x2>;
+	};
+	qportal3: qman-portal@c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xc000 0x4000>, <0x1003000 0x1000>;
+		interrupts = <110 0x2 0 0>;
+		cell-index = <0x3>;
+	};
+	qportal4: qman-portal@10000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x10000 0x4000>, <0x1004000 0x1000>;
+		interrupts = <112 0x2 0 0>;
+		cell-index = <0x4>;
+	};
+	qportal5: qman-portal@14000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x14000 0x4000>, <0x1005000 0x1000>;
+		interrupts = <114 0x2 0 0>;
+		cell-index = <0x5>;
+	};
+	qportal6: qman-portal@18000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x18000 0x4000>, <0x1006000 0x1000>;
+		interrupts = <116 0x2 0 0>;
+		cell-index = <0x6>;
+	};
+	qportal7: qman-portal@1c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x1c000 0x4000>, <0x1007000 0x1000>;
+		interrupts = <118 0x2 0 0>;
+		cell-index = <0x7>;
+	};
+	qportal8: qman-portal@20000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x20000 0x4000>, <0x1008000 0x1000>;
+		interrupts = <120 0x2 0 0>;
+		cell-index = <0x8>;
+	};
+	qportal9: qman-portal@24000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x24000 0x4000>, <0x1009000 0x1000>;
+		interrupts = <122 0x2 0 0>;
+		cell-index = <0x9>;
+	};
+	qportal10: qman-portal@28000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x28000 0x4000>, <0x100a000 0x1000>;
+		interrupts = <124 0x2 0 0>;
+		cell-index = <0xa>;
+	};
+	qportal11: qman-portal@2c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x2c000 0x4000>, <0x100b000 0x1000>;
+		interrupts = <126 0x2 0 0>;
+		cell-index = <0xb>;
+	};
+	qportal12: qman-portal@30000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x30000 0x4000>, <0x100c000 0x1000>;
+		interrupts = <128 0x2 0 0>;
+		cell-index = <0xc>;
+	};
+	qportal13: qman-portal@34000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x34000 0x4000>, <0x100d000 0x1000>;
+		interrupts = <130 0x2 0 0>;
+		cell-index = <0xd>;
+	};
+	qportal14: qman-portal@38000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x38000 0x4000>, <0x100e000 0x1000>;
+		interrupts = <132 0x2 0 0>;
+		cell-index = <0xe>;
+	};
+	qportal15: qman-portal@3c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x3c000 0x4000>, <0x100f000 0x1000>;
+		interrupts = <134 0x2 0 0>;
+		cell-index = <0xf>;
+	};
+	qportal16: qman-portal@40000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x40000 0x4000>, <0x1010000 0x1000>;
+		interrupts = <136 0x2 0 0>;
+		cell-index = <0x10>;
+	};
+	qportal17: qman-portal@44000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x44000 0x4000>, <0x1011000 0x1000>;
+		interrupts = <138 0x2 0 0>;
+		cell-index = <0x11>;
+	};
+};
+
 &soc {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -502,6 +627,7 @@
 		phy_type = "utmi";
 	};
 /include/ "qoriq-sec5.2-0.dtsi"
+/include/ "qoriq-qman3.dtsi"
 /include/ "qoriq-bman1.dtsi"
 
 	L2_1: l2-cache-controller@c20000 {
diff --git a/arch/powerpc/boot/dts/fsl/t4240si-post.dtsi b/arch/powerpc/boot/dts/fsl/t4240si-post.dtsi
index 4d4f258..d806360 100644
--- a/arch/powerpc/boot/dts/fsl/t4240si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t4240si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10000 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
 &ifc {
 	#address-cells = <2>;
 	#size-cells = <1>;
@@ -556,6 +566,313 @@
 	};
 };
 
+&qportals {
+	#address-cells = <0x1>;
+	#size-cells = <0x1>;
+	compatible = "simple-bus";
+
+	qportal0: qman-portal@0 {
+		compatible = "fsl,qman-portal";
+		reg = <0x0 0x4000>, <0x1000000 0x1000>;
+		interrupts = <104 0x2 0 0>;
+		cell-index = <0x0>;
+	};
+	qportal1: qman-portal@4000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x4000 0x4000>, <0x1001000 0x1000>;
+		interrupts = <106 0x2 0 0>;
+		cell-index = <0x1>;
+	};
+	qportal2: qman-portal@8000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x8000 0x4000>, <0x1002000 0x1000>;
+		interrupts = <108 0x2 0 0>;
+		cell-index = <0x2>;
+	};
+	qportal3: qman-portal@c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xc000 0x4000>, <0x1003000 0x1000>;
+		interrupts = <110 0x2 0 0>;
+		cell-index = <0x3>;
+	};
+	qportal4: qman-portal@10000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x10000 0x4000>, <0x1004000 0x1000>;
+		interrupts = <112 0x2 0 0>;
+		cell-index = <0x4>;
+	};
+	qportal5: qman-portal@14000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x14000 0x4000>, <0x1005000 0x1000>;
+		interrupts = <114 0x2 0 0>;
+		cell-index = <0x5>;
+	};
+	qportal6: qman-portal@18000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x18000 0x4000>, <0x1006000 0x1000>;
+		interrupts = <116 0x2 0 0>;
+		cell-index = <0x6>;
+	};
+	qportal7: qman-portal@1c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x1c000 0x4000>, <0x1007000 0x1000>;
+		interrupts = <118 0x2 0 0>;
+		cell-index = <0x7>;
+	};
+	qportal8: qman-portal@20000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x20000 0x4000>, <0x1008000 0x1000>;
+		interrupts = <120 0x2 0 0>;
+		cell-index = <0x8>;
+	};
+	qportal9: qman-portal@24000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x24000 0x4000>, <0x1009000 0x1000>;
+		interrupts = <122 0x2 0 0>;
+		cell-index = <0x9>;
+	};
+	qportal10: qman-portal@28000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x28000 0x4000>, <0x100a000 0x1000>;
+		interrupts = <124 0x2 0 0>;
+		cell-index = <0xa>;
+	};
+	qportal11: qman-portal@2c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x2c000 0x4000>, <0x100b000 0x1000>;
+		interrupts = <126 0x2 0 0>;
+		cell-index = <0xb>;
+	};
+	qportal12: qman-portal@30000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x30000 0x4000>, <0x100c000 0x1000>;
+		interrupts = <128 0x2 0 0>;
+		cell-index = <0xc>;
+	};
+	qportal13: qman-portal@34000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x34000 0x4000>, <0x100d000 0x1000>;
+		interrupts = <130 0x2 0 0>;
+		cell-index = <0xd>;
+	};
+	qportal14: qman-portal@38000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x38000 0x4000>, <0x100e000 0x1000>;
+		interrupts = <132 0x2 0 0>;
+		cell-index = <0xe>;
+	};
+	qportal15: qman-portal@3c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x3c000 0x4000>, <0x100f000 0x1000>;
+		interrupts = <134 0x2 0 0>;
+		cell-index = <0xf>;
+	};
+	qportal16: qman-portal@40000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x40000 0x4000>, <0x1010000 0x1000>;
+		interrupts = <136 0x2 0 0>;
+		cell-index = <0x10>;
+	};
+	qportal17: qman-portal@44000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x44000 0x4000>, <0x1011000 0x1000>;
+		interrupts = <138 0x2 0 0>;
+		cell-index = <0x11>;
+	};
+	qportal18: qman-portal@48000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x48000 0x4000>, <0x1012000 0x1000>;
+		interrupts = <140 0x2 0 0>;
+		cell-index = <0x12>;
+	};
+	qportal19: qman-portal@4c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x4c000 0x4000>, <0x1013000 0x1000>;
+		interrupts = <142 0x2 0 0>;
+		cell-index = <0x13>;
+	};
+	qportal20: qman-portal@50000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x50000 0x4000>, <0x1014000 0x1000>;
+		interrupts = <144 0x2 0 0>;
+		cell-index = <0x14>;
+	};
+	qportal21: qman-portal@54000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x54000 0x4000>, <0x1015000 0x1000>;
+		interrupts = <146 0x2 0 0>;
+		cell-index = <0x15>;
+	};
+	qportal22: qman-portal@58000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x58000 0x4000>, <0x1016000 0x1000>;
+		interrupts = <148 0x2 0 0>;
+		cell-index = <0x16>;
+	};
+	qportal23: qman-portal@5c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x5c000 0x4000>, <0x1017000 0x1000>;
+		interrupts = <150 0x2 0 0>;
+		cell-index = <0x17>;
+	};
+	qportal24: qman-portal@60000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x60000 0x4000>, <0x1018000 0x1000>;
+		interrupts = <152 0x2 0 0>;
+		cell-index = <0x18>;
+	};
+	qportal25: qman-portal@64000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x64000 0x4000>, <0x1019000 0x1000>;
+		interrupts = <154 0x2 0 0>;
+		cell-index = <0x19>;
+	};
+	qportal26: qman-portal@68000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x68000 0x4000>, <0x101a000 0x1000>;
+		interrupts = <156 0x2 0 0>;
+		cell-index = <0x1a>;
+	};
+	qportal27: qman-portal@6c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x6c000 0x4000>, <0x101b000 0x1000>;
+		interrupts = <158 0x2 0 0>;
+		cell-index = <0x1b>;
+	};
+	qportal28: qman-portal@70000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x70000 0x4000>, <0x101c000 0x1000>;
+		interrupts = <160 0x2 0 0>;
+		cell-index = <0x1c>;
+	};
+	qportal29: qman-portal@74000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x74000 0x4000>, <0x101d000 0x1000>;
+		interrupts = <162 0x2 0 0>;
+		cell-index = <0x1d>;
+	};
+	qportal30: qman-portal@78000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x78000 0x4000>, <0x101e000 0x1000>;
+		interrupts = <164 0x2 0 0>;
+		cell-index = <0x1e>;
+	};
+	qportal31: qman-portal@7c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x7c000 0x4000>, <0x101f000 0x1000>;
+		interrupts = <166 0x2 0 0>;
+		cell-index = <0x1f>;
+	};
+	qportal32: qman-portal@80000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x80000 0x4000>, <0x1020000 0x1000>;
+		interrupts = <168 0x2 0 0>;
+		cell-index = <0x20>;
+	};
+	qportal33: qman-portal@84000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x84000 0x4000>, <0x1021000 0x1000>;
+		interrupts = <170 0x2 0 0>;
+		cell-index = <0x21>;
+	};
+	qportal34: qman-portal@88000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x88000 0x4000>, <0x1022000 0x1000>;
+		interrupts = <172 0x2 0 0>;
+		cell-index = <0x22>;
+	};
+	qportal35: qman-portal@8c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x8c000 0x4000>, <0x1023000 0x1000>;
+		interrupts = <174 0x2 0 0>;
+		cell-index = <0x23>;
+	};
+	qportal36: qman-portal@90000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x90000 0x4000>, <0x1024000 0x1000>;
+		interrupts = <384 0x2 0 0>;
+		cell-index = <0x24>;
+	};
+	qportal37: qman-portal@94000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x94000 0x4000>, <0x1025000 0x1000>;
+		interrupts = <386 0x2 0 0>;
+		cell-index = <0x25>;
+	};
+	qportal38: qman-portal@98000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x98000 0x4000>, <0x1026000 0x1000>;
+		interrupts = <388 0x2 0 0>;
+		cell-index = <0x26>;
+	};
+	qportal39: qman-portal@9c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x9c000 0x4000>, <0x1027000 0x1000>;
+		interrupts = <390 0x2 0 0>;
+		cell-index = <0x27>;
+	};
+	qportal40: qman-portal@a0000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xa0000 0x4000>, <0x1028000 0x1000>;
+		interrupts = <392 0x2 0 0>;
+		cell-index = <0x28>;
+	};
+	qportal41: qman-portal@a4000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xa4000 0x4000>, <0x1029000 0x1000>;
+		interrupts = <394 0x2 0 0>;
+		cell-index = <0x29>;
+	};
+	qportal42: qman-portal@a8000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xa8000 0x4000>, <0x102a000 0x1000>;
+		interrupts = <396 0x2 0 0>;
+		cell-index = <0x2a>;
+	};
+	qportal43: qman-portal@ac000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xac000 0x4000>, <0x102b000 0x1000>;
+		interrupts = <398 0x2 0 0>;
+		cell-index = <0x2b>;
+	};
+	qportal44: qman-portal@b0000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xb0000 0x4000>, <0x102c000 0x1000>;
+		interrupts = <400 0x2 0 0>;
+		cell-index = <0x2c>;
+	};
+	qportal45: qman-portal@b4000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xb4000 0x4000>, <0x102d000 0x1000>;
+		interrupts = <402 0x2 0 0>;
+		cell-index = <0x2d>;
+	};
+	qportal46: qman-portal@b8000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xb8000 0x4000>, <0x102e000 0x1000>;
+		interrupts = <404 0x2 0 0>;
+		cell-index = <0x2e>;
+	};
+	qportal47: qman-portal@bc000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xbc000 0x4000>, <0x102f000 0x1000>;
+		interrupts = <406 0x2 0 0>;
+		cell-index = <0x2f>;
+	};
+	qportal48: qman-portal@c0000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xc0000 0x4000>, <0x1030000 0x1000>;
+		interrupts = <408 0x2 0 0>;
+		cell-index = <0x30>;
+	};
+	qportal49: qman-portal@c4000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xc4000 0x4000>, <0x1031000 0x1000>;
+		interrupts = <410 0x2 0 0>;
+		cell-index = <0x31>;
+	};
+};
+
 &soc {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -748,6 +1065,7 @@
 /include/ "qoriq-sata2-0.dtsi"
 /include/ "qoriq-sata2-1.dtsi"
 /include/ "qoriq-sec5.0-0.dtsi"
+/include/ "qoriq-qman3.dtsi"
 /include/ "qoriq-bman1.dtsi"
 
 	L2_1: l2-cache-controller@c20000 {
diff --git a/arch/powerpc/boot/dts/kmcoge4.dts b/arch/powerpc/boot/dts/kmcoge4.dts
index 97e6d11..48dab6a 100644
--- a/arch/powerpc/boot/dts/kmcoge4.dts
+++ b/arch/powerpc/boot/dts/kmcoge4.dts
@@ -34,6 +34,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -44,6 +52,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x200000>;
 	};
 
+	qportals: qman-portals@ff4200000 {
+		ranges = <0x0 0xf 0xf4200000 0x200000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/oca4080.dts b/arch/powerpc/boot/dts/oca4080.dts
index eb76caa..42796c5 100644
--- a/arch/powerpc/boot/dts/oca4080.dts
+++ b/arch/powerpc/boot/dts/oca4080.dts
@@ -58,6 +58,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -68,6 +76,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x200000>;
 	};
 
+	qportals: qman-portals@ff4200000 {
+		ranges = <0x0 0xf 0xf4200000 0x200000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/p1023rdb.dts b/arch/powerpc/boot/dts/p1023rdb.dts
index 9236e37..05a00a4 100644
--- a/arch/powerpc/boot/dts/p1023rdb.dts
+++ b/arch/powerpc/boot/dts/p1023rdb.dts
@@ -56,6 +56,18 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
+	};
+
+	qportals: qman-portals@ff000000 {
+		ranges = <0x0 0xf 0xff000000 0x200000>;
 	};
 
 	bportals: bman-portals@ff200000 {
diff --git a/arch/powerpc/boot/dts/p2041rdb.dts b/arch/powerpc/boot/dts/p2041rdb.dts
index c1e69dc..d2bb0765 100644
--- a/arch/powerpc/boot/dts/p2041rdb.dts
+++ b/arch/powerpc/boot/dts/p2041rdb.dts
@@ -54,6 +54,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -64,6 +72,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x200000>;
 	};
 
+	qportals: qman-portals@ff4200000 {
+		ranges = <0x0 0xf 0xf4200000 0x200000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/p3041ds.dts b/arch/powerpc/boot/dts/p3041ds.dts
index 2192fe9..eca6c69 100644
--- a/arch/powerpc/boot/dts/p3041ds.dts
+++ b/arch/powerpc/boot/dts/p3041ds.dts
@@ -54,6 +54,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -64,6 +72,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x200000>;
 	};
 
+	qportals: qman-portals@ff4200000 {
+		ranges = <0x0 0xf 0xf4200000 0x200000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/p4080ds.dts b/arch/powerpc/boot/dts/p4080ds.dts
index fad4416..4f80c9d 100644
--- a/arch/powerpc/boot/dts/p4080ds.dts
+++ b/arch/powerpc/boot/dts/p4080ds.dts
@@ -54,6 +54,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -64,6 +72,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x200000>;
 	};
 
+	qportals: qman-portals@ff4200000 {
+		ranges = <0x0 0xf 0xf4200000 0x200000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/p5020ds.dts b/arch/powerpc/boot/dts/p5020ds.dts
index 7382636..d0309a8 100644
--- a/arch/powerpc/boot/dts/p5020ds.dts
+++ b/arch/powerpc/boot/dts/p5020ds.dts
@@ -54,6 +54,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -64,6 +72,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x200000>;
 	};
 
+	qportals: qman-portals@ff4200000 {
+		ranges = <0x0 0xf 0xf4200000 0x200000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/p5040ds.dts b/arch/powerpc/boot/dts/p5040ds.dts
index 35dabf5..0516823 100644
--- a/arch/powerpc/boot/dts/p5040ds.dts
+++ b/arch/powerpc/boot/dts/p5040ds.dts
@@ -54,6 +54,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -64,6 +72,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x200000>;
 	};
 
+	qportals: qman-portals@ff4200000 {
+		ranges = <0x0 0xf 0xf4200000 0x200000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/t104xqds.dtsi b/arch/powerpc/boot/dts/t104xqds.dtsi
index f7e9bfb..1498d1e 100644
--- a/arch/powerpc/boot/dts/t104xqds.dtsi
+++ b/arch/powerpc/boot/dts/t104xqds.dtsi
@@ -47,6 +47,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	ifc: localbus@ffe124000 {
@@ -92,6 +100,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x2000000>;
 	};
 
+	qportals: qman-portals@ff6000000 {
+		ranges = <0x0 0xf 0xf6000000 0x2000000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/t104xrdb.dtsi b/arch/powerpc/boot/dts/t104xrdb.dtsi
index 76e07a3..830ea48 100644
--- a/arch/powerpc/boot/dts/t104xrdb.dtsi
+++ b/arch/powerpc/boot/dts/t104xrdb.dtsi
@@ -42,6 +42,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	ifc: localbus@ffe124000 {
@@ -83,6 +91,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x2000000>;
 	};
 
+	qportals: qman-portals@ff6000000 {
+		ranges = <0x0 0xf 0xf6000000 0x2000000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/t208xqds.dtsi b/arch/powerpc/boot/dts/t208xqds.dtsi
index c42e07f..869f915 100644
--- a/arch/powerpc/boot/dts/t208xqds.dtsi
+++ b/arch/powerpc/boot/dts/t208xqds.dtsi
@@ -48,6 +48,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	ifc: localbus@ffe124000 {
@@ -93,6 +101,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x2000000>;
 	};
 
+	qportals: qman-portals@ff6000000 {
+		ranges = <0x0 0xf 0xf6000000 0x2000000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/t208xrdb.dtsi b/arch/powerpc/boot/dts/t208xrdb.dtsi
index e1463b1..693d2a8 100644
--- a/arch/powerpc/boot/dts/t208xrdb.dtsi
+++ b/arch/powerpc/boot/dts/t208xrdb.dtsi
@@ -48,6 +48,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	ifc: localbus@ffe124000 {
@@ -94,6 +102,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x2000000>;
 	};
 
+	qportals: qman-portals@ff6000000 {
+		ranges = <0x0 0xf 0xf6000000 0x2000000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/t4240qds.dts b/arch/powerpc/boot/dts/t4240qds.dts
index 6df7776..93722da 100644
--- a/arch/powerpc/boot/dts/t4240qds.dts
+++ b/arch/powerpc/boot/dts/t4240qds.dts
@@ -109,6 +109,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -119,6 +127,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x2000000>;
 	};
 
+	qportals: qman-portals@ff6000000 {
+		ranges = <0x0 0xf 0xf6000000 0x2000000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/t4240rdb.dts b/arch/powerpc/boot/dts/t4240rdb.dts
index 46049cf..993eb4b 100644
--- a/arch/powerpc/boot/dts/t4240rdb.dts
+++ b/arch/powerpc/boot/dts/t4240rdb.dts
@@ -78,6 +78,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -88,6 +96,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x2000000>;
 	};
 
+	qportals: qman-portals@ff6000000 {
+		ranges = <0x0 0xf 0xf6000000 0x2000000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
-- 
cgit v0.10.2


From 90883a8255148e98026591e52a3495db18b29905 Mon Sep 17 00:00:00 2001
From: LEROY Christophe <christophe.leroy@c-s.fr>
Date: Mon, 20 Apr 2015 07:54:38 +0200
Subject: powerpc/8xx: macro for handling CPU15 errata

Having a macro will help keep clear code.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 9b53fe1..1279018 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -297,6 +297,17 @@ SystemCall:
  * We have to use the MD_xxx registers for the tablewalk because the
  * equivalent MI_xxx registers only perform the attribute functions.
  */
+
+#ifdef CONFIG_8xx_CPU15
+#define INVALIDATE_ADJACENT_PAGES_CPU15(tmp, addr)	\
+	addi	tmp, addr, PAGE_SIZE;	\
+	tlbie	tmp;			\
+	addi	tmp, addr, -PAGE_SIZE;	\
+	tlbie	tmp
+#else
+#define INVALIDATE_ADJACENT_PAGES_CPU15(tmp, addr)
+#endif
+
 InstructionTLBMiss:
 #ifdef CONFIG_8xx_CPU6
 	mtspr	SPRN_DAR, r3
@@ -304,12 +315,7 @@ InstructionTLBMiss:
 	EXCEPTION_PROLOG_0
 	mtspr	SPRN_SPRG_SCRATCH2, r10
 	mfspr	r10, SPRN_SRR0	/* Get effective address of fault */
-#ifdef CONFIG_8xx_CPU15
-	addi	r11, r10, PAGE_SIZE
-	tlbie	r11
-	addi	r11, r10, -PAGE_SIZE
-	tlbie	r11
-#endif
+	INVALIDATE_ADJACENT_PAGES_CPU15(r11, r10)
 
 	/* If we are faulting a kernel address, we have to use the
 	 * kernel page tables.
-- 
cgit v0.10.2


From d5fd9d7d669e81e024737a198d12ff8af4fd89cf Mon Sep 17 00:00:00 2001
From: LEROY Christophe <christophe.leroy@c-s.fr>
Date: Mon, 20 Apr 2015 07:54:40 +0200
Subject: powerpc/8xx: Handle CR out of exception PROLOG/EPILOG

In order to be able to reduce scope during which CR is saved, we take
CR saving/restoring out of exception PROLOG and EPILOG

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 1279018..5a69c5e 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -116,13 +116,13 @@ turn_on_mmu:
  */
 #define EXCEPTION_PROLOG	\
 	EXCEPTION_PROLOG_0;	\
+	mfcr	r10;		\
 	EXCEPTION_PROLOG_1;	\
 	EXCEPTION_PROLOG_2
 
 #define EXCEPTION_PROLOG_0	\
 	mtspr	SPRN_SPRG_SCRATCH0,r10;	\
-	mtspr	SPRN_SPRG_SCRATCH1,r11;	\
-	mfcr	r10
+	mtspr	SPRN_SPRG_SCRATCH1,r11
 
 #define EXCEPTION_PROLOG_1	\
 	mfspr	r11,SPRN_SRR1;		/* check whether user or kernel */ \
@@ -162,7 +162,6 @@ turn_on_mmu:
  * Exception exit code.
  */
 #define EXCEPTION_EPILOG_0	\
-	mtcr	r10;		\
 	mfspr	r10,SPRN_SPRG_SCRATCH0;	\
 	mfspr	r11,SPRN_SPRG_SCRATCH1
 
@@ -313,6 +312,7 @@ InstructionTLBMiss:
 	mtspr	SPRN_DAR, r3
 #endif
 	EXCEPTION_PROLOG_0
+	mfcr	r10
 	mtspr	SPRN_SPRG_SCRATCH2, r10
 	mfspr	r10, SPRN_SRR0	/* Get effective address of fault */
 	INVALIDATE_ADJACENT_PAGES_CPU15(r11, r10)
@@ -363,6 +363,7 @@ InstructionTLBMiss:
 	mtspr	SPRN_DAR, r11	/* Tag DAR */
 #endif
 	mfspr	r10, SPRN_SPRG_SCRATCH2
+	mtcr	r10
 	EXCEPTION_EPILOG_0
 	rfi
 
@@ -372,6 +373,7 @@ DataStoreTLBMiss:
 	mtspr	SPRN_DAR, r3
 #endif
 	EXCEPTION_PROLOG_0
+	mfcr	r10
 	mtspr	SPRN_SPRG_SCRATCH2, r10
 	mfspr	r10, SPRN_MD_EPN
 
@@ -437,6 +439,7 @@ DataStoreTLBMiss:
 #endif
 	mtspr	SPRN_DAR, r11	/* Tag DAR */
 	mfspr	r10, SPRN_SPRG_SCRATCH2
+	mtcr	r10
 	EXCEPTION_EPILOG_0
 	rfi
 
@@ -462,6 +465,7 @@ InstructionTLBError:
 	. = 0x1400
 DataTLBError:
 	EXCEPTION_PROLOG_0
+	mfcr	r10
 
 	mfspr	r11, SPRN_DAR
 	cmpwi	cr0, r11, RPN_PATTERN
-- 
cgit v0.10.2


From 2eb2fd95001e93c611034f494c7254350ff94d2a Mon Sep 17 00:00:00 2001
From: LEROY Christophe <christophe.leroy@c-s.fr>
Date: Mon, 20 Apr 2015 07:54:42 +0200
Subject: powerpc/8xx: dont save CR in SCRATCH registers

CR only needs to be preserved when checking if we are handling a kernel address.
So we can preserve CR in a register:
- In ITLBMiss, check is done only when CONFIG_MODULES is defined. Otherwise we
don't need to do anything at all with CR.
- We use r10, then we reload SRR0/MD_EPN into r10 when CR is restored

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 5a69c5e..150d03f 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -312,10 +312,6 @@ InstructionTLBMiss:
 	mtspr	SPRN_DAR, r3
 #endif
 	EXCEPTION_PROLOG_0
-	mfcr	r10
-	mtspr	SPRN_SPRG_SCRATCH2, r10
-	mfspr	r10, SPRN_SRR0	/* Get effective address of fault */
-	INVALIDATE_ADJACENT_PAGES_CPU15(r11, r10)
 
 	/* If we are faulting a kernel address, we have to use the
 	 * kernel page tables.
@@ -323,13 +319,20 @@ InstructionTLBMiss:
 #ifdef CONFIG_MODULES
 	/* Only modules will cause ITLB Misses as we always
 	 * pin the first 8MB of kernel memory */
-	andis.	r11, r10, 0x8000	/* Address >= 0x80000000 */
-#endif
+	mfspr	r11, SPRN_SRR0	/* Get effective address of fault */
+	INVALIDATE_ADJACENT_PAGES_CPU15(r10, r11)
+	mfcr	r10
+	andis.	r11, r11, 0x8000	/* Address >= 0x80000000 */
 	mfspr	r11, SPRN_M_TW	/* Get level 1 table */
-#ifdef CONFIG_MODULES
 	beq	3f
 	lis	r11, (swapper_pg_dir-PAGE_OFFSET)@ha
 3:
+	mtcr	r10
+	mfspr	r10, SPRN_SRR0	/* Get effective address of fault */
+#else
+	mfspr	r10, SPRN_SRR0	/* Get effective address of fault */
+	INVALIDATE_ADJACENT_PAGES_CPU15(r11, r10)
+	mfspr	r11, SPRN_M_TW	/* Get level 1 table base address */
 #endif
 	/* Insert level 1 index */
 	rlwimi	r11, r10, 32 - ((PAGE_SHIFT - 2) << 1), (PAGE_SHIFT - 2) << 1, 29
@@ -362,8 +365,6 @@ InstructionTLBMiss:
 	mfspr	r3, SPRN_DAR
 	mtspr	SPRN_DAR, r11	/* Tag DAR */
 #endif
-	mfspr	r10, SPRN_SPRG_SCRATCH2
-	mtcr	r10
 	EXCEPTION_EPILOG_0
 	rfi
 
@@ -374,17 +375,19 @@ DataStoreTLBMiss:
 #endif
 	EXCEPTION_PROLOG_0
 	mfcr	r10
-	mtspr	SPRN_SPRG_SCRATCH2, r10
-	mfspr	r10, SPRN_MD_EPN
 
 	/* If we are faulting a kernel address, we have to use the
 	 * kernel page tables.
 	 */
-	andis.	r11, r10, 0x8000
+	mfspr	r11, SPRN_MD_EPN
+	andis.	r11, r11, 0x8000
 	mfspr	r11, SPRN_M_TW	/* Get level 1 table */
 	beq	3f
 	lis	r11, (swapper_pg_dir-PAGE_OFFSET)@ha
 3:
+	mtcr	r10
+	mfspr	r10, SPRN_MD_EPN
+
 	/* Insert level 1 index */
 	rlwimi	r11, r10, 32 - ((PAGE_SHIFT - 2) << 1), (PAGE_SHIFT - 2) << 1, 29
 	lwz	r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11)	/* Get the level 1 entry */
@@ -438,8 +441,6 @@ DataStoreTLBMiss:
 	mfspr	r3, SPRN_DAR
 #endif
 	mtspr	SPRN_DAR, r11	/* Tag DAR */
-	mfspr	r10, SPRN_SPRG_SCRATCH2
-	mtcr	r10
 	EXCEPTION_EPILOG_0
 	rfi
 
-- 
cgit v0.10.2


From b821c5fe84829996e179bba5da30ee33fb8e9f9f Mon Sep 17 00:00:00 2001
From: LEROY Christophe <christophe.leroy@c-s.fr>
Date: Mon, 20 Apr 2015 07:54:44 +0200
Subject: powerpc/8xx: Use SPRG2 instead of DAR for saving r3

We now have SPRG2 available as in it not used anymore for saving CR, so we don't
need to crash DAR anymore for saving r3 for CPU6 ERRATA handling.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 150d03f..ba2dc53 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -309,7 +309,7 @@ SystemCall:
 
 InstructionTLBMiss:
 #ifdef CONFIG_8xx_CPU6
-	mtspr	SPRN_DAR, r3
+	mtspr	SPRN_SPRG_SCRATCH2, r3
 #endif
 	EXCEPTION_PROLOG_0
 
@@ -362,8 +362,7 @@ InstructionTLBMiss:
 
 	/* Restore registers */
 #ifdef CONFIG_8xx_CPU6
-	mfspr	r3, SPRN_DAR
-	mtspr	SPRN_DAR, r11	/* Tag DAR */
+	mfspr	r3, SPRN_SPRG_SCRATCH2
 #endif
 	EXCEPTION_EPILOG_0
 	rfi
@@ -371,7 +370,7 @@ InstructionTLBMiss:
 	. = 0x1200
 DataStoreTLBMiss:
 #ifdef CONFIG_8xx_CPU6
-	mtspr	SPRN_DAR, r3
+	mtspr	SPRN_SPRG_SCRATCH2, r3
 #endif
 	EXCEPTION_PROLOG_0
 	mfcr	r10
@@ -438,7 +437,7 @@ DataStoreTLBMiss:
 
 	/* Restore registers */
 #ifdef CONFIG_8xx_CPU6
-	mfspr	r3, SPRN_DAR
+	mfspr	r3, SPRN_SPRG_SCRATCH2
 #endif
 	mtspr	SPRN_DAR, r11	/* Tag DAR */
 	EXCEPTION_EPILOG_0
-- 
cgit v0.10.2


From eeba1f7c38425cfc0b70ba133ab7e523d5b6d90e Mon Sep 17 00:00:00 2001
From: LEROY Christophe <christophe.leroy@c-s.fr>
Date: Mon, 20 Apr 2015 07:54:46 +0200
Subject: powerpc/8xx: Add support for TASK_SIZE greater than 0x80000000

By default, TASK_SIZE is set to 0x80000000 for PPC_8xx, which is most
likely sufficient for most cases. However, kernel configuration allows
to set TASK_SIZE to another value, so the 8xx shall handle it.

This patch also takes into account the case of PAGE_OFFSET lower than
0x80000000, allthought most of the time it is equal to 0xC0000000

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index ba2dc53..c640bbb 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -48,6 +48,19 @@
 	mtspr	spr, reg
 #endif
 
+/* Macro to test if an address is a kernel address */
+#if CONFIG_TASK_SIZE <= 0x80000000 && CONFIG_PAGE_OFFSET >= 0x80000000
+#define IS_KERNEL(tmp, addr)		\
+	andis.	tmp, addr, 0x8000	/* Address >= 0x80000000 */
+#define BRANCH_UNLESS_KERNEL(label)	beq	label
+#else
+#define IS_KERNEL(tmp, addr)		\
+	rlwinm	tmp, addr, 16, 16, 31;	\
+	cmpli	cr0, tmp, PAGE_OFFSET >> 16
+#define BRANCH_UNLESS_KERNEL(label)	blt	label
+#endif
+
+
 /*
  * Value for the bits that have fixed value in RPN entries.
  * Also used for tagging DAR for DTLBerror.
@@ -322,9 +335,9 @@ InstructionTLBMiss:
 	mfspr	r11, SPRN_SRR0	/* Get effective address of fault */
 	INVALIDATE_ADJACENT_PAGES_CPU15(r10, r11)
 	mfcr	r10
-	andis.	r11, r11, 0x8000	/* Address >= 0x80000000 */
+	IS_KERNEL(r11, r11)
 	mfspr	r11, SPRN_M_TW	/* Get level 1 table */
-	beq	3f
+	BRANCH_UNLESS_KERNEL(3f)
 	lis	r11, (swapper_pg_dir-PAGE_OFFSET)@ha
 3:
 	mtcr	r10
@@ -379,9 +392,9 @@ DataStoreTLBMiss:
 	 * kernel page tables.
 	 */
 	mfspr	r11, SPRN_MD_EPN
-	andis.	r11, r11, 0x8000
+	IS_KERNEL(r11, r11)
 	mfspr	r11, SPRN_M_TW	/* Get level 1 table */
-	beq	3f
+	BRANCH_UNLESS_KERNEL(3f)
 	lis	r11, (swapper_pg_dir-PAGE_OFFSET)@ha
 3:
 	mtcr	r10
@@ -513,9 +526,9 @@ FixupDAR:/* Entry point for dcbx workaround. */
 	mtspr	SPRN_SPRG_SCRATCH2, r10
 	/* fetch instruction from memory. */
 	mfspr	r10, SPRN_SRR0
-	andis.	r11, r10, 0x8000	/* Address >= 0x80000000 */
+	IS_KERNEL(r11, r10)
 	mfspr	r11, SPRN_M_TW	/* Get level 1 table */
-	beq	3f
+	BRANCH_UNLESS_KERNEL(3f)
 	lis	r11, (swapper_pg_dir-PAGE_OFFSET)@ha
 	/* Insert level 1 index */
 3:	rlwimi	r11, r10, 32 - ((PAGE_SHIFT - 2) << 1), (PAGE_SHIFT - 2) << 1, 29
-- 
cgit v0.10.2


From 83b086c5697169f1e34d2430dad062cc714c5c57 Mon Sep 17 00:00:00 2001
From: LEROY Christophe <christophe.leroy@c-s.fr>
Date: Wed, 22 Apr 2015 12:06:41 +0200
Subject: powerpc/8xx: mark _PAGE_SHARED all types of kernel pages

All kernel pages have to be marked as shared in order to not perform
CASID verification.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/include/asm/pte-8xx.h b/arch/powerpc/include/asm/pte-8xx.h
index 97bae64..8ea537e 100644
--- a/arch/powerpc/include/asm/pte-8xx.h
+++ b/arch/powerpc/include/asm/pte-8xx.h
@@ -62,7 +62,12 @@
 
 /* We need to add _PAGE_SHARED to kernel pages */
 #define _PAGE_KERNEL_RO	(_PAGE_SHARED | _PAGE_RO | _PAGE_KNLRO)
-#define _PAGE_KERNEL_ROX	(_PAGE_EXEC | _PAGE_RO | _PAGE_KNLRO)
+#define _PAGE_KERNEL_ROX	(_PAGE_SHARED | _PAGE_RO | _PAGE_KNLRO | \
+				 _PAGE_EXEC)
+#define _PAGE_KERNEL_RW		(_PAGE_SHARED | _PAGE_DIRTY | _PAGE_RW | \
+				 _PAGE_HWWRITE)
+#define _PAGE_KERNEL_RWX	(_PAGE_SHARED | _PAGE_DIRTY | _PAGE_RW | \
+				 _PAGE_HWWRITE | _PAGE_EXEC)
 
 #endif /* __KERNEL__ */
 #endif /*  _ASM_POWERPC_PTE_8xx_H */
-- 
cgit v0.10.2


From e0a8e0d90a9f0be66ba49f4f2380a63c22d4aaae Mon Sep 17 00:00:00 2001
From: LEROY Christophe <christophe.leroy@c-s.fr>
Date: Wed, 22 Apr 2015 12:06:43 +0200
Subject: powerpc/8xx: Handle PAGE_USER via APG bits

Use of APG for handling PAGE_USER.

All pages PP exec bits are set to either 000 or 011, which means
respectively RW for Supervisor and no access for User, or RO for
Supervisor and no access for user.

Then we use the APG to say whether accesses are according to
Page rules or "all Supervisor" rules (Access to all)

Therefore, we define 2 APG groups corresponding to _PAGE_USER.
Mx_AP are initialised as follows:
GP0 => No user => 01 (all accesses performed according
				to page definition)
GP1 => User => 00 (all accesses performed as supervisor
                                according to page definition)

This removes the special 8xx handling in pte_update()

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h
index 64b52b1..9c32656 100644
--- a/arch/powerpc/include/asm/pgtable-ppc32.h
+++ b/arch/powerpc/include/asm/pgtable-ppc32.h
@@ -170,24 +170,6 @@ static inline unsigned long pte_update(pte_t *p,
 #ifdef PTE_ATOMIC_UPDATES
 	unsigned long old, tmp;
 
-#ifdef CONFIG_PPC_8xx
-	unsigned long tmp2;
-
-	__asm__ __volatile__("\
-1:	lwarx	%0,0,%4\n\
-	andc	%1,%0,%5\n\
-	or	%1,%1,%6\n\
-	/* 0x200 == Extended encoding, bit 22 */ \
-	/* Bit 22 has to be 1 when _PAGE_USER is unset and _PAGE_RO is set */ \
-	rlwimi	%1,%1,32-1,0x200\n /* get _PAGE_RO */ \
-	rlwinm	%3,%1,32-2,0x200\n /* get _PAGE_USER */ \
-	andc	%1,%1,%3\n\
-	stwcx.	%1,0,%4\n\
-	bne-	1b"
-	: "=&r" (old), "=&r" (tmp), "=m" (*p), "=&r" (tmp2)
-	: "r" (p), "r" (clr), "r" (set), "m" (*p)
-	: "cc" );
-#else /* CONFIG_PPC_8xx */
 	__asm__ __volatile__("\
 1:	lwarx	%0,0,%3\n\
 	andc	%1,%0,%4\n\
@@ -198,7 +180,6 @@ static inline unsigned long pte_update(pte_t *p,
 	: "=&r" (old), "=&r" (tmp), "=m" (*p)
 	: "r" (p), "r" (clr), "r" (set), "m" (*p)
 	: "cc" );
-#endif /* CONFIG_PPC_8xx */
 #else /* PTE_ATOMIC_UPDATES */
 	unsigned long old = pte_val(*p);
 	*p = __pte((old & ~clr) | set);
diff --git a/arch/powerpc/include/asm/pte-8xx.h b/arch/powerpc/include/asm/pte-8xx.h
index 8ea537e..b82094e 100644
--- a/arch/powerpc/include/asm/pte-8xx.h
+++ b/arch/powerpc/include/asm/pte-8xx.h
@@ -34,36 +34,27 @@
 #define _PAGE_SPECIAL	0x0008	/* SW entry, forced to 0 by the TLB miss */
 #define _PAGE_DIRTY	0x0100	/* C: page changed */
 
-/* These 4 software bits must be masked out when the entry is loaded
- * into the TLB, 1 SW bit left(0x0080).
+/* These 4 software bits must be masked out when the L2 entry is loaded
+ * into the TLB.
  */
-#define _PAGE_GUARDED	0x0010	/* software: guarded access */
-#define _PAGE_ACCESSED	0x0020	/* software: page referenced */
-#define _PAGE_WRITETHRU	0x0040	/* software: caching is write through */
+#define _PAGE_GUARDED	0x0010	/* Copied to L1 G entry in DTLB */
+#define _PAGE_USER	0x0020	/* Copied to L1 APG lsb */
+#define _PAGE_ACCESSED	0x0040	/* software: page referenced */
+#define _PAGE_WRITETHRU	0x0080	/* software: caching is write through */
 
-/* Setting any bits in the nibble with the follow two controls will
- * require a TLB exception handler change.  It is assumed unused bits
- * are always zero.
- */
-#define _PAGE_RO	0x0400	/* lsb PP bits */
-#define _PAGE_USER	0x0800	/* msb PP bits */
-/* set when _PAGE_USER is unset and _PAGE_RO is set */
-#define _PAGE_KNLRO	0x0200
+#define _PAGE_RO	0x0600	/* Supervisor RO, User no access */
 
 #define _PMD_PRESENT	0x0001
 #define _PMD_BAD	0x0ff0
 #define _PMD_PAGE_MASK	0x000c
 #define _PMD_PAGE_8M	0x000c
 
-#define _PTE_NONE_MASK _PAGE_KNLRO
-
 /* Until my rework is finished, 8xx still needs atomic PTE updates */
 #define PTE_ATOMIC_UPDATES	1
 
 /* We need to add _PAGE_SHARED to kernel pages */
-#define _PAGE_KERNEL_RO	(_PAGE_SHARED | _PAGE_RO | _PAGE_KNLRO)
-#define _PAGE_KERNEL_ROX	(_PAGE_SHARED | _PAGE_RO | _PAGE_KNLRO | \
-				 _PAGE_EXEC)
+#define _PAGE_KERNEL_RO		(_PAGE_SHARED | _PAGE_RO)
+#define _PAGE_KERNEL_ROX	(_PAGE_SHARED | _PAGE_RO | _PAGE_EXEC)
 #define _PAGE_KERNEL_RW		(_PAGE_SHARED | _PAGE_DIRTY | _PAGE_RW | \
 				 _PAGE_HWWRITE)
 #define _PAGE_KERNEL_RWX	(_PAGE_SHARED | _PAGE_DIRTY | _PAGE_RW | \
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index c640bbb..c79184d 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -351,12 +351,15 @@ InstructionTLBMiss:
 	rlwimi	r11, r10, 32 - ((PAGE_SHIFT - 2) << 1), (PAGE_SHIFT - 2) << 1, 29
 	lwz	r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11)	/* Get the level 1 entry */
 
-	/* Load the MI_TWC with the attributes for this "segment." */
-	MTSPR_CPU6(SPRN_MI_TWC, r11, r3)	/* Set segment attributes */
-	rlwinm	r11, r11,0,0,19	/* Extract page descriptor page address */
 	/* Extract level 2 index */
 	rlwinm	r10, r10, 32 - (PAGE_SHIFT - 2), 32 - PAGE_SHIFT, 29
-	lwzx	r10, r10, r11	/* Get the pte */
+	rlwimi	r10, r11, 0, 0, 32 - PAGE_SHIFT - 1	/* Add level 2 base */
+	lwz	r10, 0(r10)	/* Get the pte */
+
+	/* Insert the APG into the TWC from the Linux PTE. */
+	rlwimi	r11, r10, 0, 26, 26
+	/* Load the MI_TWC with the attributes for this "segment." */
+	MTSPR_CPU6(SPRN_MI_TWC, r11, r3)	/* Set segment attributes */
 
 #ifdef CONFIG_SWAP
 	rlwinm	r11, r10, 32-5, _PAGE_PRESENT
@@ -365,12 +368,12 @@ InstructionTLBMiss:
 #endif
 	li	r11, RPN_PATTERN
 	/* The Linux PTE won't go exactly into the MMU TLB.
-	 * Software indicator bits 21 and 28 must be clear.
+	 * Software indicator bits 20-23 and 28 must be clear.
 	 * Software indicator bits 24, 25, 26, and 27 must be
 	 * set.  All other Linux PTE bits control the behavior
 	 * of the MMU.
 	 */
-	rlwimi	r10, r11, 0, 0x07f8	/* Set 24-27, clear 21-23,28 */
+	rlwimi	r10, r11, 0, 0x0ff8	/* Set 24-27, clear 20-23,28 */
 	MTSPR_CPU6(SPRN_MI_RPN, r10, r3)	/* Update TLB entry */
 
 	/* Restore registers */
@@ -411,13 +414,13 @@ DataStoreTLBMiss:
 	rlwimi	r10, r11, 0, 0, 32 - PAGE_SHIFT - 1	/* Add level 2 base */
 	lwz	r10, 0(r10)	/* Get the pte */
 
-	/* Insert the Guarded flag into the TWC from the Linux PTE.
-	 * It is bit 27 of both the Linux PTE and the TWC (at least
+	/* Insert the Guarded flag and APG into the TWC from the Linux PTE.
+	 * It is bit 26-27 of both the Linux PTE and the TWC (at least
 	 * I got that right :-).  It will be better when we can put
 	 * this into the Linux pgd/pmd and load it in the operation
 	 * above.
 	 */
-	rlwimi	r11, r10, 0, 27, 27
+	rlwimi	r11, r10, 0, 26, 27
 	/* Insert the WriteThru flag into the TWC from the Linux PTE.
 	 * It is bit 25 in the Linux PTE and bit 30 in the TWC
 	 */
-- 
cgit v0.10.2


From 5b2753fc3e8a72253310d01a8b0a5bb05d917ef8 Mon Sep 17 00:00:00 2001
From: LEROY Christophe <christophe.leroy@c-s.fr>
Date: Wed, 22 Apr 2015 12:06:45 +0200
Subject: powerpc/8xx: Implementation of PAGE_EXEC

This patch implements PAGE_EXEC capability on the 8xx.

All pages PP exec bits are set to 000, which means Execute for
Supervisor and no Execute for User.
Then we use the APG to say whether accesses are according to Page
rules, "all Supervisor" rules (Exec for all) and
"all User" rules (Exec for noone)

Therefore, we define 4 APG groups. msb is _PAGE_EXEC,
lsb is _PAGE_USER. MI_AP is initialised as follows:
GP0 (00) => Not User, no exec => 11 (all accesses performed as user)
GP1 (01) => User but no exec => 11 (all accesses performed as user)
GP2 (10) => Not User, exec => 01 (rights according to page definition)
GP3 (11) => User, exec => 00 (all accesses performed as supervisor)

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
[scottwood: comments: s/exec/data/ on data side, and s/pages/pages'/]
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 6367b83..ae1fa65 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -366,7 +366,7 @@ enum {
 	    CPU_FTR_USE_TB | CPU_FTR_MAYBE_CAN_NAP | \
 	    CPU_FTR_COMMON | CPU_FTR_FPU_UNAVAILABLE)
 #define CPU_FTRS_CLASSIC32	(CPU_FTR_COMMON | CPU_FTR_USE_TB)
-#define CPU_FTRS_8XX	(CPU_FTR_USE_TB)
+#define CPU_FTRS_8XX	(CPU_FTR_USE_TB | CPU_FTR_NOEXECUTE)
 #define CPU_FTRS_40X	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE)
 #define CPU_FTRS_44X	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE)
 #define CPU_FTRS_440x6	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE | \
diff --git a/arch/powerpc/include/asm/mmu-8xx.h b/arch/powerpc/include/asm/mmu-8xx.h
index d41200c..f05500a 100644
--- a/arch/powerpc/include/asm/mmu-8xx.h
+++ b/arch/powerpc/include/asm/mmu-8xx.h
@@ -27,6 +27,19 @@
 #define MI_Ks		0x80000000	/* Should not be set */
 #define MI_Kp		0x40000000	/* Should always be set */
 
+/*
+ * All pages' PP exec bits are set to 000, which means Execute for Supervisor
+ * and no Execute for User.
+ * Then we use the APG to say whether accesses are according to Page rules,
+ * "all Supervisor" rules (Exec for all) and "all User" rules (Exec for noone)
+ * Therefore, we define 4 APG groups. msb is _PAGE_EXEC, lsb is _PAGE_USER
+ * 0 (00) => Not User, no exec => 11 (all accesses performed as user)
+ * 1 (01) => User but no exec => 11 (all accesses performed as user)
+ * 2 (10) => Not User, exec => 01 (rights according to page definition)
+ * 3 (11) => User, exec => 00 (all accesses performed as supervisor)
+ */
+#define MI_APG_INIT	0xf4ffffff
+
 /* The effective page number register.  When read, contains the information
  * about the last instruction TLB miss.  When MI_RPN is written, bits in
  * this register are used to create the TLB entry.
@@ -87,6 +100,19 @@
 #define MD_Ks		0x80000000	/* Should not be set */
 #define MD_Kp		0x40000000	/* Should always be set */
 
+/*
+ * All pages' PP data bits are set to either 000 or 011, which means
+ * respectively RW for Supervisor and no access for User, or RO for
+ * Supervisor and no access for user.
+ * Then we use the APG to say whether accesses are according to Page rules or
+ * "all Supervisor" rules (Access to all)
+ * Therefore, we define 2 APG groups. lsb is _PAGE_USER
+ * 0 => No user => 01 (all accesses performed according to page definition)
+ * 1 => User => 00 (all accesses performed as supervisor
+ *                                 according to page definition)
+ */
+#define MD_APG_INIT	0x4fffffff
+
 /* The effective page number register.  When read, contains the information
  * about the last instruction TLB miss.  When MD_RPN is written, bits in
  * this register are used to create the TLB entry.
diff --git a/arch/powerpc/include/asm/pte-8xx.h b/arch/powerpc/include/asm/pte-8xx.h
index b82094e..a0e2ba9 100644
--- a/arch/powerpc/include/asm/pte-8xx.h
+++ b/arch/powerpc/include/asm/pte-8xx.h
@@ -39,8 +39,9 @@
  */
 #define _PAGE_GUARDED	0x0010	/* Copied to L1 G entry in DTLB */
 #define _PAGE_USER	0x0020	/* Copied to L1 APG lsb */
-#define _PAGE_ACCESSED	0x0040	/* software: page referenced */
+#define _PAGE_EXEC	0x0040	/* Copied to L1 APG */
 #define _PAGE_WRITETHRU	0x0080	/* software: caching is write through */
+#define _PAGE_ACCESSED	0x0800	/* software: page referenced */
 
 #define _PAGE_RO	0x0600	/* Supervisor RO, User no access */
 
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index c79184d..78c1eba 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -357,7 +357,7 @@ InstructionTLBMiss:
 	lwz	r10, 0(r10)	/* Get the pte */
 
 	/* Insert the APG into the TWC from the Linux PTE. */
-	rlwimi	r11, r10, 0, 26, 26
+	rlwimi	r11, r10, 0, 25, 26
 	/* Load the MI_TWC with the attributes for this "segment." */
 	MTSPR_CPU6(SPRN_MI_TWC, r11, r3)	/* Set segment attributes */
 
@@ -449,6 +449,7 @@ DataStoreTLBMiss:
 	 */
 	li	r11, RPN_PATTERN
 	rlwimi	r10, r11, 0, 24, 28	/* Set 24-27, clear 28 */
+	rlwimi	r10, r11, 0, 20, 20	/* clear 20 */
 	MTSPR_CPU6(SPRN_MD_RPN, r10, r3)	/* Update TLB entry */
 
 	/* Restore registers */
@@ -769,15 +770,20 @@ initial_mmu:
 	ori	r8, r8, MI_EVALID	/* Mark it valid */
 	mtspr	SPRN_MI_EPN, r8
 	mtspr	SPRN_MD_EPN, r8
-	li	r8, MI_PS8MEG		/* Set 8M byte page */
+	li	r8, MI_PS8MEG | (2 << 5)	/* Set 8M byte page, APG 2 */
 	ori	r8, r8, MI_SVALID	/* Make it valid */
 	mtspr	SPRN_MI_TWC, r8
+	li	r8, MI_PS8MEG		/* Set 8M byte page, APG 0 */
+	ori	r8, r8, MI_SVALID	/* Make it valid */
 	mtspr	SPRN_MD_TWC, r8
 	li	r8, MI_BOOTINIT		/* Create RPN for address 0 */
 	mtspr	SPRN_MI_RPN, r8		/* Store TLB entry */
 	mtspr	SPRN_MD_RPN, r8
-	lis	r8, MI_Kp@h		/* Set the protection mode */
+	lis	r8, MI_APG_INIT@h	/* Set protection modes */
+	ori	r8, r8, MI_APG_INIT@l
 	mtspr	SPRN_MI_AP, r8
+	lis	r8, MD_APG_INIT@h
+	ori	r8, r8, MD_APG_INIT@l
 	mtspr	SPRN_MD_AP, r8
 
 	/* Map another 8 MByte at the IMMR to get the processor
-- 
cgit v0.10.2


From 379caf606396a13c8da2742e9a6e0bcfaaffa726 Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Sun, 17 May 2015 16:12:37 +0800
Subject: powerpc: mpc85xx: flush the l1 cache before cpu down in kexec

We observe a "Zero PT_NOTE entries found" warning when vmcore_init()
is running on the dump-capture kernel. Actually the PT_NOTE segments
is not empty, but the entries generated by crash_save_cpu() are not
flushed to the memory before we reset these cores. So we should flush
the l1 cache as what we do in cpu hotplug. With this change, we can
also kill the mpc85xx_smp_flush_dcache_kexec() since that becomes
unnecessary.

Please note: this only fix the issue on e500 core, we still need to
implement the function to flush the l2 cache for the e500mc core.
Fortunately we already had proposing patch for this support [1].
Hope we can fix this issue for e500mc after that merged.

[1] https://lists.ozlabs.org/pipermail/linuxppc-dev/2014-March/115830.html

Signed-off-by: Kevin Hao <haokexin@gmail.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c
index 8631ac5..b8b8216 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -345,6 +345,7 @@ void mpc85xx_smp_kexec_cpu_down(int crash_shutdown, int secondary)
 	local_irq_disable();
 
 	if (secondary) {
+		__flush_disable_L1();
 		atomic_inc(&kexec_down_cpus);
 		/* loop forever */
 		while (1);
@@ -357,61 +358,11 @@ static void mpc85xx_smp_kexec_down(void *arg)
 		ppc_md.kexec_cpu_down(0,1);
 }
 
-static void map_and_flush(unsigned long paddr)
-{
-	struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
-	unsigned long kaddr  = (unsigned long)kmap_atomic(page);
-
-	flush_dcache_range(kaddr, kaddr + PAGE_SIZE);
-	kunmap_atomic((void *)kaddr);
-}
-
-/**
- * Before we reset the other cores, we need to flush relevant cache
- * out to memory so we don't get anything corrupted, some of these flushes
- * are performed out of an overabundance of caution as interrupts are not
- * disabled yet and we can switch cores
- */
-static void mpc85xx_smp_flush_dcache_kexec(struct kimage *image)
-{
-	kimage_entry_t *ptr, entry;
-	unsigned long paddr;
-	int i;
-
-	if (image->type == KEXEC_TYPE_DEFAULT) {
-		/* normal kexec images are stored in temporary pages */
-		for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE);
-		     ptr = (entry & IND_INDIRECTION) ?
-				phys_to_virt(entry & PAGE_MASK) : ptr + 1) {
-			if (!(entry & IND_DESTINATION)) {
-				map_and_flush(entry);
-			}
-		}
-		/* flush out last IND_DONE page */
-		map_and_flush(entry);
-	} else {
-		/* crash type kexec images are copied to the crash region */
-		for (i = 0; i < image->nr_segments; i++) {
-			struct kexec_segment *seg = &image->segment[i];
-			for (paddr = seg->mem; paddr < seg->mem + seg->memsz;
-			     paddr += PAGE_SIZE) {
-				map_and_flush(paddr);
-			}
-		}
-	}
-
-	/* also flush the kimage struct to be passed in as well */
-	flush_dcache_range((unsigned long)image,
-			   (unsigned long)image + sizeof(*image));
-}
-
 static void mpc85xx_smp_machine_kexec(struct kimage *image)
 {
 	int timeout = INT_MAX;
 	int i, num_cpus = num_present_cpus();
 
-	mpc85xx_smp_flush_dcache_kexec(image);
-
 	if (image->type == KEXEC_TYPE_DEFAULT)
 		smp_call_function(mpc85xx_smp_kexec_down, NULL, 0);
 
-- 
cgit v0.10.2


From 31ea9d5dfd27b9aa6f92c207041eba8e81f4c497 Mon Sep 17 00:00:00 2001
From: Xie Xiaobo <X.Xie@freescale.com>
Date: Fri, 22 May 2015 13:07:19 +0800
Subject: powerpc/85xx: p1025twr: add module conditional to fix QE-uart issue

A ioport setting was needed when used the QE uart function on TWR-P1025.
Added a conditional definition to avoid missing this setting when the
QE-uart driver was bulit to a module.

Signed-off-by: Xie Xiaobo <X.Xie@freescale.com>
Signed-off-by: Li Pengbo <Pengbo.Li@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/platforms/85xx/twr_p102x.c b/arch/powerpc/platforms/85xx/twr_p102x.c
index 1eadb6d..30e002f 100644
--- a/arch/powerpc/platforms/85xx/twr_p102x.c
+++ b/arch/powerpc/platforms/85xx/twr_p102x.c
@@ -79,7 +79,7 @@ static void __init twr_p1025_setup_arch(void)
 	mpc85xx_qe_init();
 	mpc85xx_qe_par_io_init();
 
-#if defined(CONFIG_UCC_GETH) || defined(CONFIG_SERIAL_QE)
+#if IS_ENABLED(CONFIG_UCC_GETH) || IS_ENABLED(CONFIG_SERIAL_QE)
 	if (machine_is(twr_p1025)) {
 		struct ccsr_guts __iomem *guts;
 
@@ -101,7 +101,7 @@ static void __init twr_p1025_setup_arch(void)
 					MPC85xx_PMUXCR_QE(12));
 			iounmap(guts);
 
-#if defined(CONFIG_SERIAL_QE)
+#if IS_ENABLED(CONFIG_SERIAL_QE)
 			/* On P1025TWR board, the UCC7 acted as UART port.
 			 * However, The UCC7's CTS pin is low level in default,
 			 * it will impact the transmission in full duplex
-- 
cgit v0.10.2


From 27d4dc7116eed98775902627ba61b70e9045e321 Mon Sep 17 00:00:00 2001
From: Vaibhav Jain <vaibhav@linux.vnet.ibm.com>
Date: Wed, 29 Apr 2015 15:47:23 +0530
Subject: cxl: Implement an ioctl to fetch afu card-id, offset-id and mode

Given a file descriptor on an afu device, libcxl currently uses the
major/minor number obtained from fstat on the fd to construct path to
the afu's sysfs directory. However it is possible that rather than using
one of the device in /dev/cxl, a kernel driver creates its own device
which export generic cxl interface to the userspace. This causes
problems with libcxl as it tries to use a wrong major/minor number to
construct the sysfs path and fail.

So this patch introduces a new ioctl called CXL_IOCTL_GET_AFU_ID on the
afu file descriptor to fetch the cxl_afu_id struct that holds the
card/offset-id and mode information. These info is then used by libcxl to
construct the correct path to the afu sysfs directory.

Testing:
	- Build against pseries be/le configs
	- Testing with corresponding libcxl changes to verify that it constructs
	  right sysfs path to the afu.

Signed-off-by: Vaibhav Jain <vaibhav@linux.vnet.ibm.com>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index a1cee47..9f280e3 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -315,8 +315,6 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An     = {0x0A0};
 #define CXL_MAX_SLICES 4
 #define MAX_AFU_MMIO_REGS 3
 
-#define CXL_MODE_DEDICATED   0x1
-#define CXL_MODE_DIRECTED    0x2
 #define CXL_MODE_TIME_SLICED 0x4
 #define CXL_SUPPORTED_MODES (CXL_MODE_DEDICATED | CXL_MODE_DIRECTED)
 
diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c
index 2364bca..9855234 100644
--- a/drivers/misc/cxl/file.c
+++ b/drivers/misc/cxl/file.c
@@ -212,6 +212,25 @@ static long afu_ioctl_process_element(struct cxl_context *ctx,
 	return 0;
 }
 
+static long afu_ioctl_get_afu_id(struct cxl_context *ctx,
+				 struct cxl_afu_id __user *upafuid)
+{
+	struct cxl_afu_id afuid = { 0 };
+
+	afuid.card_id = ctx->afu->adapter->adapter_num;
+	afuid.afu_offset = ctx->afu->slice;
+	afuid.afu_mode = ctx->afu->current_mode;
+
+	/* set the flag bit in case the afu is a slave */
+	if (ctx->afu->current_mode == CXL_MODE_DIRECTED && !ctx->master)
+		afuid.flags |= CXL_AFUID_FLAG_SLAVE;
+
+	if (copy_to_user(upafuid, &afuid, sizeof(afuid)))
+		return -EFAULT;
+
+	return 0;
+}
+
 static long afu_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct cxl_context *ctx = file->private_data;
@@ -225,6 +244,9 @@ static long afu_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		return afu_ioctl_start_work(ctx, (struct cxl_ioctl_start_work __user *)arg);
 	case CXL_IOCTL_GET_PROCESS_ELEMENT:
 		return afu_ioctl_process_element(ctx, (__u32 __user *)arg);
+	case CXL_IOCTL_GET_AFU_ID:
+		return afu_ioctl_get_afu_id(ctx, (struct cxl_afu_id __user *)
+					    arg);
 	}
 	return -EINVAL;
 }
diff --git a/include/uapi/misc/cxl.h b/include/uapi/misc/cxl.h
index cd6d789..99a8ca1 100644
--- a/include/uapi/misc/cxl.h
+++ b/include/uapi/misc/cxl.h
@@ -32,10 +32,32 @@ struct cxl_ioctl_start_work {
 #define CXL_START_WORK_ALL		(CXL_START_WORK_AMR |\
 					 CXL_START_WORK_NUM_IRQS)
 
+
+/* Possible modes that an afu can be in */
+#define CXL_MODE_DEDICATED   0x1
+#define CXL_MODE_DIRECTED    0x2
+
+/* possible flags for the cxl_afu_id flags field */
+#define CXL_AFUID_FLAG_SLAVE    0x1  /* In directed-mode afu is in slave mode */
+
+struct cxl_afu_id {
+	__u64 flags;     /* One of CXL_AFUID_FLAG_X */
+	__u32 card_id;
+	__u32 afu_offset;
+	__u32 afu_mode;  /* one of the CXL_MODE_X */
+	__u32 reserved1;
+	__u64 reserved2;
+	__u64 reserved3;
+	__u64 reserved4;
+	__u64 reserved5;
+	__u64 reserved6;
+};
+
 /* ioctl numbers */
 #define CXL_MAGIC 0xCA
 #define CXL_IOCTL_START_WORK		_IOW(CXL_MAGIC, 0x00, struct cxl_ioctl_start_work)
 #define CXL_IOCTL_GET_PROCESS_ELEMENT	_IOR(CXL_MAGIC, 0x01, __u32)
+#define CXL_IOCTL_GET_AFU_ID            _IOR(CXL_MAGIC, 0x02, struct cxl_afu_id)
 
 #define CXL_READ_MIN_SIZE 0x1000 /* 4K */
 
-- 
cgit v0.10.2


From e36f6fe1f7aa4238478d4b253aac7d3fcfff6ee0 Mon Sep 17 00:00:00 2001
From: Vaibhav Jain <vaibhav@linux.vnet.ibm.com>
Date: Fri, 22 May 2015 10:56:05 +0530
Subject: cxl: Export AFU error buffer via sysfs

Export the "AFU Error Buffer" via sysfs attribute (afu_err_buf). AFU
error buffer is used by the AFU to report application specific
errors. The contents of this buffer are AFU specific and are intended to
be interpreted by the application interacting with the afu.

Suggested-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Vaibhav Jain <vaibhav@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/Documentation/ABI/testing/sysfs-class-cxl b/Documentation/ABI/testing/sysfs-class-cxl
index d46bba8..45e9ce3 100644
--- a/Documentation/ABI/testing/sysfs-class-cxl
+++ b/Documentation/ABI/testing/sysfs-class-cxl
@@ -6,6 +6,17 @@ Example: The real path of the attribute /sys/class/cxl/afu0.0s/irqs_max is
 
 Slave contexts (eg. /sys/class/cxl/afu0.0s):
 
+What:           /sys/class/cxl/<afu>/afu_err_buf
+Date:           September 2014
+Contact:        linuxppc-dev@lists.ozlabs.org
+Description:    read only
+                AFU Error Buffer contents. The contents of this file are
+		application specific and depends on the AFU being used.
+		Applications interacting with the AFU can use this attribute
+		to know about the current error condition and take appropriate
+		action like logging the event etc.
+
+
 What:           /sys/class/cxl/<afu>/irqs_max
 Date:           September 2014
 Contact:        linuxppc-dev@lists.ozlabs.org
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 9f280e3..cfee819 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -360,6 +360,10 @@ struct cxl_afu {
 	struct mutex spa_mutex;
 	spinlock_t afu_cntl_lock;
 
+	/* AFU error buffer fields and bin attribute for sysfs */
+	u64 eb_len, eb_offset;
+	struct bin_attribute attr_eb;
+
 	/*
 	 * Only the first part of the SPA is used for the process element
 	 * linked list. The only other part that software needs to worry about
@@ -561,6 +565,9 @@ static inline void __iomem *_cxl_p2n_addr(struct cxl_afu *afu, cxl_p2n_reg_t reg
 u16 cxl_afu_cr_read16(struct cxl_afu *afu, int cr, u64 off);
 u8 cxl_afu_cr_read8(struct cxl_afu *afu, int cr, u64 off);
 
+ssize_t cxl_afu_read_err_buffer(struct cxl_afu *afu, char *buf,
+				loff_t off, size_t count);
+
 
 struct cxl_calls {
 	void (*cxl_slbia)(struct mm_struct *mm);
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 1ef0164..c3671b3 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -593,6 +593,22 @@ static int cxl_read_afu_descriptor(struct cxl_afu *afu)
 	afu->crs_len = AFUD_CR_LEN(val) * 256;
 	afu->crs_offset = AFUD_READ_CR_OFF(afu);
 
+
+	/* eb_len is in multiple of 4K */
+	afu->eb_len = AFUD_EB_LEN(AFUD_READ_EB(afu)) * 4096;
+	afu->eb_offset = AFUD_READ_EB_OFF(afu);
+
+	/* eb_off is 4K aligned so lower 12 bits are always zero */
+	if (EXTRACT_PPC_BITS(afu->eb_offset, 0, 11) != 0) {
+		dev_warn(&afu->dev,
+			 "Invalid AFU error buffer offset %Lx\n",
+			 afu->eb_offset);
+		dev_info(&afu->dev,
+			 "Ignoring AFU error buffer in the descriptor\n");
+		/* indicate that no afu buffer exists */
+		afu->eb_len = 0;
+	}
+
 	return 0;
 }
 
@@ -672,6 +688,50 @@ static int sanitise_afu_regs(struct cxl_afu *afu)
 	return 0;
 }
 
+#define ERR_BUFF_MAX_COPY_SIZE PAGE_SIZE
+/*
+ * afu_eb_read:
+ * Called from sysfs and reads the afu error info buffer. The h/w only supports
+ * 4/8 bytes aligned access. So in case the requested offset/count arent 8 byte
+ * aligned the function uses a bounce buffer which can be max PAGE_SIZE.
+ */
+ssize_t cxl_afu_read_err_buffer(struct cxl_afu *afu, char *buf,
+				loff_t off, size_t count)
+{
+	loff_t aligned_start, aligned_end;
+	size_t aligned_length;
+	void *tbuf;
+	const void __iomem *ebuf = afu->afu_desc_mmio + afu->eb_offset;
+
+	if (count == 0 || off < 0 || (size_t)off >= afu->eb_len)
+		return 0;
+
+	/* calculate aligned read window */
+	count = min((size_t)(afu->eb_len - off), count);
+	aligned_start = round_down(off, 8);
+	aligned_end = round_up(off + count, 8);
+	aligned_length = aligned_end - aligned_start;
+
+	/* max we can copy in one read is PAGE_SIZE */
+	if (aligned_length > ERR_BUFF_MAX_COPY_SIZE) {
+		aligned_length = ERR_BUFF_MAX_COPY_SIZE;
+		count = ERR_BUFF_MAX_COPY_SIZE - (off & 0x7);
+	}
+
+	/* use bounce buffer for copy */
+	tbuf = (void *)__get_free_page(GFP_TEMPORARY);
+	if (!tbuf)
+		return -ENOMEM;
+
+	/* perform aligned read from the mmio region */
+	memcpy_fromio(tbuf, ebuf + aligned_start, aligned_length);
+	memcpy(buf, tbuf + (off & 0x7), count);
+
+	free_page((unsigned long)tbuf);
+
+	return count;
+}
+
 static int cxl_init_afu(struct cxl *adapter, int slice, struct pci_dev *dev)
 {
 	struct cxl_afu *afu;
diff --git a/drivers/misc/cxl/sysfs.c b/drivers/misc/cxl/sysfs.c
index d0c38c7..4942d55 100644
--- a/drivers/misc/cxl/sysfs.c
+++ b/drivers/misc/cxl/sysfs.c
@@ -356,6 +356,16 @@ static ssize_t api_version_compatible_show(struct device *device,
 	return scnprintf(buf, PAGE_SIZE, "%i\n", CXL_API_VERSION_COMPATIBLE);
 }
 
+static ssize_t afu_eb_read(struct file *filp, struct kobject *kobj,
+			       struct bin_attribute *bin_attr, char *buf,
+			       loff_t off, size_t count)
+{
+	struct cxl_afu *afu = to_cxl_afu(container_of(kobj,
+						      struct device, kobj));
+
+	return cxl_afu_read_err_buffer(afu, buf, off, count);
+}
+
 static struct device_attribute afu_attrs[] = {
 	__ATTR_RO(mmio_size),
 	__ATTR_RO(irqs_min),
@@ -534,6 +544,10 @@ void cxl_sysfs_afu_remove(struct cxl_afu *afu)
 	struct afu_config_record *cr, *tmp;
 	int i;
 
+	/* remove the err buffer bin attribute */
+	if (afu->eb_len)
+		device_remove_bin_file(&afu->dev, &afu->attr_eb);
+
 	for (i = 0; i < ARRAY_SIZE(afu_attrs); i++)
 		device_remove_file(&afu->dev, &afu_attrs[i]);
 
@@ -555,6 +569,22 @@ int cxl_sysfs_afu_add(struct cxl_afu *afu)
 			goto err;
 	}
 
+	/* conditionally create the add the binary file for error info buffer */
+	if (afu->eb_len) {
+		afu->attr_eb.attr.name = "afu_err_buff";
+		afu->attr_eb.attr.mode = S_IRUGO;
+		afu->attr_eb.size = afu->eb_len;
+		afu->attr_eb.read = afu_eb_read;
+
+		rc = device_create_bin_file(&afu->dev, &afu->attr_eb);
+		if (rc) {
+			dev_err(&afu->dev,
+				"Unable to create eb attr for the afu. Err(%d)\n",
+				rc);
+			goto err;
+		}
+	}
+
 	for (i = 0; i < afu->crs_num; i++) {
 		cr = cxl_sysfs_afu_new_cr(afu, i);
 		if (IS_ERR(cr)) {
@@ -570,6 +600,9 @@ err1:
 	cxl_sysfs_afu_remove(afu);
 	return rc;
 err:
+	/* reset the eb_len as we havent created the bin attr */
+	afu->eb_len = 0;
+
 	for (i--; i >= 0; i--)
 		device_remove_file(&afu->dev, &afu_attrs[i]);
 	return rc;
-- 
cgit v0.10.2


From 8ac75b96be71e20ec1785ca18170890c4dfffe87 Mon Sep 17 00:00:00 2001
From: Ian Munsie <imunsie@au1.ibm.com>
Date: Fri, 8 May 2015 22:55:18 +1000
Subject: cxl: Use call_rcu to reduce latency when releasing the afu fd

The afu fd release path was identified as a significant bottleneck in
the overall performance of cxl. While an optimal AFU design would
minimise the need to close & reopen the AFU fd, it is not always
practical to avoid.

The bottleneck seems to be down to the call to synchronize_rcu(), which
will block until every other thread is guaranteed to be out of an RCU
critical section. Replace it with call_rcu() to free the context
structures later so we can return to the application sooner.

This reduces the time spent in the fd release path from 13356 usec to
13.3 usec - about a 100x speed up.

Reported-by: Fei K Chen <uchen@cn.ibm.com>
Signed-off-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index d1b55fe..78ce990 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -232,12 +232,9 @@ void cxl_context_detach_all(struct cxl_afu *afu)
 	mutex_unlock(&afu->contexts_lock);
 }
 
-void cxl_context_free(struct cxl_context *ctx)
+static void reclaim_ctx(struct rcu_head *rcu)
 {
-	mutex_lock(&ctx->afu->contexts_lock);
-	idr_remove(&ctx->afu->contexts_idr, ctx->pe);
-	mutex_unlock(&ctx->afu->contexts_lock);
-	synchronize_rcu();
+	struct cxl_context *ctx = container_of(rcu, struct cxl_context, rcu);
 
 	free_page((u64)ctx->sstp);
 	ctx->sstp = NULL;
@@ -245,3 +242,11 @@ void cxl_context_free(struct cxl_context *ctx)
 	put_pid(ctx->pid);
 	kfree(ctx);
 }
+
+void cxl_context_free(struct cxl_context *ctx)
+{
+	mutex_lock(&ctx->afu->contexts_lock);
+	idr_remove(&ctx->afu->contexts_idr, ctx->pe);
+	mutex_unlock(&ctx->afu->contexts_lock);
+	call_rcu(&ctx->rcu, reclaim_ctx);
+}
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index cfee819..b361b48 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -457,6 +457,8 @@ struct cxl_context {
 	bool pending_irq;
 	bool pending_fault;
 	bool pending_afu_err;
+
+	struct rcu_head rcu;
 };
 
 struct cxl {
-- 
cgit v0.10.2


From 85a97da9584868b6cd0c54b93c5f65319f615b08 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:06:55 +1000
Subject: powerpc/copro: Fix faulting kernel segments

This fixes calculating the key bits (KP and KS) in the SLB VSID for kernel
mappings.

I'm not CCing this to stable as there are no uses of this currently.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index f031a47..7a71d7f 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -100,7 +100,7 @@ EXPORT_SYMBOL_GPL(copro_handle_mm_fault);
 
 int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
 {
-	u64 vsid;
+	u64 vsid, vsidkey;
 	int psize, ssize;
 
 	switch (REGION_ID(ea)) {
@@ -109,6 +109,7 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
 		psize = get_slice_psize(mm, ea);
 		ssize = user_segment_size(ea);
 		vsid = get_vsid(mm->context.id, ea, ssize);
+		vsidkey = SLB_VSID_USER;
 		break;
 	case VMALLOC_REGION_ID:
 		pr_devel("%s: 0x%llx -- VMALLOC_REGION_ID\n", __func__, ea);
@@ -118,19 +119,21 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
 			psize = mmu_io_psize;
 		ssize = mmu_kernel_ssize;
 		vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
+		vsidkey = SLB_VSID_KERNEL;
 		break;
 	case KERNEL_REGION_ID:
 		pr_devel("%s: 0x%llx -- KERNEL_REGION_ID\n", __func__, ea);
 		psize = mmu_linear_psize;
 		ssize = mmu_kernel_ssize;
 		vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
+		vsidkey = SLB_VSID_KERNEL;
 		break;
 	default:
 		pr_debug("%s: invalid region access at %016llx\n", __func__, ea);
 		return 1;
 	}
 
-	vsid = (vsid << slb_vsid_shift(ssize)) | SLB_VSID_USER;
+	vsid = (vsid << slb_vsid_shift(ssize)) | vsidkey;
 
 	vsid |= mmu_psize_defs[psize].sllp |
 		((ssize == MMU_SEGSIZE_1T) ? SLB_VSID_B_1T : 0);
-- 
cgit v0.10.2


From 5b64d2cc41216bef3e19913d1a39c324c2587d11 Mon Sep 17 00:00:00 2001
From: Daniel Axtens <dja@axtens.net>
Date: Wed, 27 May 2015 16:06:56 +1000
Subject: powerpc/pci: Export symbols for CXL

Export pcibios_claim_one_bus, pcibios_scan_phb and pcibios_alloc_controller.

These will be used by the CXL driver.

Signed-off-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 0d05406..2040cd2 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -89,6 +89,7 @@ struct pci_controller *pcibios_alloc_controller(struct device_node *dev)
 #endif
 	return phb;
 }
+EXPORT_SYMBOL_GPL(pcibios_alloc_controller);
 
 void pcibios_free_controller(struct pci_controller *phb)
 {
@@ -1447,6 +1448,7 @@ void pcibios_claim_one_bus(struct pci_bus *bus)
 	list_for_each_entry(child_bus, &bus->children, node)
 		pcibios_claim_one_bus(child_bus);
 }
+EXPORT_SYMBOL_GPL(pcibios_claim_one_bus);
 
 
 /* pcibios_finish_adding_to_bus
@@ -1680,6 +1682,7 @@ void pcibios_scan_phb(struct pci_controller *hose)
 			pcie_bus_configure_settings(child);
 	}
 }
+EXPORT_SYMBOL_GPL(pcibios_scan_phb);
 
 static void fixup_hide_host_resource_fsl(struct pci_dev *dev)
 {
-- 
cgit v0.10.2


From 10e796309a670bb2c13eae5aa6f60e10d2d6a6e1 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:06:57 +1000
Subject: powerpc/pci: Add release_device() hook to phb ops

Add release_device() hook to phb ops so we can clean up for specific phbs.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 6d17bb8..4cf0caa 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -27,6 +27,8 @@ struct pci_controller_ops {
 	 * allow assignment/enabling of the device. */
 	bool		(*enable_device_hook)(struct pci_dev *);
 
+	void		(*release_device)(struct pci_dev *);
+
 	/* Called during PCI resource reassignment */
 	resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type);
 	void		(*reset_secondary_bus)(struct pci_dev *dev);
diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
index 7ed85a6..7f9ed0c 100644
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -29,7 +29,12 @@
  */
 void pcibios_release_device(struct pci_dev *dev)
 {
+	struct pci_controller *phb = pci_bus_to_host(dev->bus);
+
 	eeh_remove_device(dev);
+
+	if (phb->controller_ops.release_device)
+		phb->controller_ops.release_device(dev);
 }
 
 /**
-- 
cgit v0.10.2


From f46580a5cf07b3d35e357bb2401ae3c49cff7e65 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:06:58 +1000
Subject: powerpc: Add cxl context to device archdata

Add cxl context pointer to archdata.  We'll want to create one of these for cxl
PCI devices.  Put them here until we can get a pci_dev specific private data.

This location was suggested by benh.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h
index 9f1371b..e9bdda8 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -46,6 +46,9 @@ struct dev_archdata {
 #ifdef CONFIG_FAIL_IOMMU
 	int fail_iommu;
 #endif
+#ifdef CONFIG_CXL_BASE
+	struct cxl_context	*cxl_ctx;
+#endif
 };
 
 struct pdev_archdata {
-- 
cgit v0.10.2


From 7a8e6bbf8593a9395dd6c61f7c5f421570600017 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:06:59 +1000
Subject: powerpc/pci: Add shutdown hook to pci_controller_ops

Currently pnv_pci_shutdown() calls the PHB shutdown code for all PHBs in the
system.  It dereferences the private_data assuming it's a powernv PHB, which
won't be the case when we have different PHB in the systems (like when we add
vPHBs for CXL).

This moves the shutdown hook to the pci_controller_ops and fixes the call site
to use that instead.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 4cf0caa..3947749 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -40,6 +40,8 @@ struct pci_controller_ops {
 #endif
 
 	int             (*dma_set_mask)(struct pci_dev *dev, u64 dma_mask);
+
+	void		(*shutdown)(struct pci_controller *);
 };
 
 /*
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 508f530..cec43c8 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2643,8 +2643,10 @@ static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus,
 	return phb->ioda.pe_rmap[(bus->number << 8) | devfn];
 }
 
-static void pnv_pci_ioda_shutdown(struct pnv_phb *phb)
+static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
 {
+	struct pnv_phb *phb = hose->private_data;
+
 	opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE,
 		       OPAL_ASSERT_RESET);
 }
@@ -2659,6 +2661,7 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
        .window_alignment = pnv_pci_window_alignment,
        .reset_secondary_bus = pnv_pci_reset_secondary_bus,
        .dma_set_mask = pnv_pci_ioda_dma_set_mask,
+       .shutdown = pnv_pci_ioda_shutdown,
 };
 
 static void __init pnv_pci_init_ioda_phb(struct device_node *np,
@@ -2806,9 +2809,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 	phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
 	phb->dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask;
 
-	/* Setup shutdown function for kexec */
-	phb->shutdown = pnv_pci_ioda_shutdown;
-
 	/* Setup MSI support */
 	pnv_pci_init_ioda_msis(phb);
 
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 8a557b5..3c35487 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -704,12 +704,9 @@ void pnv_pci_shutdown(void)
 {
 	struct pci_controller *hose;
 
-	list_for_each_entry(hose, &hose_list, list_node) {
-		struct pnv_phb *phb = hose->private_data;
-
-		if (phb && phb->shutdown)
-			phb->shutdown(phb);
-	}
+	list_for_each_entry(hose, &hose_list, list_node)
+		if (hose->controller_ops.shutdown)
+			hose->controller_ops.shutdown(hose);
 }
 
 /* Fixup wrong class code in p7ioc and p8 root complex */
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index ac8686c..714ee3c 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -110,7 +110,6 @@ struct pnv_phb {
 				     struct pci_dev *pdev);
 	void (*fixup_phb)(struct pci_controller *hose);
 	u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn);
-	void (*shutdown)(struct pnv_phb *phb);
 	int (*init_m64)(struct pnv_phb *phb);
 	void (*reserve_m64_pe)(struct pnv_phb *phb);
 	int (*pick_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus, int all);
-- 
cgit v0.10.2


From abeeed6d3d9948b3235a5bb77759d8c1f84de39d Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:07:00 +1000
Subject: powerpc/pci: Add pcibios_disable_device() hook

This adds a hook into the powerpc pci code for pci_disable_device() calls.  The
generic code already provides a weak pcibios_disable_device() symbol, so we
just need to provide our own in powerpc and it'll get picked up.

This is passed directly to the phb controller ops, provided one exists.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 3947749..b76cd56 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -27,6 +27,8 @@ struct pci_controller_ops {
 	 * allow assignment/enabling of the device. */
 	bool		(*enable_device_hook)(struct pci_dev *);
 
+	void		(*disable_device)(struct pci_dev *);
+
 	void		(*release_device)(struct pci_dev *);
 
 	/* Called during PCI resource reassignment */
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 2040cd2..b9de34d 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1490,6 +1490,14 @@ int pcibios_enable_device(struct pci_dev *dev, int mask)
 	return pci_enable_resources(dev, mask);
 }
 
+void pcibios_disable_device(struct pci_dev *dev)
+{
+	struct pci_controller *phb = pci_bus_to_host(dev->bus);
+
+	if (phb->controller_ops.disable_device)
+		phb->controller_ops.disable_device(dev);
+}
+
 resource_size_t pcibios_io_space_offset(struct pci_controller *hose)
 {
 	return (unsigned long) hose->io_base_virt - _IO_BASE;
-- 
cgit v0.10.2


From aee85fb6ba766dc813ec0211a7340334b27b8f47 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:07:01 +1000
Subject: cxl: Document external user of existing API

Now that libcxl is public, let's document it.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/Documentation/ABI/testing/sysfs-class-cxl b/Documentation/ABI/testing/sysfs-class-cxl
index 45e9ce3..acfe9df 100644
--- a/Documentation/ABI/testing/sysfs-class-cxl
+++ b/Documentation/ABI/testing/sysfs-class-cxl
@@ -26,6 +26,7 @@ Description:    read/write
                 that hardware can support (eg. 2037). Write values will limit
                 userspace applications to that many userspace interrupts. Must
                 be >= irqs_min.
+Users:		https://github.com/ibm-capi/libcxl
 
 What:           /sys/class/cxl/<afu>/irqs_min
 Date:           September 2014
@@ -35,6 +36,7 @@ Description:    read only
                 userspace must request on a CXL_START_WORK ioctl. Userspace may
                 omit the num_interrupts field in the START_WORK IOCTL to get
                 this minimum automatically.
+Users:		https://github.com/ibm-capi/libcxl
 
 What:           /sys/class/cxl/<afu>/mmio_size
 Date:           September 2014
@@ -42,6 +44,7 @@ Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    read only
                 Decimal value of the size of the MMIO space that may be mmaped
                 by userspace.
+Users:		https://github.com/ibm-capi/libcxl
 
 What:           /sys/class/cxl/<afu>/modes_supported
 Date:           September 2014
@@ -49,6 +52,7 @@ Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    read only
                 List of the modes this AFU supports. One per line.
                 Valid entries are: "dedicated_process" and "afu_directed"
+Users:		https://github.com/ibm-capi/libcxl
 
 What:           /sys/class/cxl/<afu>/mode
 Date:           September 2014
@@ -57,6 +61,7 @@ Description:    read/write
                 The current mode the AFU is using. Will be one of the modes
                 given in modes_supported. Writing will change the mode
                 provided that no user contexts are attached.
+Users:		https://github.com/ibm-capi/libcxl
 
 
 What:           /sys/class/cxl/<afu>/prefault_mode
@@ -70,6 +75,7 @@ Description:    read/write
                                  descriptor as an effective address and
                                  prefault what it points to.
                         all: all segments process calling START_WORK maps.
+Users:		https://github.com/ibm-capi/libcxl
 
 What:           /sys/class/cxl/<afu>/reset
 Date:           September 2014
@@ -77,12 +83,14 @@ Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    write only
                 Writing 1 here will reset the AFU provided there are not
                 contexts active on the AFU.
+Users:		https://github.com/ibm-capi/libcxl
 
 What:           /sys/class/cxl/<afu>/api_version
 Date:           September 2014
 Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    read only
                 Decimal value of the current version of the kernel/user API.
+Users:		https://github.com/ibm-capi/libcxl
 
 What:           /sys/class/cxl/<afu>/api_version_compatible
 Date:           September 2014
@@ -90,6 +98,7 @@ Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    read only
                 Decimal value of the the lowest version of the userspace API
                 this this kernel supports.
+Users:		https://github.com/ibm-capi/libcxl
 
 
 AFU configuration records (eg. /sys/class/cxl/afu0.0/cr0):
@@ -103,6 +112,7 @@ Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    read only
 		Hexadecimal value of the vendor ID found in this AFU
 		configuration record.
+Users:		https://github.com/ibm-capi/libcxl
 
 What:           /sys/class/cxl/<afu>/cr<config num>/device
 Date:           February 2015
@@ -110,6 +120,7 @@ Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    read only
 		Hexadecimal value of the device ID found in this AFU
 		configuration record.
+Users:		https://github.com/ibm-capi/libcxl
 
 What:           /sys/class/cxl/<afu>/cr<config num>/class
 Date:           February 2015
@@ -117,6 +128,7 @@ Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    read only
 		Hexadecimal value of the class code found in this AFU
 		configuration record.
+Users:		https://github.com/ibm-capi/libcxl
 
 What:           /sys/class/cxl/<afu>/cr<config num>/config
 Date:           February 2015
@@ -126,6 +138,7 @@ Description:    read only
 		record. The format is expected to match the either the standard
 		or extended configuration space defined by the PCIe
 		specification.
+Users:		https://github.com/ibm-capi/libcxl
 
 
@@ -137,18 +150,21 @@ Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    read only
                 Decimal value of the size of the MMIO space that may be mmaped
                 by userspace. This includes all slave contexts space also.
+Users:		https://github.com/ibm-capi/libcxl
 
 What:           /sys/class/cxl/<afu>m/pp_mmio_len
 Date:           September 2014
 Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    read only
                 Decimal value of the Per Process MMIO space length.
+Users:		https://github.com/ibm-capi/libcxl
 
 What:           /sys/class/cxl/<afu>m/pp_mmio_off
 Date:           September 2014
 Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    read only
                 Decimal value of the Per Process MMIO space offset.
+Users:		https://github.com/ibm-capi/libcxl
 
 
 Card info (eg. /sys/class/cxl/card0)
@@ -158,12 +174,14 @@ Date:           September 2014
 Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    read only
                 Identifies the CAIA Version the card implements.
+Users:		https://github.com/ibm-capi/libcxl
 
 What:           /sys/class/cxl/<card>/psl_revision
 Date:           September 2014
 Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    read only
                 Identifies the revision level of the PSL.
+Users:		https://github.com/ibm-capi/libcxl
 
 What:           /sys/class/cxl/<card>/base_image
 Date:           September 2014
@@ -173,6 +191,7 @@ Description:    read only
                 that support loadable PSLs. For FPGAs this field identifies
                 the image contained in the on-adapter flash which is loaded
                 during the initial program load.
+Users:		https://github.com/ibm-capi/libcxl
 
 What:           /sys/class/cxl/<card>/image_loaded
 Date:           September 2014
@@ -180,6 +199,7 @@ Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    read only
                 Will return "user" or "factory" depending on the image loaded
                 onto the card.
+Users:		https://github.com/ibm-capi/libcxl
 
 What:           /sys/class/cxl/<card>/load_image_on_perst
 Date:           December 2014
@@ -194,6 +214,7 @@ Description:    read/write
                 user or factory image to be loaded.
                 Default is to reload on PERST whichever image the card has
                 loaded.
+Users:		https://github.com/ibm-capi/libcxl
 
 What:           /sys/class/cxl/<card>/reset
 Date:           October 2014
@@ -201,3 +222,4 @@ Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    write only
                 Writing 1 will issue a PERST to card which may cause the card
                 to reload the FPGA depending on load_image_on_perst.
+Users:		https://github.com/ibm-capi/libcxl
diff --git a/Documentation/powerpc/cxl.txt b/Documentation/powerpc/cxl.txt
index 2c71ecc..2a230d01 100644
--- a/Documentation/powerpc/cxl.txt
+++ b/Documentation/powerpc/cxl.txt
@@ -133,6 +133,9 @@ User API
     The following file operations are supported on both slave and
     master devices.
 
+    A userspace library libcxl is avaliable here:
+	https://github.com/ibm-capi/libcxl
+    This provides a C interface to this kernel API.
 
 open
 ----
@@ -366,6 +369,7 @@ Sysfs Class
     enumeration and tuning of the accelerators. Its layout is
     described in Documentation/ABI/testing/sysfs-class-cxl
 
+
 Udev rules
 ==========
 
-- 
cgit v0.10.2


From aa70775e9ae1f7406fcfb877e6dbb9c13aa34840 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:07:02 +1000
Subject: cxl: Add shutdown hook

Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index c3671b3..d966859 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -1170,4 +1170,5 @@ struct pci_driver cxl_pci_driver = {
 	.id_table = cxl_pci_tbl,
 	.probe = cxl_probe,
 	.remove = cxl_remove,
+	.shutdown = cxl_remove,
 };
-- 
cgit v0.10.2


From 69c3a73c81a648daeac08225ff08b168472ff27e Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:07:03 +1000
Subject: cxl: Remove unnecessarily verbose print in cxl_remove()

Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index d966859..ef24e21 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -1154,8 +1154,6 @@ static void cxl_remove(struct pci_dev *dev)
 	struct cxl *adapter = pci_get_drvdata(dev);
 	int afu;
 
-	dev_warn(&dev->dev, "pci remove\n");
-
 	/*
 	 * Lock to prevent someone grabbing a ref through the adapter list as
 	 * we are removing it
-- 
cgit v0.10.2


From bee30c7045847524be1b4b6b23a303c6763a96f9 Mon Sep 17 00:00:00 2001
From: Ian Munsie <imunsie@au1.ibm.com>
Date: Wed, 27 May 2015 16:07:04 +1000
Subject: cxl: Re-order card init to check the VSEC earlier

When we expose AFUs as virtual PCI devices, they may look like the physical
CAPI PCI card.  ie they may have the same vendor/device IDs.

We want to avoid these AFUs binding to this driver and any init this driver may
do.

Re-order card init to check the VSEC earlier before assigning BARs or
activating CXL.  Also change the dev used in early prints as the adapter struct
may not be inited at this earlier stage.

Signed-off-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index ef24e21..787f966 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -917,13 +917,13 @@ static int cxl_read_vsec(struct cxl *adapter, struct pci_dev *dev)
 	u8 image_state;
 
 	if (!(vsec = find_cxl_vsec(dev))) {
-		dev_err(&adapter->dev, "ABORTING: CXL VSEC not found!\n");
+		dev_err(&dev->dev, "ABORTING: CXL VSEC not found!\n");
 		return -ENODEV;
 	}
 
 	CXL_READ_VSEC_LENGTH(dev, vsec, &vseclen);
 	if (vseclen < CXL_VSEC_MIN_SIZE) {
-		pr_err("ABORTING: CXL VSEC too short\n");
+		dev_err(&dev->dev, "ABORTING: CXL VSEC too short\n");
 		return -EINVAL;
 	}
 
@@ -962,24 +962,24 @@ static int cxl_vsec_looks_ok(struct cxl *adapter, struct pci_dev *dev)
 		return -EBUSY;
 
 	if (adapter->vsec_status & CXL_UNSUPPORTED_FEATURES) {
-		dev_err(&adapter->dev, "ABORTING: CXL requires unsupported features\n");
+		dev_err(&dev->dev, "ABORTING: CXL requires unsupported features\n");
 		return -EINVAL;
 	}
 
 	if (!adapter->slices) {
 		/* Once we support dynamic reprogramming we can use the card if
 		 * it supports loadable AFUs */
-		dev_err(&adapter->dev, "ABORTING: Device has no AFUs\n");
+		dev_err(&dev->dev, "ABORTING: Device has no AFUs\n");
 		return -EINVAL;
 	}
 
 	if (!adapter->afu_desc_off || !adapter->afu_desc_size) {
-		dev_err(&adapter->dev, "ABORTING: VSEC shows no AFU descriptors\n");
+		dev_err(&dev->dev, "ABORTING: VSEC shows no AFU descriptors\n");
 		return -EINVAL;
 	}
 
 	if (adapter->ps_size > p2_size(dev) - adapter->ps_off) {
-		dev_err(&adapter->dev, "ABORTING: Problem state size larger than "
+		dev_err(&dev->dev, "ABORTING: Problem state size larger than "
 				   "available in BAR2: 0x%llx > 0x%llx\n",
 			 adapter->ps_size, p2_size(dev) - adapter->ps_off);
 		return -EINVAL;
@@ -1028,6 +1028,15 @@ static struct cxl *cxl_init_adapter(struct pci_dev *dev)
 	if (!(adapter = cxl_alloc_adapter(dev)))
 		return ERR_PTR(-ENOMEM);
 
+	if ((rc = cxl_read_vsec(adapter, dev)))
+		goto err1;
+
+	if ((rc = cxl_vsec_looks_ok(adapter, dev)))
+		goto err1;
+
+	if ((rc = setup_cxl_bars(dev)))
+		goto err1;
+
 	if ((rc = switch_card_to_cxl(dev)))
 		goto err1;
 
@@ -1037,12 +1046,6 @@ static struct cxl *cxl_init_adapter(struct pci_dev *dev)
 	if ((rc = dev_set_name(&adapter->dev, "card%i", adapter->adapter_num)))
 		goto err2;
 
-	if ((rc = cxl_read_vsec(adapter, dev)))
-		goto err2;
-
-	if ((rc = cxl_vsec_looks_ok(adapter, dev)))
-		goto err2;
-
 	if ((rc = cxl_update_image_control(adapter)))
 		goto err2;
 
@@ -1127,9 +1130,6 @@ static int cxl_probe(struct pci_dev *dev, const struct pci_device_id *id)
 	if (cxl_verbose)
 		dump_cxl_config_space(dev);
 
-	if ((rc = setup_cxl_bars(dev)))
-		return rc;
-
 	if ((rc = pci_enable_device(dev))) {
 		dev_err(&dev->dev, "pci_enable_device failed: %i\n", rc);
 		return rc;
-- 
cgit v0.10.2


From 7f436b534d5a2277b4826e252a6f622e7986bbd3 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:07:05 +1000
Subject: cxl: Fix error path on probe

When probing we call pci_enable_device() but don't call pci_disable_device() on
fail. This causes refcounting issues in the PCI subsystem if a second driver
tries to bind to the same device.

This patch adds the pci_disable_device() to the probe error path. This error
path is hit when this cxl driver tries to bind to AFUs (on the vPHB) rather
than the physical device.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 787f966..4a26e5f 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -1138,6 +1138,7 @@ static int cxl_probe(struct pci_dev *dev, const struct pci_device_id *id)
 	adapter = cxl_init_adapter(dev);
 	if (IS_ERR(adapter)) {
 		dev_err(&dev->dev, "cxl_init_adapter failed: %li\n", PTR_ERR(adapter));
+		pci_disable_device(dev);
 		return PTR_ERR(adapter);
 	}
 
-- 
cgit v0.10.2


From bfcdc8fffe738c4c06673ed0e447a718b43b7311 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:07:06 +1000
Subject: cxl: Dump debug info on the AFU configuration record

Now that we parse the AFU Configuration record, dump some info on it when in
debug mode.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 4a26e5f..12fa406 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -90,6 +90,7 @@
 /* This works a little different than the p1/p2 register accesses to make it
  * easier to pull out individual fields */
 #define AFUD_READ(afu, off)		in_be64(afu->afu_desc_mmio + off)
+#define AFUD_READ_LE(afu, off)		in_le64(afu->afu_desc_mmio + off)
 #define EXTRACT_PPC_BIT(val, bit)	(!!(val & PPC_BIT(bit)))
 #define EXTRACT_PPC_BITS(val, bs, be)	((val & PPC_BITMASK(bs, be)) >> PPC_BITLSHIFT(be))
 
@@ -286,7 +287,8 @@ static void dump_cxl_config_space(struct pci_dev *dev)
 
 static void dump_afu_descriptor(struct cxl_afu *afu)
 {
-	u64 val;
+	u64 val, afu_cr_num, afu_cr_off, afu_cr_len;
+	int i;
 
 #define show_reg(name, what) \
 	dev_info(&afu->dev, "afu desc: %30s: %#llx\n", name, what)
@@ -296,6 +298,7 @@ static void dump_afu_descriptor(struct cxl_afu *afu)
 	show_reg("num_of_processes", AFUD_NUM_PROCS(val));
 	show_reg("num_of_afu_CRs", AFUD_NUM_CRS(val));
 	show_reg("req_prog_mode", val & 0xffffULL);
+	afu_cr_num = AFUD_NUM_CRS(val);
 
 	val = AFUD_READ(afu, 0x8);
 	show_reg("Reserved", val);
@@ -307,8 +310,10 @@ static void dump_afu_descriptor(struct cxl_afu *afu)
 	val = AFUD_READ_CR(afu);
 	show_reg("Reserved", (val >> (63-7)) & 0xff);
 	show_reg("AFU_CR_len", AFUD_CR_LEN(val));
+	afu_cr_len = AFUD_CR_LEN(val) * 256;
 
 	val = AFUD_READ_CR_OFF(afu);
+	afu_cr_off = val;
 	show_reg("AFU_CR_offset", val);
 
 	val = AFUD_READ_PPPSA(afu);
@@ -325,6 +330,11 @@ static void dump_afu_descriptor(struct cxl_afu *afu)
 	val = AFUD_READ_EB_OFF(afu);
 	show_reg("AFU_EB_offset", val);
 
+	for (i = 0; i < afu_cr_num; i++) {
+		val = AFUD_READ_LE(afu, afu_cr_off + i * afu_cr_len);
+		show_reg("CR Vendor", val & 0xffff);
+		show_reg("CR Device", (val >> 16) & 0xffff);
+	}
 #undef show_reg
 }
 
-- 
cgit v0.10.2


From 6428832a7bfae73345706d63a228a6ce60af0081 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:07:07 +1000
Subject: cxl: Add cookie parameter to afu_release_irqs()

Add cookie parameter to afu_release_irqs() so that we can pass in a different
cookie than the context structure.  This will be useful for other kernel
drivers that want to call this but get their own cookie back in the interrupt
handler.

Update all existing call sites.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 78ce990..36bb8e4 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -186,7 +186,7 @@ static void __detach_context(struct cxl_context *ctx)
 		return;
 
 	WARN_ON(cxl_detach_process(ctx));
-	afu_release_irqs(ctx);
+	afu_release_irqs(ctx, ctx);
 	flush_work(&ctx->fault_work); /* Only needed for dedicated process */
 	wake_up_all(&ctx->wq);
 }
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index b361b48..a4dadc5 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -613,7 +613,7 @@ void cxl_release_psl_err_irq(struct cxl *adapter);
 int cxl_register_serr_irq(struct cxl_afu *afu);
 void cxl_release_serr_irq(struct cxl_afu *afu);
 int afu_register_irqs(struct cxl_context *ctx, u32 count);
-void afu_release_irqs(struct cxl_context *ctx);
+void afu_release_irqs(struct cxl_context *ctx, void *cookie);
 irqreturn_t cxl_slice_irq_err(int irq, void *data);
 
 int cxl_debugfs_init(void);
diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c
index 9855234..676e2c8 100644
--- a/drivers/misc/cxl/file.c
+++ b/drivers/misc/cxl/file.c
@@ -191,7 +191,7 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
 
 	if ((rc = cxl_attach_process(ctx, false, work.work_element_descriptor,
 				     amr))) {
-		afu_release_irqs(ctx);
+		afu_release_irqs(ctx, ctx);
 		goto out;
 	}
 
diff --git a/drivers/misc/cxl/irq.c b/drivers/misc/cxl/irq.c
index c8929c5..c740c7b 100644
--- a/drivers/misc/cxl/irq.c
+++ b/drivers/misc/cxl/irq.c
@@ -477,7 +477,7 @@ out:
 	return -ENOMEM;
 }
 
-void afu_release_irqs(struct cxl_context *ctx)
+void afu_release_irqs(struct cxl_context *ctx, void *cookie)
 {
 	irq_hw_number_t hwirq;
 	unsigned int virq;
@@ -488,7 +488,7 @@ void afu_release_irqs(struct cxl_context *ctx)
 		for (i = 0; i < ctx->irqs.range[r]; hwirq++, i++) {
 			virq = irq_find_mapping(NULL, hwirq);
 			if (virq)
-				cxl_unmap_irq(virq, ctx);
+				cxl_unmap_irq(virq, cookie);
 		}
 	}
 
-- 
cgit v0.10.2


From eda3693c842ed169af66af943554c648633769d0 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:07:08 +1000
Subject: cxl: Rework detach context functions

Rework __detach_context() and cxl_context_detach() so we can reuse them in the
kernel API.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 36bb8e4..7d857b7 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -174,7 +174,7 @@ int cxl_context_iomap(struct cxl_context *ctx, struct vm_area_struct *vma)
  * return until all outstanding interrupts for this context have completed. The
  * hardware should no longer access *ctx after this has returned.
  */
-static void __detach_context(struct cxl_context *ctx)
+int __detach_context(struct cxl_context *ctx)
 {
 	enum cxl_context_status status;
 
@@ -183,12 +183,10 @@ static void __detach_context(struct cxl_context *ctx)
 	ctx->status = CLOSED;
 	mutex_unlock(&ctx->status_mutex);
 	if (status != STARTED)
-		return;
+		return -EBUSY;
 
 	WARN_ON(cxl_detach_process(ctx));
-	afu_release_irqs(ctx, ctx);
-	flush_work(&ctx->fault_work); /* Only needed for dedicated process */
-	wake_up_all(&ctx->wq);
+	return 0;
 }
 
 /*
@@ -199,7 +197,15 @@ static void __detach_context(struct cxl_context *ctx)
  */
 void cxl_context_detach(struct cxl_context *ctx)
 {
-	__detach_context(ctx);
+	int rc;
+
+	rc = __detach_context(ctx);
+	if (rc)
+		return;
+
+	afu_release_irqs(ctx, ctx);
+	flush_work(&ctx->fault_work); /* Only needed for dedicated process */
+	wake_up_all(&ctx->wq);
 }
 
 /*
@@ -216,7 +222,7 @@ void cxl_context_detach_all(struct cxl_afu *afu)
 		 * Anything done in here needs to be setup before the IDR is
 		 * created and torn down after the IDR removed
 		 */
-		__detach_context(ctx);
+		cxl_context_detach(ctx);
 
 		/*
 		 * We are force detaching - remove any active PSA mappings so
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index a4dadc5..7b7509c 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -636,6 +636,7 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master,
 		     struct address_space *mapping);
 void cxl_context_free(struct cxl_context *ctx);
 int cxl_context_iomap(struct cxl_context *ctx, struct vm_area_struct *vma);
+int __detach_context(struct cxl_context *ctx);
 
 /* This matches the layout of the H_COLLECT_CA_INT_INFO retbuf */
 struct cxl_irq_info {
-- 
cgit v0.10.2


From b12994fbfe9362e3b5ed9f360d3cd6fb34d6645a Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:07:09 +1000
Subject: cxl: cxl_afu_reset() -> __cxl_afu_reset()

Rename cxl_afu_reset() to __cxl_afu_reset() to we can reuse this function name
in the API.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 7b7509c..d6570a7 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -661,7 +661,7 @@ int cxl_check_error(struct cxl_afu *afu);
 int cxl_afu_slbia(struct cxl_afu *afu);
 int cxl_tlb_slb_invalidate(struct cxl *adapter);
 int cxl_afu_disable(struct cxl_afu *afu);
-int cxl_afu_reset(struct cxl_afu *afu);
+int __cxl_afu_reset(struct cxl_afu *afu);
 int cxl_psl_purge(struct cxl_afu *afu);
 
 void cxl_stop_trace(struct cxl *cxl);
diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 29185fc..f85b6ae 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -73,7 +73,7 @@ int cxl_afu_disable(struct cxl_afu *afu)
 }
 
 /* This will disable as well as reset */
-int cxl_afu_reset(struct cxl_afu *afu)
+int __cxl_afu_reset(struct cxl_afu *afu)
 {
 	pr_devel("AFU reset request\n");
 
@@ -495,7 +495,7 @@ static int deactivate_afu_directed(struct cxl_afu *afu)
 	cxl_sysfs_afu_m_remove(afu);
 	cxl_chardev_afu_remove(afu);
 
-	cxl_afu_reset(afu);
+	__cxl_afu_reset(afu);
 	cxl_afu_disable(afu);
 	cxl_psl_purge(afu);
 
@@ -566,7 +566,7 @@ static int attach_dedicated(struct cxl_context *ctx, u64 wed, u64 amr)
 	/* master only context for dedicated */
 	assign_psn_space(ctx);
 
-	if ((rc = cxl_afu_reset(afu)))
+	if ((rc = __cxl_afu_reset(afu)))
 		return rc;
 
 	cxl_p2n_write(afu, CXL_PSL_WED_An, wed);
@@ -629,7 +629,7 @@ int cxl_attach_process(struct cxl_context *ctx, bool kernel, u64 wed, u64 amr)
 
 static inline int detach_process_native_dedicated(struct cxl_context *ctx)
 {
-	cxl_afu_reset(ctx->afu);
+	__cxl_afu_reset(ctx->afu);
 	cxl_afu_disable(ctx->afu);
 	cxl_psl_purge(ctx->afu);
 	return 0;
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 12fa406..88de2ce 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -657,7 +657,7 @@ static int sanitise_afu_regs(struct cxl_afu *afu)
 	reg = cxl_p2n_read(afu, CXL_AFU_Cntl_An);
 	if ((reg & CXL_AFU_Cntl_An_ES_MASK) != CXL_AFU_Cntl_An_ES_Disabled) {
 		dev_warn(&afu->dev, "WARNING: AFU was not disabled: %#.16llx\n", reg);
-		if (cxl_afu_reset(afu))
+		if (__cxl_afu_reset(afu))
 			return -EIO;
 		if (cxl_afu_disable(afu))
 			return -EIO;
@@ -761,7 +761,7 @@ static int cxl_init_afu(struct cxl *adapter, int slice, struct pci_dev *dev)
 		goto err2;
 
 	/* We need to reset the AFU before we can read the AFU descriptor */
-	if ((rc = cxl_afu_reset(afu)))
+	if ((rc = __cxl_afu_reset(afu)))
 		goto err2;
 
 	if (cxl_verbose)
diff --git a/drivers/misc/cxl/sysfs.c b/drivers/misc/cxl/sysfs.c
index 4942d55..31f38bc 100644
--- a/drivers/misc/cxl/sysfs.c
+++ b/drivers/misc/cxl/sysfs.c
@@ -185,7 +185,7 @@ static ssize_t reset_store_afu(struct device *device,
 		goto err;
 	}
 
-	if ((rc = cxl_afu_reset(afu)))
+	if ((rc = __cxl_afu_reset(afu)))
 		goto err;
 
 	rc = count;
-- 
cgit v0.10.2


From 1a1a94b876b9159d2b10675d8f680f7bbedc163e Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:07:10 +1000
Subject: cxl: Export some symbols

Export some symbols which will soon be used elsewhere in this driver.

Now they are global we rename them so to avoid collisions.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index d6570a7..eccddfe 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -636,6 +636,9 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master,
 		     struct address_space *mapping);
 void cxl_context_free(struct cxl_context *ctx);
 int cxl_context_iomap(struct cxl_context *ctx, struct vm_area_struct *vma);
+unsigned int cxl_map_irq(struct cxl *adapter, irq_hw_number_t hwirq,
+			 irq_handler_t handler, void *cookie, const char *name);
+void cxl_unmap_irq(unsigned int virq, void *cookie);
 int __detach_context(struct cxl_context *ctx);
 
 /* This matches the layout of the H_COLLECT_CA_INT_INFO retbuf */
@@ -650,6 +653,7 @@ struct cxl_irq_info {
 	u64 padding[3]; /* to match the expected retbuf size for plpar_hcall9 */
 };
 
+void cxl_assign_psn_space(struct cxl_context *ctx);
 int cxl_attach_process(struct cxl_context *ctx, bool kernel, u64 wed,
 			    u64 amr);
 int cxl_detach_process(struct cxl_context *ctx);
@@ -662,6 +666,7 @@ int cxl_afu_slbia(struct cxl_afu *afu);
 int cxl_tlb_slb_invalidate(struct cxl *adapter);
 int cxl_afu_disable(struct cxl_afu *afu);
 int __cxl_afu_reset(struct cxl_afu *afu);
+int cxl_afu_check_and_enable(struct cxl_afu *afu);
 int cxl_psl_purge(struct cxl_afu *afu);
 
 void cxl_stop_trace(struct cxl *cxl);
diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index f85b6ae..615ccbf 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -83,7 +83,7 @@ int __cxl_afu_reset(struct cxl_afu *afu)
 			   false);
 }
 
-static int afu_check_and_enable(struct cxl_afu *afu)
+int cxl_afu_check_and_enable(struct cxl_afu *afu)
 {
 	if (afu->enabled)
 		return 0;
@@ -379,7 +379,7 @@ static int remove_process_element(struct cxl_context *ctx)
 }
 
 
-static void assign_psn_space(struct cxl_context *ctx)
+void cxl_assign_psn_space(struct cxl_context *ctx)
 {
 	if (!ctx->afu->pp_size || ctx->master) {
 		ctx->psn_phys = ctx->afu->psn_phys;
@@ -435,7 +435,7 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
 	u64 sr;
 	int r, result;
 
-	assign_psn_space(ctx);
+	cxl_assign_psn_space(ctx);
 
 	ctx->elem->ctxtime = 0; /* disable */
 	ctx->elem->lpid = cpu_to_be32(mfspr(SPRN_LPID));
@@ -477,7 +477,7 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
 	ctx->elem->common.wed = cpu_to_be64(wed);
 
 	/* first guy needs to enable */
-	if ((result = afu_check_and_enable(ctx->afu)))
+	if ((result = cxl_afu_check_and_enable(ctx->afu)))
 		return result;
 
 	add_process_element(ctx);
@@ -564,7 +564,7 @@ static int attach_dedicated(struct cxl_context *ctx, u64 wed, u64 amr)
 	cxl_p2n_write(afu, CXL_PSL_AMR_An, amr);
 
 	/* master only context for dedicated */
-	assign_psn_space(ctx);
+	cxl_assign_psn_space(ctx);
 
 	if ((rc = __cxl_afu_reset(afu)))
 		return rc;
-- 
cgit v0.10.2


From a6b07d825750f76eeb2972c70a8f0e6e62566954 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:07:11 +1000
Subject: cxl: Only check pid for userspace contexts

We only need to check the pid attached to this context for userspace contexts.
Kernel contexts can skip this check.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c
index 5286b8b..25a5418 100644
--- a/drivers/misc/cxl/fault.c
+++ b/drivers/misc/cxl/fault.c
@@ -172,8 +172,8 @@ void cxl_handle_fault(struct work_struct *fault_work)
 		container_of(fault_work, struct cxl_context, fault_work);
 	u64 dsisr = ctx->dsisr;
 	u64 dar = ctx->dar;
-	struct task_struct *task;
-	struct mm_struct *mm;
+	struct task_struct *task = NULL;
+	struct mm_struct *mm = NULL;
 
 	if (cxl_p2n_read(ctx->afu, CXL_PSL_DSISR_An) != dsisr ||
 	    cxl_p2n_read(ctx->afu, CXL_PSL_DAR_An) != dar ||
@@ -194,17 +194,19 @@ void cxl_handle_fault(struct work_struct *fault_work)
 	pr_devel("CXL BOTTOM HALF handling fault for afu pe: %i. "
 		"DSISR: %#llx DAR: %#llx\n", ctx->pe, dsisr, dar);
 
-	if (!(task = get_pid_task(ctx->pid, PIDTYPE_PID))) {
-		pr_devel("cxl_handle_fault unable to get task %i\n",
-			 pid_nr(ctx->pid));
-		cxl_ack_ae(ctx);
-		return;
-	}
-	if (!(mm = get_task_mm(task))) {
-		pr_devel("cxl_handle_fault unable to get mm %i\n",
-			 pid_nr(ctx->pid));
-		cxl_ack_ae(ctx);
-		goto out;
+	if (!ctx->kernel) {
+		if (!(task = get_pid_task(ctx->pid, PIDTYPE_PID))) {
+			pr_devel("cxl_handle_fault unable to get task %i\n",
+				 pid_nr(ctx->pid));
+			cxl_ack_ae(ctx);
+			return;
+		}
+		if (!(mm = get_task_mm(task))) {
+			pr_devel("cxl_handle_fault unable to get mm %i\n",
+				 pid_nr(ctx->pid));
+			cxl_ack_ae(ctx);
+			goto out;
+		}
 	}
 
 	if (dsisr & CXL_PSL_DSISR_An_DS)
@@ -214,9 +216,11 @@ void cxl_handle_fault(struct work_struct *fault_work)
 	else
 		WARN(1, "cxl_handle_fault has nothing to handle\n");
 
-	mmput(mm);
+	if (mm)
+		mmput(mm);
 out:
-	put_task_struct(task);
+	if (task)
+		put_task_struct(task);
 }
 
 static void cxl_prefault_one(struct cxl_context *ctx, u64 ea)
-- 
cgit v0.10.2


From c358d84b4e57b060e0287d0c614f8f076d725612 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:07:12 +1000
Subject: cxl: Split afu_register_irqs() function

Split the afu_register_irqs() function so that different parts can
be useful elsewhere.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index eccddfe..778161a 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -672,5 +672,6 @@ int cxl_psl_purge(struct cxl_afu *afu);
 void cxl_stop_trace(struct cxl *cxl);
 
 extern struct pci_driver cxl_pci_driver;
+int afu_allocate_irqs(struct cxl_context *ctx, u32 count);
 
 #endif
diff --git a/drivers/misc/cxl/irq.c b/drivers/misc/cxl/irq.c
index c740c7b..212790b 100644
--- a/drivers/misc/cxl/irq.c
+++ b/drivers/misc/cxl/irq.c
@@ -416,9 +416,8 @@ void afu_irq_name_free(struct cxl_context *ctx)
 	}
 }
 
-int afu_register_irqs(struct cxl_context *ctx, u32 count)
+int afu_allocate_irqs(struct cxl_context *ctx, u32 count)
 {
-	irq_hw_number_t hwirq;
 	int rc, r, i, j = 1;
 	struct cxl_irq_name *irq_name;
 
@@ -458,6 +457,18 @@ int afu_register_irqs(struct cxl_context *ctx, u32 count)
 			j++;
 		}
 	}
+	return 0;
+
+out:
+	afu_irq_name_free(ctx);
+	return -ENOMEM;
+}
+
+void afu_register_hwirqs(struct cxl_context *ctx)
+{
+	irq_hw_number_t hwirq;
+	struct cxl_irq_name *irq_name;
+	int r,i;
 
 	/* We've allocated all memory now, so let's do the irq allocations */
 	irq_name = list_first_entry(&ctx->irq_names, struct cxl_irq_name, list);
@@ -469,13 +480,19 @@ int afu_register_irqs(struct cxl_context *ctx, u32 count)
 			irq_name = list_next_entry(irq_name, list);
 		}
 	}
+}
 
-	return 0;
+int afu_register_irqs(struct cxl_context *ctx, u32 count)
+{
+	int rc;
 
-out:
-	afu_irq_name_free(ctx);
-	return -ENOMEM;
-}
+	rc = afu_allocate_irqs(ctx, count);
+	if (rc)
+		return rc;
+
+	afu_register_hwirqs(ctx);
+	return 0;
+ }
 
 void afu_release_irqs(struct cxl_context *ctx, void *cookie)
 {
-- 
cgit v0.10.2


From 2f663527bd6aa94763aafd3a22332903815fbf81 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:07:13 +1000
Subject: cxl: Configure PSL for kernel contexts and merge code

This updates AFU directed and dedicated modes for contexts attached to the
kernel.

The SR (similar to the MSR in the core) calculation is getting
quite complex and is duplicated in AFU directed and dedicated
modes.  This patch also merges this SR calculation for these modes.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 615ccbf..a4b40d7 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -430,9 +430,30 @@ err:
 #define set_endian(sr) ((sr) &= ~(CXL_PSL_SR_An_LE))
 #endif
 
+static u64 calculate_sr(struct cxl_context *ctx)
+{
+	u64 sr = 0;
+
+	if (ctx->master)
+		sr |= CXL_PSL_SR_An_MP;
+	if (mfspr(SPRN_LPCR) & LPCR_TC)
+		sr |= CXL_PSL_SR_An_TC;
+	if (ctx->kernel) {
+		sr |= CXL_PSL_SR_An_R | (mfmsr() & MSR_SF);
+		sr |= CXL_PSL_SR_An_HV;
+	} else {
+		sr |= CXL_PSL_SR_An_PR | CXL_PSL_SR_An_R;
+		set_endian(sr);
+		sr &= ~(CXL_PSL_SR_An_HV);
+		if (!test_tsk_thread_flag(current, TIF_32BIT))
+			sr |= CXL_PSL_SR_An_SF;
+	}
+	return sr;
+}
+
 static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
 {
-	u64 sr;
+	u32 pid;
 	int r, result;
 
 	cxl_assign_psn_space(ctx);
@@ -442,22 +463,13 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
 	ctx->elem->haurp = 0; /* disable */
 	ctx->elem->sdr = cpu_to_be64(mfspr(SPRN_SDR1));
 
-	sr = 0;
-	if (ctx->master)
-		sr |= CXL_PSL_SR_An_MP;
-	if (mfspr(SPRN_LPCR) & LPCR_TC)
-		sr |= CXL_PSL_SR_An_TC;
-	/* HV=0, PR=1, R=1 for userspace
-	 * For kernel contexts: this would need to change
-	 */
-	sr |= CXL_PSL_SR_An_PR | CXL_PSL_SR_An_R;
-	set_endian(sr);
-	sr &= ~(CXL_PSL_SR_An_HV);
-	if (!test_tsk_thread_flag(current, TIF_32BIT))
-		sr |= CXL_PSL_SR_An_SF;
-	ctx->elem->common.pid = cpu_to_be32(current->pid);
+	pid = current->pid;
+	if (ctx->kernel)
+		pid = 0;
 	ctx->elem->common.tid = 0;
-	ctx->elem->sr = cpu_to_be64(sr);
+	ctx->elem->common.pid = cpu_to_be32(pid);
+
+	ctx->elem->sr = cpu_to_be64(calculate_sr(ctx));
 
 	ctx->elem->common.csrp = 0; /* disable */
 	ctx->elem->common.aurp0 = 0; /* disable */
@@ -530,20 +542,15 @@ static int activate_dedicated_process(struct cxl_afu *afu)
 static int attach_dedicated(struct cxl_context *ctx, u64 wed, u64 amr)
 {
 	struct cxl_afu *afu = ctx->afu;
-	u64 sr;
+	u64 pid;
 	int rc;
 
-	sr = 0;
-	set_endian(sr);
-	if (ctx->master)
-		sr |= CXL_PSL_SR_An_MP;
-	if (mfspr(SPRN_LPCR) & LPCR_TC)
-		sr |= CXL_PSL_SR_An_TC;
-	sr |= CXL_PSL_SR_An_PR | CXL_PSL_SR_An_R;
-	if (!test_tsk_thread_flag(current, TIF_32BIT))
-		sr |= CXL_PSL_SR_An_SF;
-	cxl_p2n_write(afu, CXL_PSL_PID_TID_An, (u64)current->pid << 32);
-	cxl_p1n_write(afu, CXL_PSL_SR_An, sr);
+	pid = (u64)current->pid << 32;
+	if (ctx->kernel)
+		pid = 0;
+	cxl_p2n_write(afu, CXL_PSL_PID_TID_An, pid);
+
+	cxl_p1n_write(afu, CXL_PSL_SR_An, calculate_sr(ctx));
 
 	if ((rc = cxl_write_sstp(afu, ctx->sstp0, ctx->sstp1)))
 		return rc;
-- 
cgit v0.10.2


From 7bb5d91a4dda92e28b2704a3e7ebe94260ccd6f1 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:07:14 +1000
Subject: cxl: Rework context lifetimes

This reworks contexts lifetimes a bit to enable the kernel API where we may
want to reuse contexts. Here we will want to start and stop contexts without
freeing them.

Start context does the get pid & ctx so stop context will need to do the puts.
Here we move put pid & ctx to the detach context path which will become part of
the stop context path.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 7d857b7..2a4c80a 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -186,6 +186,9 @@ int __detach_context(struct cxl_context *ctx)
 		return -EBUSY;
 
 	WARN_ON(cxl_detach_process(ctx));
+	flush_work(&ctx->fault_work); /* Only needed for dedicated process */
+	put_pid(ctx->pid);
+	cxl_ctx_put();
 	return 0;
 }
 
@@ -204,7 +207,6 @@ void cxl_context_detach(struct cxl_context *ctx)
 		return;
 
 	afu_release_irqs(ctx, ctx);
-	flush_work(&ctx->fault_work); /* Only needed for dedicated process */
 	wake_up_all(&ctx->wq);
 }
 
@@ -245,7 +247,6 @@ static void reclaim_ctx(struct rcu_head *rcu)
 	free_page((u64)ctx->sstp);
 	ctx->sstp = NULL;
 
-	put_pid(ctx->pid);
 	kfree(ctx);
 }
 
diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c
index 676e2c8..c16bd6f 100644
--- a/drivers/misc/cxl/file.c
+++ b/drivers/misc/cxl/file.c
@@ -128,7 +128,6 @@ static int afu_release(struct inode *inode, struct file *file)
 	 */
 	cxl_context_free(ctx);
 
-	cxl_ctx_put();
 	return 0;
 }
 
-- 
cgit v0.10.2


From 406e12ec0b645f22b16affc71b8aa429d1777f05 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:07:15 +1000
Subject: cxl: Cleanup Makefile

Cleanup Makefile by fixing line wrapping.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/Makefile b/drivers/misc/cxl/Makefile
index edb494d..f9f5514 100644
--- a/drivers/misc/cxl/Makefile
+++ b/drivers/misc/cxl/Makefile
@@ -1,4 +1,5 @@
-cxl-y				+= main.o file.o irq.o fault.o native.o context.o sysfs.o debugfs.o pci.o trace.o
+cxl-y				+= main.o file.o irq.o fault.o native.o
+cxl-y				+= context.o sysfs.o debugfs.o pci.o trace.o
 obj-$(CONFIG_CXL)		+= cxl.o
 obj-$(CONFIG_CXL_BASE)		+= base.o
 
-- 
cgit v0.10.2


From ec249dd860ed88e15b3e2bd363cbfc76ba8c1884 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:07:16 +1000
Subject: cxl: Move include file cxl.h -> cxl-base.h

This moves the current include file from cxl.h -> cxl-base.h.  This current
include file is used only to pass information between the base driver that
needs to be built into the kernel and the cxl module.

This is to make way for a new include/misc/cxl.h which will
contain just the kernel API for other driver to use

Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/MAINTAINERS b/MAINTAINERS
index 562ae4e..80627dd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2946,7 +2946,7 @@ M:	Michael Neuling <mikey@neuling.org>
 L:	linuxppc-dev@lists.ozlabs.org
 S:	Supported
 F:	drivers/misc/cxl/
-F:	include/misc/cxl.h
+F:	include/misc/cxl*
 F:	include/uapi/misc/cxl.h
 F:	Documentation/powerpc/cxl.txt
 F:	Documentation/powerpc/cxl.txt
diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h
index f9b4982..6f77f71 100644
--- a/arch/powerpc/include/asm/pnv-pci.h
+++ b/arch/powerpc/include/asm/pnv-pci.h
@@ -11,7 +11,7 @@
 #define _ASM_PNV_PCI_H
 
 #include <linux/pci.h>
-#include <misc/cxl.h>
+#include <misc/cxl-base.h>
 
 int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode);
 int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index 7a71d7f..6527882 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -26,7 +26,7 @@
 #include <asm/reg.h>
 #include <asm/copro.h>
 #include <asm/spu.h>
-#include <misc/cxl.h>
+#include <misc/cxl-base.h>
 
 /*
  * This ought to be kept in sync with the powerpc specific do_page_fault
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 9c4880d..13befa35 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -29,7 +29,7 @@
 #include <asm/kexec.h>
 #include <asm/ppc-opcode.h>
 
-#include <misc/cxl.h>
+#include <misc/cxl-base.h>
 
 #ifdef DEBUG_LOW
 #define DBG_LOW(fmt...) udbg_printf(fmt)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index cec43c8..b284185 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -39,7 +39,7 @@
 #include <asm/firmware.h>
 #include <asm/pnv-pci.h>
 
-#include <misc/cxl.h>
+#include <misc/cxl-base.h>
 
 #include "powernv.h"
 #include "pci.h"
diff --git a/drivers/misc/cxl/base.c b/drivers/misc/cxl/base.c
index 0654ad8..a9f0dd3 100644
--- a/drivers/misc/cxl/base.c
+++ b/drivers/misc/cxl/base.c
@@ -10,7 +10,7 @@
 #include <linux/module.h>
 #include <linux/rcupdate.h>
 #include <asm/errno.h>
-#include <misc/cxl.h>
+#include <misc/cxl-base.h>
 #include "cxl.h"
 
 /* protected by rcu */
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 778161a..46d806e 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -21,7 +21,7 @@
 #include <asm/cputable.h>
 #include <asm/mmu.h>
 #include <asm/reg.h>
-#include <misc/cxl.h>
+#include <misc/cxl-base.h>
 
 #include <uapi/misc/cxl.h>
 
diff --git a/drivers/misc/cxl/irq.c b/drivers/misc/cxl/irq.c
index 212790b..680cd26 100644
--- a/drivers/misc/cxl/irq.c
+++ b/drivers/misc/cxl/irq.c
@@ -14,7 +14,7 @@
 #include <linux/slab.h>
 #include <linux/pid.h>
 #include <asm/cputable.h>
-#include <misc/cxl.h>
+#include <misc/cxl-base.h>
 
 #include "cxl.h"
 #include "trace.h"
diff --git a/drivers/misc/cxl/main.c b/drivers/misc/cxl/main.c
index 8ccddce..833348e 100644
--- a/drivers/misc/cxl/main.c
+++ b/drivers/misc/cxl/main.c
@@ -20,7 +20,7 @@
 #include <linux/idr.h>
 #include <linux/pci.h>
 #include <asm/cputable.h>
-#include <misc/cxl.h>
+#include <misc/cxl-base.h>
 
 #include "cxl.h"
 #include "trace.h"
diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index a4b40d7..10567f2 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -15,7 +15,7 @@
 #include <linux/mm.h>
 #include <linux/uaccess.h>
 #include <asm/synch.h>
-#include <misc/cxl.h>
+#include <misc/cxl-base.h>
 
 #include "cxl.h"
 #include "trace.h"
diff --git a/include/misc/cxl-base.h b/include/misc/cxl-base.h
new file mode 100644
index 0000000..5ae9625
--- /dev/null
+++ b/include/misc/cxl-base.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2014 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _MISC_CXL_BASE_H
+#define _MISC_CXL_BASE_H
+
+#ifdef CONFIG_CXL_BASE
+
+#define CXL_IRQ_RANGES 4
+
+struct cxl_irq_ranges {
+	irq_hw_number_t offset[CXL_IRQ_RANGES];
+	irq_hw_number_t range[CXL_IRQ_RANGES];
+};
+
+extern atomic_t cxl_use_count;
+
+static inline bool cxl_ctx_in_use(void)
+{
+       return (atomic_read(&cxl_use_count) != 0);
+}
+
+static inline void cxl_ctx_get(void)
+{
+       atomic_inc(&cxl_use_count);
+}
+
+static inline void cxl_ctx_put(void)
+{
+       atomic_dec(&cxl_use_count);
+}
+
+void cxl_slbia(struct mm_struct *mm);
+
+#else /* CONFIG_CXL_BASE */
+
+static inline bool cxl_ctx_in_use(void) { return false; }
+static inline void cxl_slbia(struct mm_struct *mm) {}
+
+#endif /* CONFIG_CXL_BASE */
+
+#endif
diff --git a/include/misc/cxl.h b/include/misc/cxl.h
deleted file mode 100644
index 975cc78..0000000
--- a/include/misc/cxl.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright 2014 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _MISC_CXL_H
-#define _MISC_CXL_H
-
-#ifdef CONFIG_CXL_BASE
-
-#define CXL_IRQ_RANGES 4
-
-struct cxl_irq_ranges {
-	irq_hw_number_t offset[CXL_IRQ_RANGES];
-	irq_hw_number_t range[CXL_IRQ_RANGES];
-};
-
-extern atomic_t cxl_use_count;
-
-static inline bool cxl_ctx_in_use(void)
-{
-       return (atomic_read(&cxl_use_count) != 0);
-}
-
-static inline void cxl_ctx_get(void)
-{
-       atomic_inc(&cxl_use_count);
-}
-
-static inline void cxl_ctx_put(void)
-{
-       atomic_dec(&cxl_use_count);
-}
-
-void cxl_slbia(struct mm_struct *mm);
-
-#else /* CONFIG_CXL_BASE */
-
-static inline bool cxl_ctx_in_use(void) { return false; }
-static inline void cxl_slbia(struct mm_struct *mm) {}
-
-#endif /* CONFIG_CXL_BASE */
-
-#endif
-- 
cgit v0.10.2


From 0520336afe5de76324c73f793bc40732e5c13359 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:07:17 +1000
Subject: cxl: Export file ops for use by API

The cxl kernel API will allow drivers other than cxl to export a file
descriptor which has the same userspace API.  These file descriptors will be
able to be used against libcxl.

This exports those file ops for use by other drivers.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 46d806e..2ec8e96 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -18,6 +18,7 @@
 #include <linux/pid.h>
 #include <linux/io.h>
 #include <linux/pci.h>
+#include <linux/fs.h>
 #include <asm/cputable.h>
 #include <asm/mmu.h>
 #include <asm/reg.h>
@@ -674,4 +675,12 @@ void cxl_stop_trace(struct cxl *cxl);
 extern struct pci_driver cxl_pci_driver;
 int afu_allocate_irqs(struct cxl_context *ctx, u32 count);
 
+int afu_open(struct inode *inode, struct file *file);
+int afu_release(struct inode *inode, struct file *file);
+long afu_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int afu_mmap(struct file *file, struct vm_area_struct *vm);
+unsigned int afu_poll(struct file *file, struct poll_table_struct *poll);
+ssize_t afu_read(struct file *file, char __user *buf, size_t count, loff_t *off);
+extern const struct file_operations afu_fops;
+
 #endif
diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c
index c16bd6f..e3f4b69 100644
--- a/drivers/misc/cxl/file.c
+++ b/drivers/misc/cxl/file.c
@@ -96,7 +96,8 @@ err_put_adapter:
 	put_device(&adapter->dev);
 	return rc;
 }
-static int afu_open(struct inode *inode, struct file *file)
+
+int afu_open(struct inode *inode, struct file *file)
 {
 	return __afu_open(inode, file, false);
 }
@@ -106,7 +107,7 @@ static int afu_master_open(struct inode *inode, struct file *file)
 	return __afu_open(inode, file, true);
 }
 
-static int afu_release(struct inode *inode, struct file *file)
+int afu_release(struct inode *inode, struct file *file)
 {
 	struct cxl_context *ctx = file->private_data;
 
@@ -230,7 +231,7 @@ static long afu_ioctl_get_afu_id(struct cxl_context *ctx,
 	return 0;
 }
 
-static long afu_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+long afu_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct cxl_context *ctx = file->private_data;
 
@@ -250,13 +251,13 @@ static long afu_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return -EINVAL;
 }
 
-static long afu_compat_ioctl(struct file *file, unsigned int cmd,
+long afu_compat_ioctl(struct file *file, unsigned int cmd,
 			     unsigned long arg)
 {
 	return afu_ioctl(file, cmd, arg);
 }
 
-static int afu_mmap(struct file *file, struct vm_area_struct *vm)
+int afu_mmap(struct file *file, struct vm_area_struct *vm)
 {
 	struct cxl_context *ctx = file->private_data;
 
@@ -267,7 +268,7 @@ static int afu_mmap(struct file *file, struct vm_area_struct *vm)
 	return cxl_context_iomap(ctx, vm);
 }
 
-static unsigned int afu_poll(struct file *file, struct poll_table_struct *poll)
+unsigned int afu_poll(struct file *file, struct poll_table_struct *poll)
 {
 	struct cxl_context *ctx = file->private_data;
 	int mask = 0;
@@ -299,7 +300,7 @@ static inline int ctx_event_pending(struct cxl_context *ctx)
 	    ctx->pending_afu_err || (ctx->status == CLOSED));
 }
 
-static ssize_t afu_read(struct file *file, char __user *buf, size_t count,
+ssize_t afu_read(struct file *file, char __user *buf, size_t count,
 			loff_t *off)
 {
 	struct cxl_context *ctx = file->private_data;
@@ -380,7 +381,11 @@ out:
 	return rc;
 }
 
-static const struct file_operations afu_fops = {
+/* 
+ * Note: if this is updated, we need to update api.c to patch the new ones in
+ * too
+ */
+const struct file_operations afu_fops = {
 	.owner		= THIS_MODULE,
 	.open           = afu_open,
 	.poll		= afu_poll,
@@ -391,7 +396,7 @@ static const struct file_operations afu_fops = {
 	.mmap           = afu_mmap,
 };
 
-static const struct file_operations afu_master_fops = {
+const struct file_operations afu_master_fops = {
 	.owner		= THIS_MODULE,
 	.open           = afu_master_open,
 	.poll		= afu_poll,
-- 
cgit v0.10.2


From 6f7f0b3df6d49316c6f27390bb5ec250255be548 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 27 May 2015 16:07:18 +1000
Subject: cxl: Add AFU virtual PHB and kernel API

This patch does two things.

Firstly it presents the Accelerator Function Unit (AFUs) behind the POWER
Service Layer (PSL) as PCI devices on a virtual PCI Host Bridge (vPHB).  This
in in addition to the PSL being a PCI device itself.

As part of the Coherent Accelerator Interface Architecture (CAIA) AFUs can
provide an AFU configuration.  This AFU configuration recored is architected to
be the same as a PCI config space.

This patch sets discovers the AFU configuration records, provides AFU config
space read/write functions to these configuration records.  It then enumerates
the PCI bus.  It also hooks in PCI ops where appropriate.  It also destroys the
vPHB when the physical card is removed.

Secondly, it add an in kernel API for AFU to use CXL.  AFUs must present a
driver that firstly binds as a PCI device.  This PCI device can then be using
to do CXL specific operations (that can't sit in the PCI ops) using this API.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/Makefile b/drivers/misc/cxl/Makefile
index f9f5514..14e3f82 100644
--- a/drivers/misc/cxl/Makefile
+++ b/drivers/misc/cxl/Makefile
@@ -1,5 +1,6 @@
 cxl-y				+= main.o file.o irq.o fault.o native.o
 cxl-y				+= context.o sysfs.o debugfs.o pci.o trace.o
+cxl-y				+= vphb.o api.o
 obj-$(CONFIG_CXL)		+= cxl.o
 obj-$(CONFIG_CXL_BASE)		+= base.o
 
diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
new file mode 100644
index 0000000..0c77240
--- /dev/null
+++ b/drivers/misc/cxl/api.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright 2014 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <misc/cxl.h>
+
+#include "cxl.h"
+
+struct cxl_context *cxl_dev_context_init(struct pci_dev *dev)
+{
+	struct cxl_afu *afu;
+	struct cxl_context  *ctx;
+	int rc;
+
+	afu = cxl_pci_to_afu(dev);
+
+	ctx = cxl_context_alloc();
+	if (IS_ERR(ctx))
+		return ctx;
+
+	/* Make it a slave context.  We can promote it later? */
+	rc = cxl_context_init(ctx, afu, false, NULL);
+	if (rc) {
+		kfree(ctx);
+		return ERR_PTR(-ENOMEM);
+	}
+	cxl_assign_psn_space(ctx);
+
+	return ctx;
+}
+EXPORT_SYMBOL_GPL(cxl_dev_context_init);
+
+struct cxl_context *cxl_get_context(struct pci_dev *dev)
+{
+	return dev->dev.archdata.cxl_ctx;
+}
+EXPORT_SYMBOL_GPL(cxl_get_context);
+
+struct device *cxl_get_phys_dev(struct pci_dev *dev)
+{
+	struct cxl_afu *afu;
+
+	afu = cxl_pci_to_afu(dev);
+
+	return afu->adapter->dev.parent;
+}
+EXPORT_SYMBOL_GPL(cxl_get_phys_dev);
+
+int cxl_release_context(struct cxl_context *ctx)
+{
+	if (ctx->status != CLOSED)
+		return -EBUSY;
+
+	cxl_context_free(ctx);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cxl_release_context);
+
+int cxl_allocate_afu_irqs(struct cxl_context *ctx, int num)
+{
+	if (num == 0)
+		num = ctx->afu->pp_irqs;
+	return afu_allocate_irqs(ctx, num);
+}
+EXPORT_SYMBOL_GPL(cxl_allocate_afu_irqs);
+
+void cxl_free_afu_irqs(struct cxl_context *ctx)
+{
+	cxl_release_irq_ranges(&ctx->irqs, ctx->afu->adapter);
+}
+EXPORT_SYMBOL_GPL(cxl_free_afu_irqs);
+
+static irq_hw_number_t cxl_find_afu_irq(struct cxl_context *ctx, int num)
+{
+	__u16 range;
+	int r;
+
+	WARN_ON(num == 0);
+
+	for (r = 0; r < CXL_IRQ_RANGES; r++) {
+		range = ctx->irqs.range[r];
+		if (num < range) {
+			return ctx->irqs.offset[r] + num;
+		}
+		num -= range;
+	}
+	return 0;
+}
+
+int cxl_map_afu_irq(struct cxl_context *ctx, int num,
+		    irq_handler_t handler, void *cookie, char *name)
+{
+	irq_hw_number_t hwirq;
+
+	/*
+	 * Find interrupt we are to register.
+	 */
+	hwirq = cxl_find_afu_irq(ctx, num);
+	if (!hwirq)
+		return -ENOENT;
+
+	return cxl_map_irq(ctx->afu->adapter, hwirq, handler, cookie, name);
+}
+EXPORT_SYMBOL_GPL(cxl_map_afu_irq);
+
+void cxl_unmap_afu_irq(struct cxl_context *ctx, int num, void *cookie)
+{
+	irq_hw_number_t hwirq;
+	unsigned int virq;
+
+	hwirq = cxl_find_afu_irq(ctx, num);
+	if (!hwirq)
+		return;
+
+	virq = irq_find_mapping(NULL, hwirq);
+	if (virq)
+		cxl_unmap_irq(virq, cookie);
+}
+EXPORT_SYMBOL_GPL(cxl_unmap_afu_irq);
+
+/*
+ * Start a context
+ * Code here similar to afu_ioctl_start_work().
+ */
+int cxl_start_context(struct cxl_context *ctx, u64 wed,
+		      struct task_struct *task)
+{
+	int rc = 0;
+	bool kernel = true;
+
+	pr_devel("%s: pe: %i\n", __func__, ctx->pe);
+
+	mutex_lock(&ctx->status_mutex);
+	if (ctx->status == STARTED)
+		goto out; /* already started */
+
+	if (task) {
+		ctx->pid = get_task_pid(task, PIDTYPE_PID);
+		get_pid(ctx->pid);
+		kernel = false;
+	}
+
+	cxl_ctx_get();
+
+	if ((rc = cxl_attach_process(ctx, kernel, wed , 0))) {
+		put_pid(ctx->pid);
+		cxl_ctx_put();
+		goto out;
+	}
+
+	ctx->status = STARTED;
+	get_device(&ctx->afu->dev);
+out:
+	mutex_unlock(&ctx->status_mutex);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(cxl_start_context);
+
+int cxl_process_element(struct cxl_context *ctx)
+{
+	return ctx->pe;
+}
+EXPORT_SYMBOL_GPL(cxl_process_element);
+
+/* Stop a context.  Returns 0 on success, otherwise -Errno */
+int cxl_stop_context(struct cxl_context *ctx)
+{
+	int rc;
+
+	rc = __detach_context(ctx);
+	if (!rc)
+		put_device(&ctx->afu->dev);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(cxl_stop_context);
+
+void cxl_set_master(struct cxl_context *ctx)
+{
+	ctx->master = true;
+	cxl_assign_psn_space(ctx);
+}
+EXPORT_SYMBOL_GPL(cxl_set_master);
+
+/* wrappers around afu_* file ops which are EXPORTED */
+int cxl_fd_open(struct inode *inode, struct file *file)
+{
+	return afu_open(inode, file);
+}
+EXPORT_SYMBOL_GPL(cxl_fd_open);
+int cxl_fd_release(struct inode *inode, struct file *file)
+{
+	return afu_release(inode, file);
+}
+EXPORT_SYMBOL_GPL(cxl_fd_release);
+long cxl_fd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	return afu_ioctl(file, cmd, arg);
+}
+EXPORT_SYMBOL_GPL(cxl_fd_ioctl);
+int cxl_fd_mmap(struct file *file, struct vm_area_struct *vm)
+{
+	return afu_mmap(file, vm);
+}
+EXPORT_SYMBOL_GPL(cxl_fd_mmap);
+unsigned int cxl_fd_poll(struct file *file, struct poll_table_struct *poll)
+{
+	return afu_poll(file, poll);
+}
+EXPORT_SYMBOL_GPL(cxl_fd_poll);
+ssize_t cxl_fd_read(struct file *file, char __user *buf, size_t count,
+			loff_t *off)
+{
+	return afu_read(file, buf, count, off);
+}
+EXPORT_SYMBOL_GPL(cxl_fd_read);
+
+#define PATCH_FOPS(NAME) if (!fops->NAME) fops->NAME = afu_fops.NAME
+
+/* Get a struct file and fd for a context and attach the ops */
+struct file *cxl_get_fd(struct cxl_context *ctx, struct file_operations *fops,
+			int *fd)
+{
+	struct file *file;
+	int rc, flags, fdtmp;
+
+	flags = O_RDWR | O_CLOEXEC;
+
+	/* This code is similar to anon_inode_getfd() */
+	rc = get_unused_fd_flags(flags);
+	if (rc < 0)
+		return ERR_PTR(rc);
+	fdtmp = rc;
+
+	/*
+	 * Patch the file ops.  Needs to be careful that this is rentrant safe.
+	 */
+	if (fops) {
+		PATCH_FOPS(open);
+		PATCH_FOPS(poll);
+		PATCH_FOPS(read);
+		PATCH_FOPS(release);
+		PATCH_FOPS(unlocked_ioctl);
+		PATCH_FOPS(compat_ioctl);
+		PATCH_FOPS(mmap);
+	} else /* use default ops */
+		fops = (struct file_operations *)&afu_fops;
+
+	file = anon_inode_getfile("cxl", fops, ctx, flags);
+	if (IS_ERR(file))
+		put_unused_fd(fdtmp);
+	*fd = fdtmp;
+	return file;
+}
+EXPORT_SYMBOL_GPL(cxl_get_fd);
+
+struct cxl_context *cxl_fops_get_context(struct file *file)
+{
+	return file->private_data;
+}
+EXPORT_SYMBOL_GPL(cxl_fops_get_context);
+
+int cxl_start_work(struct cxl_context *ctx,
+		   struct cxl_ioctl_start_work *work)
+{
+	int rc;
+
+	/* code taken from afu_ioctl_start_work */
+	if (!(work->flags & CXL_START_WORK_NUM_IRQS))
+		work->num_interrupts = ctx->afu->pp_irqs;
+	else if ((work->num_interrupts < ctx->afu->pp_irqs) ||
+		 (work->num_interrupts > ctx->afu->irqs_max)) {
+		return -EINVAL;
+	}
+
+	rc = afu_register_irqs(ctx, work->num_interrupts);
+	if (rc)
+		return rc;
+
+	rc = cxl_start_context(ctx, work->work_element_descriptor, current);
+	if (rc < 0) {
+		afu_release_irqs(ctx, ctx);
+		return rc;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cxl_start_work);
+
+void __iomem *cxl_psa_map(struct cxl_context *ctx)
+{
+	struct cxl_afu *afu = ctx->afu;
+	int rc;
+
+	rc = cxl_afu_check_and_enable(afu);
+	if (rc)
+		return NULL;
+
+	pr_devel("%s: psn_phys%llx size:%llx\n",
+		 __func__, afu->psn_phys, afu->adapter->ps_size);
+	return ioremap(ctx->psn_phys, ctx->psn_size);
+}
+EXPORT_SYMBOL_GPL(cxl_psa_map);
+
+void cxl_psa_unmap(void __iomem *addr)
+{
+	iounmap(addr);
+}
+EXPORT_SYMBOL_GPL(cxl_psa_unmap);
+
+int cxl_afu_reset(struct cxl_context *ctx)
+{
+	struct cxl_afu *afu = ctx->afu;
+	int rc;
+
+	rc = __cxl_afu_reset(afu);
+	if (rc)
+		return rc;
+
+	return cxl_afu_check_and_enable(afu);
+}
+EXPORT_SYMBOL_GPL(cxl_afu_reset);
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 2ec8e96..4fd66ca 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -378,6 +378,9 @@ struct cxl_afu {
 	int spa_max_procs;
 	unsigned int psl_virq;
 
+	/* pointer to the vphb */
+	struct pci_controller *phb;
+
 	int pp_irqs;
 	int irqs_max;
 	int num_procs;
@@ -671,6 +674,8 @@ int cxl_afu_check_and_enable(struct cxl_afu *afu);
 int cxl_psl_purge(struct cxl_afu *afu);
 
 void cxl_stop_trace(struct cxl *cxl);
+int cxl_pci_vphb_add(struct cxl_afu *afu);
+void cxl_pci_vphb_remove(struct cxl_afu *afu);
 
 extern struct pci_driver cxl_pci_driver;
 int afu_allocate_irqs(struct cxl_context *ctx, u32 count);
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 88de2ce..416905e 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -801,6 +801,9 @@ static int cxl_init_afu(struct cxl *adapter, int slice, struct pci_dev *dev)
 
 	adapter->afu[afu->slice] = afu;
 
+	if ((rc = cxl_pci_vphb_add(afu)))
+		dev_info(&afu->dev, "Can't register vPHB\n");
+
 	return 0;
 
 err_put2:
@@ -853,8 +856,10 @@ int cxl_reset(struct cxl *adapter)
 
 	dev_info(&dev->dev, "CXL reset\n");
 
-	for (i = 0; i < adapter->slices; i++)
+	for (i = 0; i < adapter->slices; i++) {
+		cxl_pci_vphb_remove(adapter->afu[i]);
 		cxl_remove_afu(adapter->afu[i]);
+	}
 
 	/* pcie_warm_reset requests a fundamental pci reset which includes a
 	 * PERST assert/deassert.  PERST triggers a loading of the image
@@ -1163,14 +1168,18 @@ static int cxl_probe(struct pci_dev *dev, const struct pci_device_id *id)
 static void cxl_remove(struct pci_dev *dev)
 {
 	struct cxl *adapter = pci_get_drvdata(dev);
-	int afu;
+	struct cxl_afu *afu;
+	int i;
 
 	/*
 	 * Lock to prevent someone grabbing a ref through the adapter list as
 	 * we are removing it
 	 */
-	for (afu = 0; afu < adapter->slices; afu++)
-		cxl_remove_afu(adapter->afu[afu]);
+	for (i = 0; i < adapter->slices; i++) {
+		afu = adapter->afu[i];
+		cxl_pci_vphb_remove(afu);
+		cxl_remove_afu(afu);
+	}
 	cxl_remove_adapter(adapter);
 }
 
diff --git a/drivers/misc/cxl/vphb.c b/drivers/misc/cxl/vphb.c
new file mode 100644
index 0000000..f733309
--- /dev/null
+++ b/drivers/misc/cxl/vphb.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright 2014 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/pci.h>
+#include <misc/cxl.h>
+#include "cxl.h"
+
+static int cxl_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
+{
+	if (dma_mask < DMA_BIT_MASK(64)) {
+		pr_info("%s only 64bit DMA supported on CXL", __func__);
+		return -EIO;
+	}
+
+	*(pdev->dev.dma_mask) = dma_mask;
+	return 0;
+}
+
+static int cxl_pci_probe_mode(struct pci_bus *bus)
+{
+	return PCI_PROBE_NORMAL;
+}
+
+static int cxl_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
+{
+	return -ENODEV;
+}
+
+static void cxl_teardown_msi_irqs(struct pci_dev *pdev)
+{
+	/*
+	 * MSI should never be set but need still need to provide this call
+	 * back.
+	 */
+}
+
+static bool cxl_pci_enable_device_hook(struct pci_dev *dev)
+{
+	struct pci_controller *phb;
+	struct cxl_afu *afu;
+	struct cxl_context *ctx;
+
+	phb = pci_bus_to_host(dev->bus);
+	afu = (struct cxl_afu *)phb->private_data;
+	set_dma_ops(&dev->dev, &dma_direct_ops);
+	set_dma_offset(&dev->dev, PAGE_OFFSET);
+
+	/*
+	 * Allocate a context to do cxl things too.  If we eventually do real
+	 * DMA ops, we'll need a default context to attach them to
+	 */
+	ctx = cxl_dev_context_init(dev);
+	if (!ctx)
+		return false;
+	dev->dev.archdata.cxl_ctx = ctx;
+
+	return (cxl_afu_check_and_enable(afu) == 0);
+}
+
+static void cxl_pci_disable_device(struct pci_dev *dev)
+{
+	struct cxl_context *ctx = cxl_get_context(dev);
+
+	if (ctx) {
+		if (ctx->status == STARTED) {
+			dev_err(&dev->dev, "Default context started\n");
+			return;
+		}
+		cxl_release_context(ctx);
+	}
+}
+
+static resource_size_t cxl_pci_window_alignment(struct pci_bus *bus,
+						unsigned long type)
+{
+	return 1;
+}
+
+static void cxl_pci_reset_secondary_bus(struct pci_dev *dev)
+{
+	/* Should we do an AFU reset here ? */
+}
+
+static int cxl_pcie_cfg_record(u8 bus, u8 devfn)
+{
+	return (bus << 8) + devfn;
+}
+
+static unsigned long cxl_pcie_cfg_addr(struct pci_controller* phb,
+				       u8 bus, u8 devfn, int offset)
+{
+	int record = cxl_pcie_cfg_record(bus, devfn);
+
+	return (unsigned long)phb->cfg_addr + ((unsigned long)phb->cfg_data * record) + offset;
+}
+
+
+static int cxl_pcie_config_info(struct pci_bus *bus, unsigned int devfn,
+				int offset, int len,
+				volatile void __iomem **ioaddr,
+				u32 *mask, int *shift)
+{
+	struct pci_controller *phb;
+	struct cxl_afu *afu;
+	unsigned long addr;
+
+	phb = pci_bus_to_host(bus);
+	afu = (struct cxl_afu *)phb->private_data;
+	if (phb == NULL)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+	if (cxl_pcie_cfg_record(bus->number, devfn) > afu->crs_num)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+	if (offset >= (unsigned long)phb->cfg_data)
+		return PCIBIOS_BAD_REGISTER_NUMBER;
+	addr = cxl_pcie_cfg_addr(phb, bus->number, devfn, offset);
+
+	*ioaddr = (void *)(addr & ~0x3ULL);
+	*shift = ((addr & 0x3) * 8);
+	switch (len) {
+	case 1:
+		*mask = 0xff;
+		break;
+	case 2:
+		*mask = 0xffff;
+		break;
+	default:
+		*mask = 0xffffffff;
+		break;
+	}
+	return 0;
+}
+
+static int cxl_pcie_read_config(struct pci_bus *bus, unsigned int devfn,
+				int offset, int len, u32 *val)
+{
+	volatile void __iomem *ioaddr;
+	int shift, rc;
+	u32 mask;
+
+	rc = cxl_pcie_config_info(bus, devfn, offset, len, &ioaddr,
+				  &mask, &shift);
+	if (rc)
+		return rc;
+
+	/* Can only read 32 bits */
+	*val = (in_le32(ioaddr) >> shift) & mask;
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static int cxl_pcie_write_config(struct pci_bus *bus, unsigned int devfn,
+				 int offset, int len, u32 val)
+{
+	volatile void __iomem *ioaddr;
+	u32 v, mask;
+	int shift, rc;
+
+	rc = cxl_pcie_config_info(bus, devfn, offset, len, &ioaddr,
+				  &mask, &shift);
+	if (rc)
+		return rc;
+
+	/* Can only write 32 bits so do read-modify-write */
+	mask <<= shift;
+	val <<= shift;
+
+	v = (in_le32(ioaddr) & ~mask) || (val & mask);
+
+	out_le32(ioaddr, v);
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static struct pci_ops cxl_pcie_pci_ops =
+{
+	.read = cxl_pcie_read_config,
+	.write = cxl_pcie_write_config,
+};
+
+
+static struct pci_controller_ops cxl_pci_controller_ops =
+{
+	.probe_mode = cxl_pci_probe_mode,
+	.enable_device_hook = cxl_pci_enable_device_hook,
+	.disable_device = cxl_pci_disable_device,
+	.release_device = cxl_pci_disable_device,
+	.window_alignment = cxl_pci_window_alignment,
+	.reset_secondary_bus = cxl_pci_reset_secondary_bus,
+	.setup_msi_irqs = cxl_setup_msi_irqs,
+	.teardown_msi_irqs = cxl_teardown_msi_irqs,
+	.dma_set_mask = cxl_dma_set_mask,
+};
+
+int cxl_pci_vphb_add(struct cxl_afu *afu)
+{
+	struct pci_dev *phys_dev;
+	struct pci_controller *phb, *phys_phb;
+
+	phys_dev = to_pci_dev(afu->adapter->dev.parent);
+	phys_phb = pci_bus_to_host(phys_dev->bus);
+
+	/* Alloc and setup PHB data structure */
+	phb = pcibios_alloc_controller(phys_phb->dn);
+
+	if (!phb)
+		return -ENODEV;
+
+	/* Setup parent in sysfs */
+	phb->parent = &phys_dev->dev;
+
+	/* Setup the PHB using arch provided callback */
+	phb->ops = &cxl_pcie_pci_ops;
+	phb->cfg_addr = afu->afu_desc_mmio + afu->crs_offset;
+	phb->cfg_data = (void *)(u64)afu->crs_len;
+	phb->private_data = afu;
+	phb->controller_ops = cxl_pci_controller_ops;
+
+	/* Scan the bus */
+	pcibios_scan_phb(phb);
+	if (phb->bus == NULL)
+		return -ENXIO;
+
+	/* Claim resources. This might need some rework as well depending
+	 * whether we are doing probe-only or not, like assigning unassigned
+	 * resources etc...
+	 */
+	pcibios_claim_one_bus(phb->bus);
+
+	/* Add probed PCI devices to the device model */
+	pci_bus_add_devices(phb->bus);
+
+	afu->phb = phb;
+
+	return 0;
+}
+
+
+void cxl_pci_vphb_remove(struct cxl_afu *afu)
+{
+	struct pci_controller *phb;
+
+	/* If there is no configuration record we won't have one of these */
+	if (!afu || !afu->phb)
+		return;
+
+	phb = afu->phb;
+
+	pci_remove_root_bus(phb->bus);
+}
+
+struct cxl_afu *cxl_pci_to_afu(struct pci_dev *dev)
+{
+	struct pci_controller *phb;
+
+	phb = pci_bus_to_host(dev->bus);
+
+	return (struct cxl_afu *)phb->private_data;
+}
+EXPORT_SYMBOL_GPL(cxl_pci_to_afu);
+
+unsigned int cxl_pci_to_cfg_record(struct pci_dev *dev)
+{
+	return cxl_pcie_cfg_record(dev->bus->number, dev->devfn);
+}
+EXPORT_SYMBOL_GPL(cxl_pci_to_cfg_record);
diff --git a/include/misc/cxl.h b/include/misc/cxl.h
new file mode 100644
index 0000000..7a6c1d6
--- /dev/null
+++ b/include/misc/cxl.h
@@ -0,0 +1,203 @@
+/*
+ * Copyright 2015 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _MISC_CXL_H
+#define _MISC_CXL_H
+
+#include <linux/pci.h>
+#include <linux/poll.h>
+#include <linux/interrupt.h>
+#include <uapi/misc/cxl.h>
+
+/*
+ * This documents the in kernel API for driver to use CXL. It allows kernel
+ * drivers to bind to AFUs using an AFU configuration record exposed as a PCI
+ * configuration record.
+ *
+ * This API enables control over AFU and contexts which can't be part of the
+ * generic PCI API. This API is agnostic to the actual AFU.
+ */
+
+/* Get the AFU associated with a pci_dev */
+struct cxl_afu *cxl_pci_to_afu(struct pci_dev *dev);
+
+/* Get the AFU conf record number associated with a pci_dev */
+unsigned int cxl_pci_to_cfg_record(struct pci_dev *dev);
+
+/* Get the physical device (ie. the PCIe card) which the AFU is attached */
+struct device *cxl_get_phys_dev(struct pci_dev *dev);
+
+
+/*
+ * Context lifetime overview:
+ *
+ * An AFU context may be inited and then started and stoppped multiple times
+ * before it's released. ie.
+ *    - cxl_dev_context_init()
+ *      - cxl_start_context()
+ *      - cxl_stop_context()
+ *      - cxl_start_context()
+ *      - cxl_stop_context()
+ *     ...repeat...
+ *    - cxl_release_context()
+ * Once released, a context can't be started again.
+ *
+ * One context is inited by the cxl driver for every pci_dev. This is to be
+ * used as a default kernel context. cxl_get_context() will get this
+ * context. This context will be released by PCI hot unplug, so doesn't need to
+ * be released explicitly by drivers.
+ *
+ * Additional kernel contexts may be inited using cxl_dev_context_init().
+ * These must be released using cxl_context_detach().
+ *
+ * Once a context has been inited, IRQs may be configured. Firstly these IRQs
+ * must be allocated (cxl_allocate_afu_irqs()), then individually mapped to
+ * specific handlers (cxl_map_afu_irq()).
+ *
+ * These IRQs can be unmapped (cxl_unmap_afu_irq()) and finally released
+ * (cxl_free_afu_irqs()).
+ *
+ * The AFU can be reset (cxl_afu_reset()). This will cause the PSL/AFU
+ * hardware to lose track of all contexts. It's upto the caller of
+ * cxl_afu_reset() to restart these contexts.
+ */
+
+/*
+ * On pci_enabled_device(), the cxl driver will init a single cxl context for
+ * use by the driver. It doesn't start this context (as that will likely
+ * generate DMA traffic for most AFUs).
+ *
+ * This gets the default context associated with this pci_dev.  This context
+ * doesn't need to be released as this will be done by the PCI subsystem on hot
+ * unplug.
+ */
+struct cxl_context *cxl_get_context(struct pci_dev *dev);
+/*
+ * Allocate and initalise a context associated with a AFU PCI device. This
+ * doesn't start the context in the AFU.
+ */
+struct cxl_context *cxl_dev_context_init(struct pci_dev *dev);
+/*
+ * Release and free a context. Context should be stopped before calling.
+ */
+int cxl_release_context(struct cxl_context *ctx);
+
+/*
+ * Allocate AFU interrupts for this context. num=0 will allocate the default
+ * for this AFU as given in the AFU descriptor. This number doesn't include the
+ * interrupt 0 (CAIA defines AFU IRQ 0 for page faults). Each interrupt to be
+ * used must map a handler with cxl_map_afu_irq.
+ */
+int cxl_allocate_afu_irqs(struct cxl_context *cxl, int num);
+/* Free allocated interrupts */
+void cxl_free_afu_irqs(struct cxl_context *cxl);
+
+/*
+ * Map a handler for an AFU interrupt associated with a particular context. AFU
+ * IRQS numbers start from 1 (CAIA defines AFU IRQ 0 for page faults). cookie
+ * is private data is that will be provided to the interrupt handler.
+ */
+int cxl_map_afu_irq(struct cxl_context *cxl, int num,
+		    irq_handler_t handler, void *cookie, char *name);
+/* unmap mapped IRQ handlers */
+void cxl_unmap_afu_irq(struct cxl_context *cxl, int num, void *cookie);
+
+/*
+ * Start work on the AFU. This starts an cxl context and associates it with a
+ * task. task == NULL will make it a kernel context.
+ */
+int cxl_start_context(struct cxl_context *ctx, u64 wed,
+		      struct task_struct *task);
+/*
+ * Stop a context and remove it from the PSL
+ */
+int cxl_stop_context(struct cxl_context *ctx);
+
+/* Reset the AFU */
+int cxl_afu_reset(struct cxl_context *ctx);
+
+/*
+ * Set a context as a master context.
+ * This sets the default problem space area mapped as the full space, rather
+ * than just the per context area (for slaves).
+ */
+void cxl_set_master(struct cxl_context *ctx);
+
+/*
+ * Map and unmap the AFU Problem Space area. The amount and location mapped
+ * depends on if this context is a master or slave.
+ */
+void __iomem *cxl_psa_map(struct cxl_context *ctx);
+void cxl_psa_unmap(void __iomem *addr);
+
+/*  Get the process element for this context */
+int cxl_process_element(struct cxl_context *ctx);
+
+
+/*
+ * These calls allow drivers to create their own file descriptors and make them
+ * identical to the cxl file descriptor user API. An example use case:
+ *
+ * struct file_operations cxl_my_fops = {};
+ * ......
+ *	// Init the context
+ *	ctx = cxl_dev_context_init(dev);
+ *	if (IS_ERR(ctx))
+ *		return PTR_ERR(ctx);
+ *	// Create and attach a new file descriptor to my file ops
+ *	file = cxl_get_fd(ctx, &cxl_my_fops, &fd);
+ *	// Start context
+ *	rc = cxl_start_work(ctx, &work.work);
+ *	if (rc) {
+ *		fput(file);
+ *		put_unused_fd(fd);
+ *		return -ENODEV;
+ *	}
+ *	// No error paths after installing the fd
+ *	fd_install(fd, file);
+ *	return fd;
+ *
+ * This inits a context, and gets a file descriptor and associates some file
+ * ops to that file descriptor. If the file ops are blank, the cxl driver will
+ * fill them in with the default ones that mimic the standard user API.  Once
+ * completed, the file descriptor can be installed. Once the file descriptor is
+ * installed, it's visible to the user so no errors must occur past this point.
+ *
+ * If cxl_fd_release() file op call is installed, the context will be stopped
+ * and released when the fd is released. Hence the driver won't need to manage
+ * this itself.
+ */
+
+/*
+ * Take a context and associate it with my file ops. Returns the associated
+ * file and file descriptor. Any file ops which are blank are filled in by the
+ * cxl driver with the default ops to mimic the standard API.
+ */
+struct file *cxl_get_fd(struct cxl_context *ctx, struct file_operations *fops,
+			int *fd);
+/* Get the context associated with this file */
+struct cxl_context *cxl_fops_get_context(struct file *file);
+/*
+ * Start a context associated a struct cxl_ioctl_start_work used by the
+ * standard cxl user API.
+ */
+int cxl_start_work(struct cxl_context *ctx,
+		   struct cxl_ioctl_start_work *work);
+/*
+ * Export all the existing fops so drivers can use them
+ */
+int cxl_fd_open(struct inode *inode, struct file *file);
+int cxl_fd_release(struct inode *inode, struct file *file);
+long cxl_fd_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int cxl_fd_mmap(struct file *file, struct vm_area_struct *vm);
+unsigned int cxl_fd_poll(struct file *file, struct poll_table_struct *poll);
+ssize_t cxl_fd_read(struct file *file, char __user *buf, size_t count,
+			   loff_t *off);
+
+#endif /* _MISC_CXL_H */
-- 
cgit v0.10.2


From 14aae78f084af5241476ede8e28b377fc751b191 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= <clg@fr.ibm.com>
Date: Wed, 8 Apr 2015 09:31:10 +0200
Subject: powerpc/powernv: convert OPAL codes returned by sysparam calls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The opal_{get,set}_param calls return internal error codes which need
to be translated in errnos in Linux.

Signed-off-by: Cédric Le Goater <clg@fr.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/opal-sysparam.c b/arch/powerpc/platforms/powernv/opal-sysparam.c
index 2e52b47..afe66c5 100644
--- a/arch/powerpc/platforms/powernv/opal-sysparam.c
+++ b/arch/powerpc/platforms/powernv/opal-sysparam.c
@@ -55,8 +55,10 @@ static ssize_t opal_get_sys_param(u32 param_id, u32 length, void *buffer)
 	}
 
 	ret = opal_get_param(token, param_id, (u64)buffer, length);
-	if (ret != OPAL_ASYNC_COMPLETION)
+	if (ret != OPAL_ASYNC_COMPLETION) {
+		ret = opal_error_code(ret);
 		goto out_token;
+	}
 
 	ret = opal_async_wait_response(token, &msg);
 	if (ret) {
@@ -65,7 +67,7 @@ static ssize_t opal_get_sys_param(u32 param_id, u32 length, void *buffer)
 		goto out_token;
 	}
 
-	ret = be64_to_cpu(msg.params[1]);
+	ret = opal_error_code(be64_to_cpu(msg.params[1]));
 
 out_token:
 	opal_async_release_token(token);
@@ -89,8 +91,10 @@ static int opal_set_sys_param(u32 param_id, u32 length, void *buffer)
 
 	ret = opal_set_param(token, param_id, (u64)buffer, length);
 
-	if (ret != OPAL_ASYNC_COMPLETION)
+	if (ret != OPAL_ASYNC_COMPLETION) {
+		ret = opal_error_code(ret);
 		goto out_token;
+	}
 
 	ret = opal_async_wait_response(token, &msg);
 	if (ret) {
@@ -99,7 +103,7 @@ static int opal_set_sys_param(u32 param_id, u32 length, void *buffer)
 		goto out_token;
 	}
 
-	ret = be64_to_cpu(msg.params[1]);
+	ret = opal_error_code(be64_to_cpu(msg.params[1]));
 
 out_token:
 	opal_async_release_token(token);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 8403307..fd694bd 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -830,6 +830,7 @@ int opal_error_code(int rc)
 	case OPAL_ASYNC_COMPLETION:	return -EINPROGRESS;
 	case OPAL_BUSY_EVENT:		return -EBUSY;
 	case OPAL_NO_MEM:		return -ENOMEM;
+	case OPAL_PERMISSION:		return -EPERM;
 
 	case OPAL_UNSUPPORTED:		return -EIO;
 	case OPAL_HARDWARE:		return -EIO;
-- 
cgit v0.10.2


From 18725226af7a6a0eac8a5bc42a6c9c90af1fb0e7 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Wed, 13 May 2015 15:13:18 +1000
Subject: powerpc/config: Enable bnx2x on ppc64 and pseries defconfigs

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index aad501a..a97efc2 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -155,6 +155,7 @@ CONFIG_ACENIC=m
 CONFIG_ACENIC_OMIT_TIGON_I=y
 CONFIG_PCNET32=y
 CONFIG_TIGON3=y
+CONFIG_BNX2X=m
 CONFIG_CHELSIO_T1=m
 CONFIG_BE2NET=m
 CONFIG_S2IO=m
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index 4da8260..0d9efce 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -154,6 +154,7 @@ CONFIG_ACENIC=m
 CONFIG_ACENIC_OMIT_TIGON_I=y
 CONFIG_PCNET32=y
 CONFIG_TIGON3=y
+CONFIG_BNX2X=m
 CONFIG_CHELSIO_T1=m
 CONFIG_BE2NET=m
 CONFIG_S2IO=m
-- 
cgit v0.10.2


From 48c06154952a18c5cd4d074ec3de627430cb6d23 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@ozlabs.org>
Date: Wed, 20 May 2015 11:23:33 +0800
Subject: powerpc/powernv: Merge common platform device initialisation

opal_ipmi_init and opal_flash_init are equivalent, except for the
compatbile string. Merge these two into a common opal_pdev_init
function.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index fd694bd..fdce840 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -600,21 +600,13 @@ static void __init opal_dump_region_init(void)
 			"rc = %d\n", rc);
 }
 
-static void opal_flash_init(struct device_node *opal_node)
+static void opal_pdev_init(struct device_node *opal_node,
+		const char *compatible)
 {
 	struct device_node *np;
 
 	for_each_child_of_node(opal_node, np)
-		if (of_device_is_compatible(np, "ibm,opal-flash"))
-			of_platform_device_create(np, NULL, NULL);
-}
-
-static void opal_ipmi_init(struct device_node *opal_node)
-{
-	struct device_node *np;
-
-	for_each_child_of_node(opal_node, np)
-		if (of_device_is_compatible(np, "ibm,opal-ipmi"))
+		if (of_device_is_compatible(np, compatible))
 			of_platform_device_create(np, NULL, NULL);
 }
 
@@ -717,10 +709,9 @@ static int __init opal_init(void)
 		opal_msglog_init();
 	}
 
-	/* Initialize OPAL IPMI backend */
-	opal_ipmi_init(opal_node);
-
-	opal_flash_init(opal_node);
+	/* Initialize platform devices: IPMI backend & flash interface */
+	opal_pdev_init(opal_node, "ibm,opal-ipmi");
+	opal_pdev_init(opal_node, "ibm,opal-flash");
 
 	return 0;
 }
-- 
cgit v0.10.2


From 594fcb9ec9e17215b4f98b5d40ba5e3af618ba82 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@ozlabs.org>
Date: Wed, 20 May 2015 11:23:33 +0800
Subject: powerpc/powernv: Expose OPAL APIs required by PRD interface

The (upcoming) opal-prd driver needs to access the message notifier and
xscom code, so add EXPORT_SYMBOL_GPL macros for these.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index fdce840..0379068 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -235,6 +235,7 @@ int opal_message_notifier_register(enum opal_msg_type msg_type,
 	return atomic_notifier_chain_register(
 				&opal_msg_notifier_head[msg_type], nb);
 }
+EXPORT_SYMBOL_GPL(opal_message_notifier_register);
 
 int opal_message_notifier_unregister(enum opal_msg_type msg_type,
 				     struct notifier_block *nb)
@@ -242,6 +243,7 @@ int opal_message_notifier_unregister(enum opal_msg_type msg_type,
 	return atomic_notifier_chain_unregister(
 			&opal_msg_notifier_head[msg_type], nb);
 }
+EXPORT_SYMBOL_GPL(opal_message_notifier_unregister);
 
 static void opal_message_do_notify(uint32_t msg_type, void *msg)
 {
@@ -743,6 +745,8 @@ void opal_shutdown(void)
 
 /* Export this so that test modules can use it */
 EXPORT_SYMBOL_GPL(opal_invalid_call);
+EXPORT_SYMBOL_GPL(opal_xscom_read);
+EXPORT_SYMBOL_GPL(opal_xscom_write);
 EXPORT_SYMBOL_GPL(opal_ipmi_send);
 EXPORT_SYMBOL_GPL(opal_ipmi_recv);
 EXPORT_SYMBOL_GPL(opal_flash_read);
-- 
cgit v0.10.2


From 0d7cd8550d30906c7461ced654306da30f1590e2 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@ozlabs.org>
Date: Thu, 4 Jun 2015 21:51:47 +0800
Subject: powerpc/powernv: Add opal-prd channel

This change adds a char device to access the "PRD" (processor runtime
diagnostics) channel to OPAL firmware.

Includes contributions from Vaidyanathan Srinivasan, Neelesh Gupta &
Vishal Kulkarni.

Signed-off-by: Neelesh Gupta <neelegup@linux.vnet.ibm.com>
Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Acked-by: Stewart Smith <stewart@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index a49e5fa..e9e4c52 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -153,7 +153,8 @@
 #define OPAL_FLASH_READ				110
 #define OPAL_FLASH_WRITE			111
 #define OPAL_FLASH_ERASE			112
-#define OPAL_LAST				112
+#define OPAL_PRD_MSG				113
+#define OPAL_LAST				113
 
 /* Device tree flags */
 
@@ -359,6 +360,7 @@ enum opal_msg_type {
 	OPAL_MSG_SHUTDOWN,		/* params[0] = 1 reboot, 0 shutdown */
 	OPAL_MSG_HMI_EVT,
 	OPAL_MSG_DPO,
+	OPAL_MSG_PRD,
 	OPAL_MSG_TYPE_MAX,
 };
 
@@ -681,6 +683,23 @@ typedef struct oppanel_line {
 	__be64 line_len;
 } oppanel_line_t;
 
+enum opal_prd_msg_type {
+	OPAL_PRD_MSG_TYPE_INIT = 0,	/* HBRT --> OPAL */
+	OPAL_PRD_MSG_TYPE_FINI,		/* HBRT/kernel --> OPAL */
+	OPAL_PRD_MSG_TYPE_ATTN,		/* HBRT <-- OPAL */
+	OPAL_PRD_MSG_TYPE_ATTN_ACK,	/* HBRT --> OPAL */
+	OPAL_PRD_MSG_TYPE_OCC_ERROR,	/* HBRT <-- OPAL */
+	OPAL_PRD_MSG_TYPE_OCC_RESET,	/* HBRT <-- OPAL */
+};
+
+struct opal_prd_msg_header {
+	uint8_t		type;
+	uint8_t		pad[1];
+	__be16		size;
+};
+
+struct opal_prd_msg;
+
 /*
  * SG entries
  *
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 1412814..958e941 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -194,6 +194,7 @@ int64_t opal_ipmi_recv(uint64_t interface, struct opal_ipmi_msg *msg,
 		uint64_t *msg_len);
 int64_t opal_i2c_request(uint64_t async_token, uint32_t bus_id,
 			 struct opal_i2c_request *oreq);
+int64_t opal_prd_msg(struct opal_prd_msg *msg);
 
 int64_t opal_flash_read(uint64_t id, uint64_t offset, uint64_t buf,
 		uint64_t size, uint64_t token);
diff --git a/arch/powerpc/include/uapi/asm/opal-prd.h b/arch/powerpc/include/uapi/asm/opal-prd.h
new file mode 100644
index 0000000..319ff4a
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/opal-prd.h
@@ -0,0 +1,58 @@
+/*
+ * OPAL Runtime Diagnostics interface driver
+ * Supported on POWERNV platform
+ *
+ * (C) Copyright IBM 2015
+ *
+ * Author: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>
+ * Author: Jeremy Kerr <jk@ozlabs.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _UAPI_ASM_POWERPC_OPAL_PRD_H_
+#define _UAPI_ASM_POWERPC_OPAL_PRD_H_
+
+#include <linux/types.h>
+
+/**
+ * The version of the kernel interface of the PRD system. This describes the
+ * interface available for the /dev/opal-prd device. The actual PRD message
+ * layout and content is private to the firmware <--> userspace interface, so
+ * is not covered by this versioning.
+ *
+ * Future interface versions are backwards-compatible; if a later kernel
+ * version is encountered, functionality provided in earlier versions
+ * will work.
+ */
+#define OPAL_PRD_KERNEL_VERSION		1
+
+#define OPAL_PRD_GET_INFO		_IOR('o', 0x01, struct opal_prd_info)
+#define OPAL_PRD_SCOM_READ		_IOR('o', 0x02, struct opal_prd_scom)
+#define OPAL_PRD_SCOM_WRITE		_IOW('o', 0x03, struct opal_prd_scom)
+
+#ifndef __ASSEMBLY__
+
+struct opal_prd_info {
+	__u64	version;
+	__u64	reserved[3];
+};
+
+struct opal_prd_scom {
+	__u64	chip;
+	__u64	addr;
+	__u64	data;
+	__s64	rc;
+};
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _UAPI_ASM_POWERPC_OPAL_PRD_H */
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 4b044d8..604190c 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -19,3 +19,10 @@ config PPC_POWERNV
 	select CPU_FREQ_GOV_CONSERVATIVE
 	select PPC_DOORBELL
 	default y
+
+config OPAL_PRD
+	tristate 'OPAL PRD driver'
+	depends on PPC_POWERNV
+	help
+	  This enables the opal-prd driver, a facility to run processor
+	  recovery diagnostics on OpenPower machines
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 26bd7e4..1c8cdb6 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_EEH)	+= eeh-powernv.o
 obj-$(CONFIG_PPC_SCOM)	+= opal-xscom.o
 obj-$(CONFIG_MEMORY_FAILURE)	+= opal-memory-errors.o
 obj-$(CONFIG_TRACEPOINTS)	+= opal-tracepoints.o
+obj-$(CONFIG_OPAL_PRD)	+= opal-prd.o
diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c
new file mode 100644
index 0000000..d9e72bd
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-prd.c
@@ -0,0 +1,444 @@
+/*
+ * OPAL Runtime Diagnostics interface driver
+ * Supported on POWERNV platform
+ *
+ * Copyright IBM Corporation 2015
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) "opal-prd: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/poll.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <asm/opal-prd.h>
+#include <asm/opal.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+
+
+struct opal_prd_msg_queue_item {
+	struct opal_prd_msg_header	msg;
+	struct list_head		list;
+};
+
+static struct device_node *prd_node;
+static LIST_HEAD(opal_prd_msg_queue);
+static DEFINE_SPINLOCK(opal_prd_msg_queue_lock);
+static DECLARE_WAIT_QUEUE_HEAD(opal_prd_msg_wait);
+static atomic_t prd_usage;
+
+static bool opal_prd_range_is_valid(uint64_t addr, uint64_t size)
+{
+	struct device_node *parent, *node;
+	bool found;
+
+	if (addr + size < addr)
+		return false;
+
+	parent = of_find_node_by_path("/reserved-memory");
+	if (!parent)
+		return false;
+
+	found = false;
+
+	for_each_child_of_node(parent, node) {
+		uint64_t range_addr, range_size, range_end;
+		const __be32 *addrp;
+		const char *label;
+
+		addrp = of_get_address(node, 0, &range_size, NULL);
+
+		range_addr = of_read_number(addrp, 2);
+		range_end = range_addr + range_size;
+
+		label = of_get_property(node, "ibm,prd-label", NULL);
+
+		/* PRD ranges need a label */
+		if (!label)
+			continue;
+
+		if (range_end <= range_addr)
+			continue;
+
+		if (addr >= range_addr && addr + size <= range_end) {
+			found = true;
+			of_node_put(node);
+			break;
+		}
+	}
+
+	of_node_put(parent);
+	return found;
+}
+
+static int opal_prd_open(struct inode *inode, struct file *file)
+{
+	/*
+	 * Prevent multiple (separate) processes from concurrent interactions
+	 * with the FW PRD channel
+	 */
+	if (atomic_xchg(&prd_usage, 1) == 1)
+		return -EBUSY;
+
+	return 0;
+}
+
+/*
+ * opal_prd_mmap - maps firmware-provided ranges into userspace
+ * @file: file structure for the device
+ * @vma: VMA to map the registers into
+ */
+
+static int opal_prd_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	size_t addr, size;
+	int rc;
+
+	pr_devel("opal_prd_mmap(0x%016lx, 0x%016lx, 0x%lx, 0x%lx)\n",
+			vma->vm_start, vma->vm_end, vma->vm_pgoff,
+			vma->vm_flags);
+
+	addr = vma->vm_pgoff << PAGE_SHIFT;
+	size = vma->vm_end - vma->vm_start;
+
+	/* ensure we're mapping within one of the allowable ranges */
+	if (!opal_prd_range_is_valid(addr, size))
+		return -EINVAL;
+
+	vma->vm_page_prot = __pgprot(pgprot_val(phys_mem_access_prot(file,
+						vma->vm_pgoff,
+						 size, vma->vm_page_prot))
+					| _PAGE_SPECIAL);
+
+	rc = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size,
+			vma->vm_page_prot);
+
+	return rc;
+}
+
+static bool opal_msg_queue_empty(void)
+{
+	unsigned long flags;
+	bool ret;
+
+	spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
+	ret = list_empty(&opal_prd_msg_queue);
+	spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags);
+
+	return ret;
+}
+
+static unsigned int opal_prd_poll(struct file *file,
+		struct poll_table_struct *wait)
+{
+	poll_wait(file, &opal_prd_msg_wait, wait);
+
+	if (!opal_msg_queue_empty())
+		return POLLIN | POLLRDNORM;
+
+	return 0;
+}
+
+static ssize_t opal_prd_read(struct file *file, char __user *buf,
+		size_t count, loff_t *ppos)
+{
+	struct opal_prd_msg_queue_item *item;
+	unsigned long flags;
+	ssize_t size, err;
+	int rc;
+
+	/* we need at least a header's worth of data */
+	if (count < sizeof(item->msg))
+		return -EINVAL;
+
+	if (*ppos)
+		return -ESPIPE;
+
+	item = NULL;
+
+	for (;;) {
+
+		spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
+		if (!list_empty(&opal_prd_msg_queue)) {
+			item = list_first_entry(&opal_prd_msg_queue,
+					struct opal_prd_msg_queue_item, list);
+			list_del(&item->list);
+		}
+		spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags);
+
+		if (item)
+			break;
+
+		if (file->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		rc = wait_event_interruptible(opal_prd_msg_wait,
+				!opal_msg_queue_empty());
+		if (rc)
+			return -EINTR;
+	}
+
+	size = be16_to_cpu(item->msg.size);
+	if (size > count) {
+		err = -EINVAL;
+		goto err_requeue;
+	}
+
+	rc = copy_to_user(buf, &item->msg, size);
+	if (rc) {
+		err = -EFAULT;
+		goto err_requeue;
+	}
+
+	kfree(item);
+
+	return size;
+
+err_requeue:
+	/* eep! re-queue at the head of the list */
+	spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
+	list_add(&item->list, &opal_prd_msg_queue);
+	spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags);
+	return err;
+}
+
+static ssize_t opal_prd_write(struct file *file, const char __user *buf,
+		size_t count, loff_t *ppos)
+{
+	struct opal_prd_msg_header hdr;
+	ssize_t size;
+	void *msg;
+	int rc;
+
+	size = sizeof(hdr);
+
+	if (count < size)
+		return -EINVAL;
+
+	/* grab the header */
+	rc = copy_from_user(&hdr, buf, sizeof(hdr));
+	if (rc)
+		return -EFAULT;
+
+	size = be16_to_cpu(hdr.size);
+
+	msg = kmalloc(size, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	rc = copy_from_user(msg, buf, size);
+	if (rc) {
+		size = -EFAULT;
+		goto out_free;
+	}
+
+	rc = opal_prd_msg(msg);
+	if (rc) {
+		pr_warn("write: opal_prd_msg returned %d\n", rc);
+		size = -EIO;
+	}
+
+out_free:
+	kfree(msg);
+
+	return size;
+}
+
+static int opal_prd_release(struct inode *inode, struct file *file)
+{
+	struct opal_prd_msg_header msg;
+
+	msg.size = cpu_to_be16(sizeof(msg));
+	msg.type = OPAL_PRD_MSG_TYPE_FINI;
+
+	opal_prd_msg((struct opal_prd_msg *)&msg);
+
+	atomic_xchg(&prd_usage, 0);
+
+	return 0;
+}
+
+static long opal_prd_ioctl(struct file *file, unsigned int cmd,
+		unsigned long param)
+{
+	struct opal_prd_info info;
+	struct opal_prd_scom scom;
+	int rc = 0;
+
+	switch (cmd) {
+	case OPAL_PRD_GET_INFO:
+		memset(&info, 0, sizeof(info));
+		info.version = OPAL_PRD_KERNEL_VERSION;
+		rc = copy_to_user((void __user *)param, &info, sizeof(info));
+		if (rc)
+			return -EFAULT;
+		break;
+
+	case OPAL_PRD_SCOM_READ:
+		rc = copy_from_user(&scom, (void __user *)param, sizeof(scom));
+		if (rc)
+			return -EFAULT;
+
+		scom.rc = opal_xscom_read(scom.chip, scom.addr,
+				(__be64 *)&scom.data);
+		scom.data = be64_to_cpu(scom.data);
+		pr_devel("ioctl SCOM_READ: chip %llx addr %016llx data %016llx rc %lld\n",
+				scom.chip, scom.addr, scom.data, scom.rc);
+
+		rc = copy_to_user((void __user *)param, &scom, sizeof(scom));
+		if (rc)
+			return -EFAULT;
+		break;
+
+	case OPAL_PRD_SCOM_WRITE:
+		rc = copy_from_user(&scom, (void __user *)param, sizeof(scom));
+		if (rc)
+			return -EFAULT;
+
+		scom.rc = opal_xscom_write(scom.chip, scom.addr, scom.data);
+		pr_devel("ioctl SCOM_WRITE: chip %llx addr %016llx data %016llx rc %lld\n",
+				scom.chip, scom.addr, scom.data, scom.rc);
+
+		rc = copy_to_user((void __user *)param, &scom, sizeof(scom));
+		if (rc)
+			return -EFAULT;
+		break;
+
+	default:
+		rc = -EINVAL;
+	}
+
+	return rc;
+}
+
+static const struct file_operations opal_prd_fops = {
+	.open		= opal_prd_open,
+	.mmap		= opal_prd_mmap,
+	.poll		= opal_prd_poll,
+	.read		= opal_prd_read,
+	.write		= opal_prd_write,
+	.unlocked_ioctl	= opal_prd_ioctl,
+	.release	= opal_prd_release,
+	.owner		= THIS_MODULE,
+};
+
+static struct miscdevice opal_prd_dev = {
+	.minor		= MISC_DYNAMIC_MINOR,
+	.name		= "opal-prd",
+	.fops		= &opal_prd_fops,
+};
+
+/* opal interface */
+static int opal_prd_msg_notifier(struct notifier_block *nb,
+		unsigned long msg_type, void *_msg)
+{
+	struct opal_prd_msg_queue_item *item;
+	struct opal_prd_msg_header *hdr;
+	struct opal_msg *msg = _msg;
+	unsigned long flags;
+	int size;
+
+	if (msg_type != OPAL_MSG_PRD)
+		return 0;
+
+	/* Calculate total size of the item we need to store. The 'size' field
+	 * in the header includes the header itself. */
+	hdr = (void *)msg->params;
+	size = (sizeof(*item) - sizeof(item->msg)) + be16_to_cpu(hdr->size);
+
+	item = kzalloc(size, GFP_ATOMIC);
+	if (!item)
+		return -ENOMEM;
+
+	memcpy(&item->msg, msg->params, size);
+
+	spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
+	list_add_tail(&item->list, &opal_prd_msg_queue);
+	spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags);
+
+	wake_up_interruptible(&opal_prd_msg_wait);
+
+	return 0;
+}
+
+static struct notifier_block opal_prd_event_nb = {
+	.notifier_call	= opal_prd_msg_notifier,
+	.next		= NULL,
+	.priority	= 0,
+};
+
+static int opal_prd_probe(struct platform_device *pdev)
+{
+	int rc;
+
+	if (!pdev || !pdev->dev.of_node)
+		return -ENODEV;
+
+	/* We should only have one prd driver instance per machine; ensure
+	 * that we only get a valid probe on a single OF node.
+	 */
+	if (prd_node)
+		return -EBUSY;
+
+	prd_node = pdev->dev.of_node;
+
+	rc = opal_message_notifier_register(OPAL_MSG_PRD, &opal_prd_event_nb);
+	if (rc) {
+		pr_err("Couldn't register event notifier\n");
+		return rc;
+	}
+
+	rc = misc_register(&opal_prd_dev);
+	if (rc) {
+		pr_err("failed to register miscdev\n");
+		opal_message_notifier_unregister(OPAL_MSG_PRD,
+				&opal_prd_event_nb);
+		return rc;
+	}
+
+	return 0;
+}
+
+static int opal_prd_remove(struct platform_device *pdev)
+{
+	misc_deregister(&opal_prd_dev);
+	opal_message_notifier_unregister(OPAL_MSG_PRD, &opal_prd_event_nb);
+	return 0;
+}
+
+static const struct of_device_id opal_prd_match[] = {
+	{ .compatible = "ibm,opal-prd" },
+	{ },
+};
+
+static struct platform_driver opal_prd_driver = {
+	.driver = {
+		.name		= "opal-prd",
+		.owner		= THIS_MODULE,
+		.of_match_table	= opal_prd_match,
+	},
+	.probe	= opal_prd_probe,
+	.remove	= opal_prd_remove,
+};
+
+module_platform_driver(opal_prd_driver);
+
+MODULE_DEVICE_TABLE(of, opal_prd_match);
+MODULE_DESCRIPTION("PowerNV OPAL runtime diagnostic driver");
+MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index bf15ead..d6a7b82 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -296,3 +296,4 @@ OPAL_CALL(opal_i2c_request,			OPAL_I2C_REQUEST);
 OPAL_CALL(opal_flash_read,			OPAL_FLASH_READ);
 OPAL_CALL(opal_flash_write,			OPAL_FLASH_WRITE);
 OPAL_CALL(opal_flash_erase,			OPAL_FLASH_ERASE);
+OPAL_CALL(opal_prd_msg,				OPAL_PRD_MSG);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 0379068..9e9c483 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -711,9 +711,10 @@ static int __init opal_init(void)
 		opal_msglog_init();
 	}
 
-	/* Initialize platform devices: IPMI backend & flash interface */
+	/* Initialize platform devices: IPMI backend, PRD & flash interface */
 	opal_pdev_init(opal_node, "ibm,opal-ipmi");
 	opal_pdev_init(opal_node, "ibm,opal-flash");
+	opal_pdev_init(opal_node, "ibm,opal-prd");
 
 	return 0;
 }
@@ -752,6 +753,7 @@ EXPORT_SYMBOL_GPL(opal_ipmi_recv);
 EXPORT_SYMBOL_GPL(opal_flash_read);
 EXPORT_SYMBOL_GPL(opal_flash_write);
 EXPORT_SYMBOL_GPL(opal_flash_erase);
+EXPORT_SYMBOL_GPL(opal_prd_msg);
 
 /* Convert a region of vmalloc memory to an opal sg list */
 struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr,
-- 
cgit v0.10.2


From 52aeeb3727af9b5ee4c85c42baff92501b5ceae1 Mon Sep 17 00:00:00 2001
From: Igal Liberman <igal.liberman@freescale.com>
Date: Mon, 1 Jun 2015 15:06:51 +0300
Subject: powerpc/fsl: Add FMan Port 10G compatibles

This patch adds two boolean properties to FMan Port.
FMan has 3 types of ports:
	- 1G ports
		By default, all ports support 1G rate
	- 10G Ports
		Port which use 10G hardware, and configured as 10G
	- 10G Best effort ports
		Ports which use 1G hardware, configured as 10G, in this case,
		the rate is not guaranteed.
The new properties help to distinguish the different type of ports.

Signed-off-by: Igal Liberman <igal.liberman@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/Documentation/devicetree/bindings/powerpc/fsl/fman.txt b/Documentation/devicetree/bindings/powerpc/fsl/fman.txt
index edda55f..1fc5328 100644
--- a/Documentation/devicetree/bindings/powerpc/fsl/fman.txt
+++ b/Documentation/devicetree/bindings/powerpc/fsl/fman.txt
@@ -189,6 +189,19 @@ PROPERTIES
 		Definition: There is one reg region describing the port
 		configuration registers.
 
+- fsl,fman-10g-port
+		Usage: optional
+		Value type: boolean
+		Definition: The default port rate is 1G.
+		If this property exists, the port is s 10G port.
+
+- fsl,fman-best-effort-port
+		Usage: optional
+		Value type: boolean
+		Definition: Can be defined only if 10G-support is set.
+		This property marks a best-effort 10G port (10G port that
+		may not be capable of line rate).
+
 EXAMPLE
 
 port@a8000 {
-- 
cgit v0.10.2


From f1b3b4450dcbfd0098b6fc3b00f4880701550c83 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Tue, 26 May 2015 12:55:51 +0200
Subject: powerpc/85xx: Replace CONFIG_USB_ISP1760_HCD by CONFIG_USB_ISP1760

Since commit 100832abf065bc18 ("usb: isp1760: Make HCD support
optional"), CONFIG_USB_ISP1760_HCD is automatically selected when
needed.  Enabling that option in the defconfig is now a no-op, and no
longer enables ISP1760 HCD support.

Re-enable the ISP1760 driver in the defconfig by enabling
USB_ISP1760_HOST_ROLE instead.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Scott Wood <scottwood@freescale.com>

diff --git a/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig b/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig
index 34f3ea1..858b539 100644
--- a/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig
+++ b/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig
@@ -108,7 +108,7 @@ CONFIG_SENSORS_LM90=y
 CONFIG_WATCHDOG=y
 CONFIG_USB=y
 CONFIG_USB_MON=y
-CONFIG_USB_ISP1760_HCD=y
+CONFIG_USB_ISP1760=y
 CONFIG_USB_STORAGE=y
 CONFIG_NEW_LEDS=y
 CONFIG_LEDS_CLASS=y
-- 
cgit v0.10.2


From 502f159c0239863deebfc50e09c0892d0c157101 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 3 Jun 2015 14:52:59 +1000
Subject: powerpc/eeh: Fix trivial error in eeh_restore_dev_state()

Commit 28158cd "powerpc/eeh: Enhance pcibios_set_pcie_reset_state()"
introduced a fix for a problem where certain configurations could lead to
pci_reset_function() destroying the state of PCI devices other than the one
specified.

Unfortunately, the fix has a trivial bug - it calls pci_save_state() again,
when it should be calling pci_restore_state().  This corrects the problem.

Cc: Gavin Shan <gwshan@au1.ibm.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Acked-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 23e0aa7..51dcdf6 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -717,7 +717,7 @@ static void *eeh_restore_dev_state(void *data, void *userdata)
 
 	/* The caller should restore state for the specified device */
 	if (pdev != dev)
-		pci_save_state(pdev);
+		pci_restore_state(pdev);
 
 	return NULL;
 }
-- 
cgit v0.10.2


From f67b4938af07e3ef3a925027a2fc62199c026e2b Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Fri, 5 Jun 2015 14:38:26 +1000
Subject: cxl: Reset default context for vPHB on release

When we release the device, we should also invalidate the default context.
With this cxl_get_context() will return null after removal.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/vphb.c b/drivers/misc/cxl/vphb.c
index f733309..b1d1983a 100644
--- a/drivers/misc/cxl/vphb.c
+++ b/drivers/misc/cxl/vphb.c
@@ -72,6 +72,7 @@ static void cxl_pci_disable_device(struct pci_dev *dev)
 			dev_err(&dev->dev, "Default context started\n");
 			return;
 		}
+		dev->dev.archdata.cxl_ctx = NULL;
 		cxl_release_context(ctx);
 	}
 }
-- 
cgit v0.10.2


From c952c1c48233b95f838c8743f09feace1f0e6cab Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Thu, 21 May 2015 12:13:01 +0530
Subject: powerpc: Fix handling of DSCR related facility unavailable exception

Currently DSCR (Data Stream Control Register) can be accessed with
mfspr or mtspr instructions inside a thread via two different SPR
numbers. One being the user accessible problem state SPR number 0x03
and the other being the privilege state SPR number 0x11. All access
through the privilege state SPR number get emulated through illegal
instruction exception. Any access through the problem state SPR number
raises one facility unavailable exception which sets the thread based
dscr_inherit bit and enables DSCR facility through FSCR register thus
allowing direct access to DSCR without going through this exception in
the future. We set the thread.dscr_inherit bit whether the access was
with mfspr or mtspr instruction which is neither correct nor does it
match the behaviour through the instruction emulation code path driven
from privilege state SPR number. User currently observes two different
kind of behaviour when accessing the DSCR through these two SPR numbers.
This problem can be observed through these two test cases by replacing
the privilege state SPR number with the problem state SPR number.

	(1) http://ozlabs.org/~anton/junkcode/dscr_default_test.c
	(2) http://ozlabs.org/~anton/junkcode/dscr_explicit_test.c

This patch fixes the problem by making sure that the behaviour visible
to the user remains the same irrespective of which SPR number is being
used. Inside facility unavailable exception, we check whether it was
cuased by a mfspr or a mtspr isntrucction. In case of mfspr instruction,
just emulate the instruction. In case of mtspr instruction, set the
thread based dscr_inherit bit and also enable the facility through FSCR.
All user SPR based mfspr instruction will be emulated till one user SPR
based mtspr has been executed.

Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 19e4744..6530f1b 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1377,6 +1377,7 @@ void facility_unavailable_exception(struct pt_regs *regs)
 	};
 	char *facility = "unknown";
 	u64 value;
+	u32 instword, rd;
 	u8 status;
 	bool hv;
 
@@ -1388,12 +1389,46 @@ void facility_unavailable_exception(struct pt_regs *regs)
 
 	status = value >> 56;
 	if (status == FSCR_DSCR_LG) {
-		/* User is acessing the DSCR.  Set the inherit bit and allow
-		 * the user to set it directly in future by setting via the
-		 * FSCR DSCR bit.  We always leave HFSCR DSCR set.
+		/*
+		 * User is accessing the DSCR register using the problem
+		 * state only SPR number (0x03) either through a mfspr or
+		 * a mtspr instruction. If it is a write attempt through
+		 * a mtspr, then we set the inherit bit. This also allows
+		 * the user to write or read the register directly in the
+		 * future by setting via the FSCR DSCR bit. But in case it
+		 * is a read DSCR attempt through a mfspr instruction, we
+		 * just emulate the instruction instead. This code path will
+		 * always emulate all the mfspr instructions till the user
+		 * has attempted atleast one mtspr instruction. This way it
+		 * preserves the same behaviour when the user is accessing
+		 * the DSCR through privilege level only SPR number (0x11)
+		 * which is emulated through illegal instruction exception.
+		 * We always leave HFSCR DSCR set.
 		 */
-		current->thread.dscr_inherit = 1;
-		mtspr(SPRN_FSCR, value | FSCR_DSCR);
+		if (get_user(instword, (u32 __user *)(regs->nip))) {
+			pr_err("Failed to fetch the user instruction\n");
+			return;
+		}
+
+		/* Write into DSCR (mtspr 0x03, RS) */
+		if ((instword & PPC_INST_MTSPR_DSCR_USER_MASK)
+				== PPC_INST_MTSPR_DSCR_USER) {
+			rd = (instword >> 21) & 0x1f;
+			current->thread.dscr = regs->gpr[rd];
+			current->thread.dscr_inherit = 1;
+			mtspr(SPRN_FSCR, value | FSCR_DSCR);
+		}
+
+		/* Read from DSCR (mfspr RT, 0x03) */
+		if ((instword & PPC_INST_MFSPR_DSCR_USER_MASK)
+				== PPC_INST_MFSPR_DSCR_USER) {
+			if (emulate_instruction(regs)) {
+				pr_err("DSCR based mfspr emulation failed\n");
+				return;
+			}
+			regs->nip += 4;
+			emulate_single_step(regs);
+		}
 		return;
 	}
 
-- 
cgit v0.10.2


From 280e109992831ee16a0f74ae887c08fb26c021f1 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Thu, 21 May 2015 12:13:02 +0530
Subject: powerpc/kernel: Remove the unused extern dscr_default

The process context switch code no longer uses dscr_default variable
from the sysfs.c file. The variable became unused when we started
storing the CPU specific DSCR value in the PACA structure instead.
This patch just removes this extern declaration. It was originally
added by the following commit.

Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index febb50d..8005e18 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1112,7 +1112,6 @@ static void setup_ksp_vsid(struct task_struct *p, unsigned long sp)
 /*
  * Copy a thread..
  */
-extern unsigned long dscr_default; /* defined in arch/powerpc/kernel/sysfs.c */
 
 /*
  * Copy architecture-specific thread state
-- 
cgit v0.10.2


From 1db365258ad9c3624897f48c764f8c557f492b26 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Thu, 21 May 2015 12:13:03 +0530
Subject: powerpc/kernel: Rename PACA_DSCR to PACA_DSCR_DEFAULT

PACA_DSCR offset macro tracks dscr_default element in the paca
structure. Better change the name of this macro to match that of the
data element it tracks. Makes the code more readable.

Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 0034b6b..9823057 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -247,7 +247,7 @@ int main(void)
 #endif
 	DEFINE(PACAHWCPUID, offsetof(struct paca_struct, hw_cpu_id));
 	DEFINE(PACAKEXECSTATE, offsetof(struct paca_struct, kexec_state));
-	DEFINE(PACA_DSCR, offsetof(struct paca_struct, dscr_default));
+	DEFINE(PACA_DSCR_DEFAULT, offsetof(struct paca_struct, dscr_default));
 	DEFINE(PACA_STARTTIME, offsetof(struct paca_struct, starttime));
 	DEFINE(PACA_STARTTIME_USER, offsetof(struct paca_struct, starttime_user));
 	DEFINE(PACA_USER_TIME, offsetof(struct paca_struct, user_time));
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index afbc200..278888e 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -556,7 +556,7 @@ BEGIN_FTR_SECTION
 	ld	r0,THREAD_DSCR(r4)
 	cmpwi	r6,0
 	bne	1f
-	ld	r0,PACA_DSCR(r13)
+	ld	r0,PACA_DSCR_DEFAULT(r13)
 1:
 BEGIN_FTR_SECTION_NESTED(70)
 	mfspr	r8, SPRN_FSCR
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index 5754b22..bf8f34a 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -293,7 +293,7 @@ dont_backup_fp:
 	ld	r2, STK_GOT(r1)
 
 	/* Load CPU's default DSCR */
-	ld	r0, PACA_DSCR(r13)
+	ld	r0, PACA_DSCR_DEFAULT(r13)
 	mtspr	SPRN_DSCR, r0
 
 	blr
@@ -473,7 +473,7 @@ restore_gprs:
 	ld	r2, STK_GOT(r1)
 
 	/* Load CPU's default DSCR */
-	ld	r0, PACA_DSCR(r13)
+	ld	r0, PACA_DSCR_DEFAULT(r13)
 	mtspr	SPRN_DSCR, r0
 
 	blr
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 4d70df2..faa86e9 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -324,7 +324,7 @@ kvm_start_guest:
 kvm_secondary_got_guest:
 
 	/* Set HSTATE_DSCR(r13) to something sensible */
-	ld	r6, PACA_DSCR(r13)
+	ld	r6, PACA_DSCR_DEFAULT(r13)
 	std	r6, HSTATE_DSCR(r13)
 
 	/* Order load of vcore, ptid etc. after load of vcpu */
-- 
cgit v0.10.2


From d3cb06e0cdc9841b289a0d68acaffd2868504902 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Thu, 21 May 2015 12:13:04 +0530
Subject: powerpc/dscr: Add some in-code documentation

This patch adds some in-code documentation to the DSCR related code to
make it more readable without having any functional change to it.

Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index bf117d8..28ded5d 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -295,6 +295,15 @@ struct thread_struct {
 #endif
 #ifdef CONFIG_PPC64
 	unsigned long	dscr;
+	/*
+	 * This member element dscr_inherit indicates that the process
+	 * has explicitly attempted and changed the DSCR register value
+	 * for itself. Hence kernel wont use the default CPU DSCR value
+	 * contained in the PACA structure anymore during process context
+	 * switch. Once this variable is set, this behaviour will also be
+	 * inherited to all the children of this process from that point
+	 * onwards.
+	 */
 	int		dscr_inherit;
 	unsigned long	ppr;	/* used to save/restore SMT priority */
 #endif
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index fa1fd8a..692873b 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -496,13 +496,34 @@ static DEVICE_ATTR(spurr, 0400, show_spurr, NULL);
 static DEVICE_ATTR(purr, 0400, show_purr, store_purr);
 static DEVICE_ATTR(pir, 0400, show_pir, NULL);
 
+/*
+ * This is the system wide DSCR register default value. Any
+ * change to this default value through the sysfs interface
+ * will update all per cpu DSCR default values across the
+ * system stored in their respective PACA structures.
+ */
 static unsigned long dscr_default;
 
+/**
+ * read_dscr() - Fetch the cpu specific DSCR default
+ * @val:	Returned cpu specific DSCR default value
+ *
+ * This function returns the per cpu DSCR default value
+ * for any cpu which is contained in it's PACA structure.
+ */
 static void read_dscr(void *val)
 {
 	*(unsigned long *)val = get_paca()->dscr_default;
 }
 
+
+/**
+ * write_dscr() - Update the cpu specific DSCR default
+ * @val:	New cpu specific DSCR default value to update
+ *
+ * This function updates the per cpu DSCR default value
+ * for any cpu which is contained in it's PACA structure.
+ */
 static void write_dscr(void *val)
 {
 	get_paca()->dscr_default = *(unsigned long *)val;
@@ -520,12 +541,29 @@ static void add_write_permission_dev_attr(struct device_attribute *attr)
 	attr->attr.mode |= 0200;
 }
 
+/**
+ * show_dscr_default() - Fetch the system wide DSCR default
+ * @dev:	Device structure
+ * @attr:	Device attribute structure
+ * @buf:	Interface buffer
+ *
+ * This function returns the system wide DSCR default value.
+ */
 static ssize_t show_dscr_default(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%lx\n", dscr_default);
 }
 
+/**
+ * store_dscr_default() - Update the system wide DSCR default
+ * @dev:	Device structure
+ * @attr:	Device attribute structure
+ * @buf:	Interface buffer
+ * @count:	Size of the update
+ *
+ * This function updates the system wide DSCR default value.
+ */
 static ssize_t __used store_dscr_default(struct device *dev,
 		struct device_attribute *attr, const char *buf,
 		size_t count)
-- 
cgit v0.10.2


From 4eb2088c07cdc90d43a5a02d13c2f8aa20c470a9 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Thu, 21 May 2015 12:13:05 +0530
Subject: powerpc/dscr: Add documentation for DSCR support

This patch adds a new documentation file explaining the DSCR support on
powerpc platforms. This explains DSCR related data structure, code paths
and also available user interfaces. Any further functional changes to
the DSCR support in the kernel should definitely update the
documentation here.

Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/Documentation/powerpc/00-INDEX b/Documentation/powerpc/00-INDEX
index 6fd0e8b..9dc845c 100644
--- a/Documentation/powerpc/00-INDEX
+++ b/Documentation/powerpc/00-INDEX
@@ -30,3 +30,5 @@ ptrace.txt
 	- Information on the ptrace interfaces for hardware debug registers.
 transactional_memory.txt
 	- Overview of the Power8 transactional memory support.
+dscr.txt
+	- Overview DSCR (Data Stream Control Register) support.
diff --git a/Documentation/powerpc/dscr.txt b/Documentation/powerpc/dscr.txt
new file mode 100644
index 0000000..1ff4400
--- /dev/null
+++ b/Documentation/powerpc/dscr.txt
@@ -0,0 +1,83 @@
+			DSCR (Data Stream Control Register)
+		================================================
+
+DSCR register in powerpc allows user to have some control of prefetch of data
+stream in the processor. Please refer to the ISA documents or related manual
+for more detailed information regarding how to use this DSCR to attain this
+control of the pefetches . This document here provides an overview of kernel
+support for DSCR, related kernel objects, it's functionalities and exported
+user interface.
+
+(A) Data Structures:
+
+	(1) thread_struct:
+		dscr		/* Thread DSCR value */
+		dscr_inherit	/* Thread has changed default DSCR */
+
+	(2) PACA:
+		dscr_default	/* per-CPU DSCR default value */
+
+	(3) sysfs.c:
+		dscr_default	/* System DSCR default value */
+
+(B) Scheduler Changes:
+
+	Scheduler will write the per-CPU DSCR default which is stored in the
+	CPU's PACA value into the register if the thread has dscr_inherit value
+	cleared which means that it has not changed the default DSCR till now.
+	If the dscr_inherit value is set which means that it has changed the
+	default DSCR value, scheduler will write the changed value which will
+	now be contained in thread struct's dscr into the register instead of
+	the per-CPU default PACA based DSCR value.
+
+	NOTE: Please note here that the system wide global DSCR value never
+	gets used directly in the scheduler process context switch at all.
+
+(C) SYSFS Interface:
+
+	Global DSCR default:		/sys/devices/system/cpu/dscr_default
+	CPU specific DSCR default:	/sys/devices/system/cpu/cpuN/dscr
+
+	Changing the global DSCR default in the sysfs will change all the CPU
+	specific DSCR defaults immediately in their PACA structures. Again if
+	the current process has the dscr_inherit clear, it also writes the new
+	value into every CPU's DSCR register right away and updates the current
+	thread's DSCR value as well.
+
+	Changing the CPU specif DSCR default value in the sysfs does exactly
+	the same thing as above but unlike the global one above, it just changes
+	stuff for that particular CPU instead for all the CPUs on the system.
+
+(D) User Space Instructions:
+
+	The DSCR register can be accessed in the user space using any of these
+	two SPR numbers available for that purpose.
+
+	(1) Problem state SPR:		0x03	(Un-privileged, POWER8 only)
+	(2) Privileged state SPR:	0x11	(Privileged)
+
+	Accessing DSCR through privileged SPR number (0x11) from user space
+	works, as it is emulated following an illegal instruction exception
+	inside the kernel. Both mfspr and mtspr instructions are emulated.
+
+	Accessing DSCR through user level SPR (0x03) from user space will first
+	create a facility unavailable exception. Inside this exception handler
+	all mfspr isntruction based read attempts will get emulated and returned
+	where as the first mtspr instruction based write attempts will enable
+	the DSCR facility for the next time around (both for read and write) by
+	setting DSCR facility in the FSCR register.
+
+(E) Specifics about 'dscr_inherit':
+
+	The thread struct element 'dscr_inherit' represents whether the thread
+	in question has attempted and changed the DSCR itself using any of the
+	following methods. This element signifies whether the thread wants to
+	use the CPU default DSCR value or its own changed DSCR value in the
+	kernel.
+
+		(1) mtspr instruction	(SPR number 0x03)
+		(2) mtspr instruction	(SPR number 0x11)
+		(3) ptrace interface	(Explicitly set user DSCR value)
+
+	Any child of the process created after this event in the process inherits
+	this same behaviour as well.
-- 
cgit v0.10.2


From 4c6315f8f9387d638ccb1cc3a46a28a82a109928 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Thu, 21 May 2015 12:13:06 +0530
Subject: selftests/powerpc: Add test for system wide DSCR default

This patch adds a test case for the system wide DSCR default value,
which when changed through it's sysfs interface must be visible to all
threads reading DSCR either through the privilege state SPR or the
problem state SPR. The DSCR value change should be immediate as well.

Acked-by: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile
index 5ad0423..03ca2e6 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -12,7 +12,7 @@ CFLAGS := -Wall -O2 -flto -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(CUR
 
 export CFLAGS
 
-SUB_DIRS = pmu copyloops mm tm primitives stringloops vphn switch_endian
+SUB_DIRS = pmu copyloops mm tm primitives stringloops vphn switch_endian dscr
 
 endif
 
diff --git a/tools/testing/selftests/powerpc/dscr/Makefile b/tools/testing/selftests/powerpc/dscr/Makefile
new file mode 100644
index 0000000..8e97538
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/Makefile
@@ -0,0 +1,12 @@
+TEST_PROGS := dscr_default_test
+
+dscr_default_test: LDLIBS += -lpthread
+
+all: $(TEST_PROGS)
+
+$(TEST_PROGS): ../harness.c
+
+include ../../lib.mk
+
+clean:
+	rm -f $(TEST_PROGS) *.o
diff --git a/tools/testing/selftests/powerpc/dscr/dscr.h b/tools/testing/selftests/powerpc/dscr/dscr.h
new file mode 100644
index 0000000..a36af1b
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/dscr.h
@@ -0,0 +1,127 @@
+/*
+ * POWER Data Stream Control Register (DSCR)
+ *
+ * This header file contains helper functions and macros
+ * required for all the DSCR related test cases.
+ *
+ * Copyright 2012, Anton Blanchard, IBM Corporation.
+ * Copyright 2015, Anshuman Khandual, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+#ifndef _SELFTESTS_POWERPC_DSCR_DSCR_H
+#define _SELFTESTS_POWERPC_DSCR_DSCR_H
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <pthread.h>
+#include <sched.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+
+#include "utils.h"
+
+#define SPRN_DSCR	0x11	/* Privilege state SPR */
+#define SPRN_DSCR_USR	0x03	/* Problem state SPR */
+#define THREADS		100	/* Max threads */
+#define COUNT		100	/* Max iterations */
+#define DSCR_MAX	16	/* Max DSCR value */
+#define LEN_MAX		100	/* Max name length */
+
+#define DSCR_DEFAULT	"/sys/devices/system/cpu/dscr_default"
+#define CPU_PATH	"/sys/devices/system/cpu/"
+
+#define rmb()  asm volatile("lwsync":::"memory")
+#define wmb()  asm volatile("lwsync":::"memory")
+
+#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
+
+/* Prilvilege state DSCR access */
+inline unsigned long get_dscr(void)
+{
+	unsigned long ret;
+
+	asm volatile("mfspr %0,%1" : "=r" (ret): "i" (SPRN_DSCR));
+
+	return ret;
+}
+
+inline void set_dscr(unsigned long val)
+{
+	asm volatile("mtspr %1,%0" : : "r" (val), "i" (SPRN_DSCR));
+}
+
+/* Problem state DSCR access */
+inline unsigned long get_dscr_usr(void)
+{
+	unsigned long ret;
+
+	asm volatile("mfspr %0,%1" : "=r" (ret): "i" (SPRN_DSCR_USR));
+
+	return ret;
+}
+
+inline void set_dscr_usr(unsigned long val)
+{
+	asm volatile("mtspr %1,%0" : : "r" (val), "i" (SPRN_DSCR_USR));
+}
+
+/* Default DSCR access */
+unsigned long get_default_dscr(void)
+{
+	int fd = -1, ret;
+	char buf[16];
+	unsigned long val;
+
+	if (fd == -1) {
+		fd = open(DSCR_DEFAULT, O_RDONLY);
+		if (fd == -1) {
+			perror("open() failed");
+			exit(1);
+		}
+	}
+	memset(buf, 0, sizeof(buf));
+	lseek(fd, 0, SEEK_SET);
+	ret = read(fd, buf, sizeof(buf));
+	if (ret == -1) {
+		perror("read() failed");
+		exit(1);
+	}
+	sscanf(buf, "%lx", &val);
+	close(fd);
+	return val;
+}
+
+void set_default_dscr(unsigned long val)
+{
+	int fd = -1, ret;
+	char buf[16];
+
+	if (fd == -1) {
+		fd = open(DSCR_DEFAULT, O_RDWR);
+		if (fd == -1) {
+			perror("open() failed");
+			exit(1);
+		}
+	}
+	sprintf(buf, "%lx\n", val);
+	ret = write(fd, buf, strlen(buf));
+	if (ret == -1) {
+		perror("write() failed");
+		exit(1);
+	}
+	close(fd);
+}
+
+double uniform_deviate(int seed)
+{
+	return seed * (1.0 / (RAND_MAX + 1.0));
+}
+#endif	/* _SELFTESTS_POWERPC_DSCR_DSCR_H */
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_default_test.c b/tools/testing/selftests/powerpc/dscr/dscr_default_test.c
new file mode 100644
index 0000000..df17c3b
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/dscr_default_test.c
@@ -0,0 +1,127 @@
+/*
+ * POWER Data Stream Control Register (DSCR) default test
+ *
+ * This test modifies the system wide default DSCR through
+ * it's sysfs interface and then verifies that all threads
+ * see the correct changed DSCR value immediately.
+ *
+ * Copyright 2012, Anton Blanchard, IBM Corporation.
+ * Copyright 2015, Anshuman Khandual, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+#include "dscr.h"
+
+static unsigned long dscr;		/* System DSCR default */
+static unsigned long sequence;
+static unsigned long result[THREADS];
+
+static void *do_test(void *in)
+{
+	unsigned long thread = (unsigned long)in;
+	unsigned long i;
+
+	for (i = 0; i < COUNT; i++) {
+		unsigned long d, cur_dscr, cur_dscr_usr;
+		unsigned long s1, s2;
+
+		s1 = ACCESS_ONCE(sequence);
+		if (s1 & 1)
+			continue;
+		rmb();
+
+		d = dscr;
+		cur_dscr = get_dscr();
+		cur_dscr_usr = get_dscr_usr();
+
+		rmb();
+		s2 = sequence;
+
+		if (s1 != s2)
+			continue;
+
+		if (cur_dscr != d) {
+			fprintf(stderr, "thread %ld kernel DSCR should be %ld "
+				"but is %ld\n", thread, d, cur_dscr);
+			result[thread] = 1;
+			pthread_exit(&result[thread]);
+		}
+
+		if (cur_dscr_usr != d) {
+			fprintf(stderr, "thread %ld user DSCR should be %ld "
+				"but is %ld\n", thread, d, cur_dscr_usr);
+			result[thread] = 1;
+			pthread_exit(&result[thread]);
+		}
+	}
+	result[thread] = 0;
+	pthread_exit(&result[thread]);
+}
+
+int dscr_default(void)
+{
+	pthread_t threads[THREADS];
+	unsigned long i, *status[THREADS];
+	unsigned long orig_dscr_default;
+
+	orig_dscr_default = get_default_dscr();
+
+	/* Initial DSCR default */
+	dscr = 1;
+	set_default_dscr(dscr);
+
+	/* Spawn all testing threads */
+	for (i = 0; i < THREADS; i++) {
+		if (pthread_create(&threads[i], NULL, do_test, (void *)i)) {
+			perror("pthread_create() failed");
+			goto fail;
+		}
+	}
+
+	srand(getpid());
+
+	/* Keep changing the DSCR default */
+	for (i = 0; i < COUNT; i++) {
+		double ret = uniform_deviate(rand());
+
+		if (ret < 0.0001) {
+			sequence++;
+			wmb();
+
+			dscr++;
+			if (dscr > DSCR_MAX)
+				dscr = 0;
+
+			set_default_dscr(dscr);
+
+			wmb();
+			sequence++;
+		}
+	}
+
+	/* Individual testing thread exit status */
+	for (i = 0; i < THREADS; i++) {
+		if (pthread_join(threads[i], (void **)&(status[i]))) {
+			perror("pthread_join() failed");
+			goto fail;
+		}
+
+		if (*status[i]) {
+			printf("%ldth thread failed to join with %ld status\n",
+								i, *status[i]);
+			goto fail;
+		}
+	}
+	set_default_dscr(orig_dscr_default);
+	return 0;
+fail:
+	set_default_dscr(orig_dscr_default);
+	return 1;
+}
+
+int main(int argc, char *argv[])
+{
+	return test_harness(dscr_default, "dscr_default_test");
+}
-- 
cgit v0.10.2


From 1554f21a9890301d1b6586474396e7c9de586204 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Thu, 21 May 2015 12:13:07 +0530
Subject: selftests/powerpc: Add test for explicitly changing DSCR value

This patch adds a test which modifies the DSCR using mtspr instruction
and verifies the change using mfspr instruction. It uses both the
privilege state SPR as well as the problem state SPR for the purpose.

Acked-by: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/tools/testing/selftests/powerpc/dscr/Makefile b/tools/testing/selftests/powerpc/dscr/Makefile
index 8e97538..d72c320 100644
--- a/tools/testing/selftests/powerpc/dscr/Makefile
+++ b/tools/testing/selftests/powerpc/dscr/Makefile
@@ -1,4 +1,4 @@
-TEST_PROGS := dscr_default_test
+TEST_PROGS := dscr_default_test dscr_explicit_test
 
 dscr_default_test: LDLIBS += -lpthread
 
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_explicit_test.c b/tools/testing/selftests/powerpc/dscr/dscr_explicit_test.c
new file mode 100644
index 0000000..ad9c3ec
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/dscr_explicit_test.c
@@ -0,0 +1,71 @@
+/*
+ * POWER Data Stream Control Register (DSCR) explicit test
+ *
+ * This test modifies the DSCR value using mtspr instruction and
+ * verifies the change with mfspr instruction. It uses both the
+ * privilege state SPR and the problem state SPR for this purpose.
+ *
+ * When using the privilege state SPR, the instructions such as
+ * mfspr or mtspr are priviledged and the kernel emulates them
+ * for us. Instructions using problem state SPR can be exuecuted
+ * directly without any emulation if the HW supports them. Else
+ * they also get emulated by the kernel.
+ *
+ * Copyright 2012, Anton Blanchard, IBM Corporation.
+ * Copyright 2015, Anshuman Khandual, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+#include "dscr.h"
+
+int dscr_explicit(void)
+{
+	unsigned long i, dscr = 0;
+
+	srand(getpid());
+	set_dscr(dscr);
+
+	for (i = 0; i < COUNT; i++) {
+		unsigned long cur_dscr, cur_dscr_usr;
+		double ret = uniform_deviate(rand());
+
+		if (ret < 0.001) {
+			dscr++;
+			if (dscr > DSCR_MAX)
+				dscr = 0;
+
+			set_dscr(dscr);
+		}
+
+		cur_dscr = get_dscr();
+		if (cur_dscr != dscr) {
+			fprintf(stderr, "Kernel DSCR should be %ld but "
+					"is %ld\n", dscr, cur_dscr);
+			return 1;
+		}
+
+		ret = uniform_deviate(rand());
+		if (ret < 0.001) {
+			dscr++;
+			if (dscr > DSCR_MAX)
+				dscr = 0;
+
+			set_dscr_usr(dscr);
+		}
+
+		cur_dscr_usr = get_dscr_usr();
+		if (cur_dscr_usr != dscr) {
+			fprintf(stderr, "User DSCR should be %ld but "
+					"is %ld\n", dscr, cur_dscr_usr);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	return test_harness(dscr_explicit, "dscr_explicit_test");
+}
-- 
cgit v0.10.2


From cc6a93f1c3d6923ebf2f305416d080e9afc7ee95 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Thu, 21 May 2015 12:13:08 +0530
Subject: selftests/powerpc: Add test for DSCR SPR numbers

This patch adds a test which verifies that the DSCR privilege and
problem state SPR read & write accesses while making sure that the
results are always the same irrespective of which SPR number is being
used.

Acked-by: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/tools/testing/selftests/powerpc/dscr/Makefile b/tools/testing/selftests/powerpc/dscr/Makefile
index d72c320..3c3856b 100644
--- a/tools/testing/selftests/powerpc/dscr/Makefile
+++ b/tools/testing/selftests/powerpc/dscr/Makefile
@@ -1,4 +1,4 @@
-TEST_PROGS := dscr_default_test dscr_explicit_test
+TEST_PROGS := dscr_default_test dscr_explicit_test dscr_user_test
 
 dscr_default_test: LDLIBS += -lpthread
 
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_user_test.c b/tools/testing/selftests/powerpc/dscr/dscr_user_test.c
new file mode 100644
index 0000000..77d16b5
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/dscr_user_test.c
@@ -0,0 +1,61 @@
+/*
+ * POWER Data Stream Control Register (DSCR) SPR test
+ *
+ * This test modifies the DSCR value through both the SPR number
+ * based mtspr instruction and then makes sure that the same is
+ * reflected through mfspr instruction using either of the SPR
+ * numbers.
+ *
+ * When using the privilege state SPR, the instructions such as
+ * mfspr or mtspr are priviledged and the kernel emulates them
+ * for us. Instructions using problem state SPR can be exuecuted
+ * directly without any emulation if the HW supports them. Else
+ * they also get emulated by the kernel.
+ *
+ * Copyright 2013, Anton Blanchard, IBM Corporation.
+ * Copyright 2015, Anshuman Khandual, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+#include "dscr.h"
+
+static int check_dscr(char *str)
+{
+	unsigned long cur_dscr, cur_dscr_usr;
+
+	cur_dscr = get_dscr();
+	cur_dscr_usr = get_dscr_usr();
+	if (cur_dscr != cur_dscr_usr) {
+		printf("%s set, kernel get %lx != user get %lx\n",
+					str, cur_dscr, cur_dscr_usr);
+		return 1;
+	}
+	return 0;
+}
+
+int dscr_user(void)
+{
+	int i;
+
+	check_dscr("");
+
+	for (i = 0; i < COUNT; i++) {
+		set_dscr(i);
+		if (check_dscr("kernel"))
+			return 1;
+	}
+
+	for (i = 0; i < COUNT; i++) {
+		set_dscr_usr(i);
+		if (check_dscr("user"))
+			return 1;
+	}
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	return test_harness(dscr_user, "dscr_user_test");
+}
-- 
cgit v0.10.2


From cf9c4a5ecf806348940ee26b7ff0ff59f5530059 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Thu, 21 May 2015 12:13:09 +0530
Subject: selftests/powerpc: Add test for DSCR value inheritence across fork

This patch adds a test to verify that the changed DSCR value inside any
process would be inherited to it's child process across the fork system
call.

Acked-by: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/tools/testing/selftests/powerpc/dscr/Makefile b/tools/testing/selftests/powerpc/dscr/Makefile
index 3c3856b..764fadc 100644
--- a/tools/testing/selftests/powerpc/dscr/Makefile
+++ b/tools/testing/selftests/powerpc/dscr/Makefile
@@ -1,4 +1,5 @@
-TEST_PROGS := dscr_default_test dscr_explicit_test dscr_user_test
+TEST_PROGS := dscr_default_test dscr_explicit_test dscr_user_test	\
+	      dscr_inherit_test
 
 dscr_default_test: LDLIBS += -lpthread
 
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_inherit_test.c b/tools/testing/selftests/powerpc/dscr/dscr_inherit_test.c
new file mode 100644
index 0000000..4e414ca
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/dscr_inherit_test.c
@@ -0,0 +1,95 @@
+/*
+ * POWER Data Stream Control Register (DSCR) fork test
+ *
+ * This testcase modifies the DSCR using mtspr, forks and then
+ * verifies that the child process has the correct changed DSCR
+ * value using mfspr.
+ *
+ * When using the privilege state SPR, the instructions such as
+ * mfspr or mtspr are priviledged and the kernel emulates them
+ * for us. Instructions using problem state SPR can be exuecuted
+ * directly without any emulation if the HW supports them. Else
+ * they also get emulated by the kernel.
+ *
+ * Copyright 2012, Anton Blanchard, IBM Corporation.
+ * Copyright 2015, Anshuman Khandual, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+#include "dscr.h"
+
+int dscr_inherit(void)
+{
+	unsigned long i, dscr = 0;
+	pid_t pid;
+
+	srand(getpid());
+	set_dscr(dscr);
+
+	for (i = 0; i < COUNT; i++) {
+		unsigned long cur_dscr, cur_dscr_usr;
+
+		dscr++;
+		if (dscr > DSCR_MAX)
+			dscr = 0;
+
+		if (i % 2 == 0)
+			set_dscr_usr(dscr);
+		else
+			set_dscr(dscr);
+
+		/*
+		 * XXX: Force a context switch out so that DSCR
+		 * current value is copied into the thread struct
+		 * which is required for the child to inherit the
+		 * changed value.
+		 */
+		sleep(1);
+
+		pid = fork();
+		if (pid == -1) {
+			perror("fork() failed");
+			exit(1);
+		} else if (pid) {
+			int status;
+
+			if (waitpid(pid, &status, 0) == -1) {
+				perror("waitpid() failed");
+				exit(1);
+			}
+
+			if (!WIFEXITED(status)) {
+				fprintf(stderr, "Child didn't exit cleanly\n");
+				exit(1);
+			}
+
+			if (WEXITSTATUS(status) != 0) {
+				fprintf(stderr, "Child didn't exit cleanly\n");
+				return 1;
+			}
+		} else {
+			cur_dscr = get_dscr();
+			if (cur_dscr != dscr) {
+				fprintf(stderr, "Kernel DSCR should be %ld "
+					"but is %ld\n", dscr, cur_dscr);
+				exit(1);
+			}
+
+			cur_dscr_usr = get_dscr_usr();
+			if (cur_dscr_usr != dscr) {
+				fprintf(stderr, "User DSCR should be %ld "
+					"but is %ld\n", dscr, cur_dscr_usr);
+				exit(1);
+			}
+			exit(0);
+		}
+	}
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	return test_harness(dscr_inherit, "dscr_inherit_test");
+}
-- 
cgit v0.10.2


From 6f844261e16b3a66ccec29667b2713ce0bb8adc7 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Thu, 21 May 2015 12:13:10 +0530
Subject: selftests/powerpc: Add test for DSCR inheritence across fork & exec

This patch adds a test case to verify that the changed DSCR value inside
any process would be inherited to it's child across the fork and exec
system call.

Acked-by: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/tools/testing/selftests/powerpc/dscr/Makefile b/tools/testing/selftests/powerpc/dscr/Makefile
index 764fadc..0f53a4d 100644
--- a/tools/testing/selftests/powerpc/dscr/Makefile
+++ b/tools/testing/selftests/powerpc/dscr/Makefile
@@ -1,5 +1,5 @@
 TEST_PROGS := dscr_default_test dscr_explicit_test dscr_user_test	\
-	      dscr_inherit_test
+	      dscr_inherit_test dscr_inherit_exec_test
 
 dscr_default_test: LDLIBS += -lpthread
 
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_inherit_exec_test.c b/tools/testing/selftests/powerpc/dscr/dscr_inherit_exec_test.c
new file mode 100644
index 0000000..8265504
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/dscr_inherit_exec_test.c
@@ -0,0 +1,117 @@
+/*
+ * POWER Data Stream Control Register (DSCR) fork exec test
+ *
+ * This testcase modifies the DSCR using mtspr, forks & execs and
+ * verifies that the child is using the changed DSCR using mfspr.
+ *
+ * When using the privilege state SPR, the instructions such as
+ * mfspr or mtspr are priviledged and the kernel emulates them
+ * for us. Instructions using problem state SPR can be exuecuted
+ * directly without any emulation if the HW supports them. Else
+ * they also get emulated by the kernel.
+ *
+ * Copyright 2012, Anton Blanchard, IBM Corporation.
+ * Copyright 2015, Anshuman Khandual, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+#include "dscr.h"
+
+static char prog[LEN_MAX];
+
+static void do_exec(unsigned long parent_dscr)
+{
+	unsigned long cur_dscr, cur_dscr_usr;
+
+	cur_dscr = get_dscr();
+	cur_dscr_usr = get_dscr_usr();
+
+	if (cur_dscr != parent_dscr) {
+		fprintf(stderr, "Parent DSCR %ld was not inherited "
+				"over exec (kernel value)\n", parent_dscr);
+		exit(1);
+	}
+
+	if (cur_dscr_usr != parent_dscr) {
+		fprintf(stderr, "Parent DSCR %ld was not inherited "
+				"over exec (user value)\n", parent_dscr);
+		exit(1);
+	}
+	exit(0);
+}
+
+int dscr_inherit_exec(void)
+{
+	unsigned long i, dscr = 0;
+	pid_t pid;
+
+	for (i = 0; i < COUNT; i++) {
+		dscr++;
+		if (dscr > DSCR_MAX)
+			dscr = 0;
+
+		if (dscr == get_default_dscr())
+			continue;
+
+		if (i % 2 == 0)
+			set_dscr_usr(dscr);
+		else
+			set_dscr(dscr);
+
+		/*
+		 * XXX: Force a context switch out so that DSCR
+		 * current value is copied into the thread struct
+		 * which is required for the child to inherit the
+		 * changed value.
+		 */
+		sleep(1);
+
+		pid = fork();
+		if (pid == -1) {
+			perror("fork() failed");
+			exit(1);
+		} else if (pid) {
+			int status;
+
+			if (waitpid(pid, &status, 0) == -1) {
+				perror("waitpid() failed");
+				exit(1);
+			}
+
+			if (!WIFEXITED(status)) {
+				fprintf(stderr, "Child didn't exit cleanly\n");
+				exit(1);
+			}
+
+			if (WEXITSTATUS(status) != 0) {
+				fprintf(stderr, "Child didn't exit cleanly\n");
+				return 1;
+			}
+		} else {
+			char dscr_str[16];
+
+			sprintf(dscr_str, "%ld", dscr);
+			execlp(prog, prog, "exec", dscr_str, NULL);
+			exit(1);
+		}
+	}
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	if (argc == 3 && !strcmp(argv[1], "exec")) {
+		unsigned long parent_dscr;
+
+		parent_dscr = atoi(argv[2]);
+		do_exec(parent_dscr);
+	} else if (argc != 1) {
+		fprintf(stderr, "Usage: %s\n", argv[0]);
+		exit(1);
+	}
+
+	strncpy(prog, argv[0], strlen(argv[0]));
+	return test_harness(dscr_inherit_exec, "dscr_inherit_exec_test");
+}
-- 
cgit v0.10.2


From ebd5858c904b1c795f1511758a7ac043afd44438 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Thu, 21 May 2015 12:13:11 +0530
Subject: selftests/powerpc: Add test for all DSCR sysfs interfaces

This test continuously updates the system wide DSCR default value in the
sysfs interface and makes sure that the same is reflected across all the
sysfs interfaces for each individual CPUs present on the system.

Acked-by: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/tools/testing/selftests/powerpc/dscr/Makefile b/tools/testing/selftests/powerpc/dscr/Makefile
index 0f53a4d..de0b53b 100644
--- a/tools/testing/selftests/powerpc/dscr/Makefile
+++ b/tools/testing/selftests/powerpc/dscr/Makefile
@@ -1,5 +1,5 @@
 TEST_PROGS := dscr_default_test dscr_explicit_test dscr_user_test	\
-	      dscr_inherit_test dscr_inherit_exec_test
+	      dscr_inherit_test dscr_inherit_exec_test dscr_sysfs_test
 
 dscr_default_test: LDLIBS += -lpthread
 
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c
new file mode 100644
index 0000000..17fb1b4
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c
@@ -0,0 +1,97 @@
+/*
+ * POWER Data Stream Control Register (DSCR) sysfs interface test
+ *
+ * This test updates to system wide DSCR default through the sysfs interface
+ * and then verifies that all the CPU specific DSCR defaults are updated as
+ * well verified from their sysfs interfaces.
+ *
+ * Copyright 2015, Anshuman Khandual, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+#include "dscr.h"
+
+static int check_cpu_dscr_default(char *file, unsigned long val)
+{
+	char buf[10];
+	int fd, rc;
+
+	fd = open(file, O_RDWR);
+	if (fd == -1) {
+		perror("open() failed");
+		return 1;
+	}
+
+	rc = read(fd, buf, sizeof(buf));
+	if (rc == -1) {
+		perror("read() failed");
+		return 1;
+	}
+	close(fd);
+
+	buf[rc] = '\0';
+	if (strtol(buf, NULL, 16) != val) {
+		printf("DSCR match failed: %ld (system) %ld (cpu)\n",
+					val, strtol(buf, NULL, 16));
+		return 1;
+	}
+	return 0;
+}
+
+static int check_all_cpu_dscr_defaults(unsigned long val)
+{
+	DIR *sysfs;
+	struct dirent *dp;
+	char file[LEN_MAX];
+
+	sysfs = opendir(CPU_PATH);
+	if (!sysfs) {
+		perror("opendir() failed");
+		return 1;
+	}
+
+	while ((dp = readdir(sysfs))) {
+		if (!(dp->d_type & DT_DIR))
+			continue;
+		if (!strcmp(dp->d_name, "cpuidle"))
+			continue;
+		if (!strstr(dp->d_name, "cpu"))
+			continue;
+
+		sprintf(file, "%s%s/dscr", CPU_PATH, dp->d_name);
+		if (access(file, F_OK))
+			continue;
+
+		if (check_cpu_dscr_default(file, val))
+			return 1;
+	}
+	closedir(sysfs);
+	return 0;
+}
+
+int dscr_sysfs(void)
+{
+	unsigned long orig_dscr_default;
+	int i, j;
+
+	orig_dscr_default = get_default_dscr();
+	for (i = 0; i < COUNT; i++) {
+		for (j = 0; j < DSCR_MAX; j++) {
+			set_default_dscr(j);
+			if (check_all_cpu_dscr_defaults(j))
+				goto fail;
+		}
+	}
+	set_default_dscr(orig_dscr_default);
+	return 0;
+fail:
+	set_default_dscr(orig_dscr_default);
+	return 1;
+}
+
+int main(int argc, char *argv[])
+{
+	return test_harness(dscr_sysfs, "dscr_sysfs_test");
+}
-- 
cgit v0.10.2


From af0be08ee6ab83666f2cb3c56285cf7a312eaeab Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Sun, 7 Jun 2015 19:25:16 +1000
Subject: selftests/powerpc: Add thread based stress test for DSCR sysfs
 interfaces

This patch adds a test to update the system wide DSCR value repeatedly
and then verifies that any thread on any given CPU on the system must be
able to see the same DSCR value whether its is being read through the
problem state based SPR or the privilege state based SPR.

Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/tools/testing/selftests/powerpc/dscr/Makefile b/tools/testing/selftests/powerpc/dscr/Makefile
index de0b53b..49327ee 100644
--- a/tools/testing/selftests/powerpc/dscr/Makefile
+++ b/tools/testing/selftests/powerpc/dscr/Makefile
@@ -1,5 +1,6 @@
 TEST_PROGS := dscr_default_test dscr_explicit_test dscr_user_test	\
-	      dscr_inherit_test dscr_inherit_exec_test dscr_sysfs_test
+	      dscr_inherit_test dscr_inherit_exec_test dscr_sysfs_test	\
+	      dscr_sysfs_thread_test
 
 dscr_default_test: LDLIBS += -lpthread
 
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_sysfs_thread_test.c b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_thread_test.c
new file mode 100644
index 0000000..ad97b59
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_thread_test.c
@@ -0,0 +1,80 @@
+/*
+ * POWER Data Stream Control Register (DSCR) sysfs thread test
+ *
+ * This test updates the system wide DSCR default value through
+ * sysfs interface which should then update all the CPU specific
+ * DSCR default values which must also be then visible to threads
+ * executing on individual CPUs on the system.
+ *
+ * Copyright 2015, Anshuman Khandual, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+#define _GNU_SOURCE
+#include "dscr.h"
+
+static int test_thread_dscr(unsigned long val)
+{
+	unsigned long cur_dscr, cur_dscr_usr;
+
+	cur_dscr = get_dscr();
+	cur_dscr_usr = get_dscr_usr();
+
+	if (val != cur_dscr) {
+		printf("[cpu %d] Kernel DSCR should be %ld but is %ld\n",
+					sched_getcpu(), val, cur_dscr);
+		return 1;
+	}
+
+	if (val != cur_dscr_usr) {
+		printf("[cpu %d] User DSCR should be %ld but is %ld\n",
+					sched_getcpu(), val, cur_dscr_usr);
+		return 1;
+	}
+	return 0;
+}
+
+static int check_cpu_dscr_thread(unsigned long val)
+{
+	cpu_set_t mask;
+	int cpu;
+
+	for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+		CPU_ZERO(&mask);
+		CPU_SET(cpu, &mask);
+		if (sched_setaffinity(0, sizeof(mask), &mask))
+			continue;
+
+		if (test_thread_dscr(val))
+			return 1;
+	}
+	return 0;
+
+}
+
+int dscr_sysfs_thread(void)
+{
+	unsigned long orig_dscr_default;
+	int i, j;
+
+	orig_dscr_default = get_default_dscr();
+	for (i = 0; i < COUNT; i++) {
+		for (j = 0; j < DSCR_MAX; j++) {
+			set_default_dscr(j);
+			if (check_cpu_dscr_thread(j))
+				goto fail;
+		}
+	}
+	set_default_dscr(orig_dscr_default);
+	return 0;
+fail:
+	set_default_dscr(orig_dscr_default);
+	return 1;
+}
+
+int main(int argc, char *argv[])
+{
+	return test_harness(dscr_sysfs_thread, "dscr_sysfs_thread_test");
+}
-- 
cgit v0.10.2


From 809fac67c8530704a214e2c476349692e2fb9299 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Date: Thu, 21 May 2015 12:13:13 +0530
Subject: selftests/powerpc: Add gitignore file for the new DSCR tests

This patch adds .gitignore for all the newly added DSCR tests.

Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/tools/testing/selftests/powerpc/dscr/.gitignore b/tools/testing/selftests/powerpc/dscr/.gitignore
new file mode 100644
index 0000000..b585c6c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/.gitignore
@@ -0,0 +1,7 @@
+dscr_default_test
+dscr_explicit_test
+dscr_inherit_exec_test
+dscr_inherit_test
+dscr_sysfs_test
+dscr_sysfs_thread_test
+dscr_user_test
-- 
cgit v0.10.2


From cfcb3d80a28380ba027331eb548ba309c4b66559 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 14 Apr 2015 13:05:57 +0530
Subject: powerpc/mm: Add trace point for tracking hash pte fault

This enables us to understand how many hash fault we are taking
when running benchmarks.

For ex:
-bash-4.2# ./perf stat -e  powerpc:hash_fault -e page-faults /tmp/ebizzy.ppc64 -S 30  -P -n 1000
...

 Performance counter stats for '/tmp/ebizzy.ppc64 -S 30 -P -n 1000':

       1,10,04,075      powerpc:hash_fault
       1,10,03,429      page-faults

      30.865978991 seconds time elapsed

NOTE:
The impact of the tracepoint was not noticeable when running test. It was
within the run-time variance of the test. For ex:

without-patch:
--------------

 Performance counter stats for './a.out 3000 300':

	       643      page-faults               #    0.089 M/sec
	  7.236562      task-clock (msec)         #    0.928 CPUs utilized
	 2,179,213      stalled-cycles-frontend   #    0.00% frontend cycles idle
	17,174,367      stalled-cycles-backend    #    0.00% backend  cycles idle
		 0      context-switches          #    0.000 K/sec

       0.007794658 seconds time elapsed

And with-patch:
---------------

 Performance counter stats for './a.out 3000 300':

	       643      page-faults               #    0.089 M/sec
	  7.233746      task-clock (msec)         #    0.921 CPUs utilized
		 0      context-switches          #    0.000 K/sec

       0.007854876 seconds time elapsed

 Performance counter stats for './a.out 3000 300':

	       643      page-faults               #    0.087 M/sec
	       649      powerpc:hash_fault        #    0.087 M/sec
	  7.430376      task-clock (msec)         #    0.938 CPUs utilized
	 2,347,174      stalled-cycles-frontend   #    0.00% frontend cycles idle
	17,524,282      stalled-cycles-backend    #    0.00% backend  cycles idle
		 0      context-switches          #    0.000 K/sec

       0.007920284 seconds time elapsed

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h
index c15da60..8e86b48 100644
--- a/arch/powerpc/include/asm/trace.h
+++ b/arch/powerpc/include/asm/trace.h
@@ -144,6 +144,26 @@ TRACE_EVENT_FN(opal_exit,
 );
 #endif
 
+TRACE_EVENT(hash_fault,
+
+	    TP_PROTO(unsigned long addr, unsigned long access, unsigned long trap),
+	    TP_ARGS(addr, access, trap),
+	    TP_STRUCT__entry(
+		    __field(unsigned long, addr)
+		    __field(unsigned long, access)
+		    __field(unsigned long, trap)
+		    ),
+
+	    TP_fast_assign(
+		    __entry->addr = addr;
+		    __entry->access = access;
+		    __entry->trap = trap;
+		    ),
+
+	    TP_printk("hash fault with addr 0x%lx and access = 0x%lx trap = 0x%lx",
+		      __entry->addr, __entry->access, __entry->trap)
+);
+
 #endif /* _TRACE_POWERPC_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index a04ca92..5ec987f 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -57,6 +57,7 @@
 #include <asm/fadump.h>
 #include <asm/firmware.h>
 #include <asm/tm.h>
+#include <asm/trace.h>
 
 #ifdef DEBUG
 #define DBG(fmt...) udbg_printf(fmt)
@@ -1004,6 +1005,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
 
 	DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
 		ea, access, trap);
+	trace_hash_fault(ea, access, trap);
 
 	/* Get region & vsid */
  	switch (REGION_ID(ea)) {
-- 
cgit v0.10.2


From 1cbb4a1c433a1ca054ef5363f4e6597b43d208cc Mon Sep 17 00:00:00 2001
From: Cyril Bur <cyrilbur@gmail.com>
Date: Tue, 2 Jun 2015 14:26:09 +1000
Subject: mtd: powernv: Add powernv flash MTD abstraction driver

Powerpc powernv platforms allow access to certain system flash devices
through a firmwarwe interface. This change adds an mtd driver for these
flash devices.

Minor updates from Jeremy Kerr and Joel Stanley.

Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
Signed-off-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Reviewed-by: Neelesh Gupta <neelegup@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/mtd/devices/Kconfig b/drivers/mtd/devices/Kconfig
index c49d0b1..f73c416 100644
--- a/drivers/mtd/devices/Kconfig
+++ b/drivers/mtd/devices/Kconfig
@@ -195,6 +195,14 @@ config MTD_BLOCK2MTD
 	  Testing MTD users (eg JFFS2) on large media and media that might
 	  be removed during a write (using the floppy drive).
 
+config MTD_POWERNV_FLASH
+	tristate "powernv flash MTD driver"
+	depends on PPC_POWERNV
+	help
+	  This provides an MTD device to access flash on powernv OPAL
+	  platforms from Linux. This device abstracts away the
+	  firmware interface for flash access.
+
 comment "Disk-On-Chip Device Drivers"
 
 config MTD_DOCG3
diff --git a/drivers/mtd/devices/Makefile b/drivers/mtd/devices/Makefile
index f0b0e61..7912d3a 100644
--- a/drivers/mtd/devices/Makefile
+++ b/drivers/mtd/devices/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_MTD_SPEAR_SMI)	+= spear_smi.o
 obj-$(CONFIG_MTD_SST25L)	+= sst25l.o
 obj-$(CONFIG_MTD_BCM47XXSFLASH)	+= bcm47xxsflash.o
 obj-$(CONFIG_MTD_ST_SPI_FSM)    += st_spi_fsm.o
+obj-$(CONFIG_MTD_POWERNV_FLASH)	+= powernv_flash.o
 
 
 CFLAGS_docg3.o			+= -I$(src)
diff --git a/drivers/mtd/devices/powernv_flash.c b/drivers/mtd/devices/powernv_flash.c
new file mode 100644
index 0000000..d5b870b
--- /dev/null
+++ b/drivers/mtd/devices/powernv_flash.c
@@ -0,0 +1,285 @@
+/*
+ * OPAL PNOR flash MTD abstraction
+ *
+ * Copyright IBM 2015
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/platform_device.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/partitions.h>
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <asm/opal.h>
+
+
+/*
+ * This driver creates the a Linux MTD abstraction for platform PNOR flash
+ * backed by OPAL calls
+ */
+
+struct powernv_flash {
+	struct mtd_info	mtd;
+	u32 id;
+};
+
+enum flash_op {
+	FLASH_OP_READ,
+	FLASH_OP_WRITE,
+	FLASH_OP_ERASE,
+};
+
+static int powernv_flash_async_op(struct mtd_info *mtd, enum flash_op op,
+		loff_t offset, size_t len, size_t *retlen, u_char *buf)
+{
+	struct powernv_flash *info = (struct powernv_flash *)mtd->priv;
+	struct device *dev = &mtd->dev;
+	int token;
+	struct opal_msg msg;
+	int rc;
+
+	dev_dbg(dev, "%s(op=%d, offset=0x%llx, len=%zu)\n",
+			__func__, op, offset, len);
+
+	token = opal_async_get_token_interruptible();
+	if (token < 0) {
+		if (token != -ERESTARTSYS)
+			dev_err(dev, "Failed to get an async token\n");
+
+		return token;
+	}
+
+	switch (op) {
+	case FLASH_OP_READ:
+		rc = opal_flash_read(info->id, offset, __pa(buf), len, token);
+		break;
+	case FLASH_OP_WRITE:
+		rc = opal_flash_write(info->id, offset, __pa(buf), len, token);
+		break;
+	case FLASH_OP_ERASE:
+		rc = opal_flash_erase(info->id, offset, len, token);
+		break;
+	default:
+		BUG_ON(1);
+	}
+
+	if (rc != OPAL_ASYNC_COMPLETION) {
+		dev_err(dev, "opal_flash_async_op(op=%d) failed (rc %d)\n",
+				op, rc);
+		opal_async_release_token(token);
+		return -EIO;
+	}
+
+	rc = opal_async_wait_response(token, &msg);
+	opal_async_release_token(token);
+	if (rc) {
+		dev_err(dev, "opal async wait failed (rc %d)\n", rc);
+		return -EIO;
+	}
+
+	rc = be64_to_cpu(msg.params[1]);
+	if (rc == OPAL_SUCCESS) {
+		rc = 0;
+		if (retlen)
+			*retlen = len;
+	} else {
+		rc = -EIO;
+	}
+
+	return rc;
+}
+
+/**
+ * @mtd: the device
+ * @from: the offset to read from
+ * @len: the number of bytes to read
+ * @retlen: the number of bytes actually read
+ * @buf: the filled in buffer
+ *
+ * Returns 0 if read successful, or -ERRNO if an error occurred
+ */
+static int powernv_flash_read(struct mtd_info *mtd, loff_t from, size_t len,
+	     size_t *retlen, u_char *buf)
+{
+	return powernv_flash_async_op(mtd, FLASH_OP_READ, from,
+			len, retlen, buf);
+}
+
+/**
+ * @mtd: the device
+ * @to: the offset to write to
+ * @len: the number of bytes to write
+ * @retlen: the number of bytes actually written
+ * @buf: the buffer to get bytes from
+ *
+ * Returns 0 if write successful, -ERRNO if error occurred
+ */
+static int powernv_flash_write(struct mtd_info *mtd, loff_t to, size_t len,
+		     size_t *retlen, const u_char *buf)
+{
+	return powernv_flash_async_op(mtd, FLASH_OP_WRITE, to,
+			len, retlen, (u_char *)buf);
+}
+
+/**
+ * @mtd: the device
+ * @erase: the erase info
+ * Returns 0 if erase successful or -ERRNO if an error occurred
+ */
+static int powernv_flash_erase(struct mtd_info *mtd, struct erase_info *erase)
+{
+	int rc;
+
+	erase->state = MTD_ERASING;
+
+	/* todo: register our own notifier to do a true async implementation */
+	rc =  powernv_flash_async_op(mtd, FLASH_OP_ERASE, erase->addr,
+			erase->len, NULL, NULL);
+
+	if (rc) {
+		erase->fail_addr = erase->addr;
+		erase->state = MTD_ERASE_FAILED;
+	} else {
+		erase->state = MTD_ERASE_DONE;
+	}
+	mtd_erase_callback(erase);
+	return rc;
+}
+
+/**
+ * powernv_flash_set_driver_info - Fill the mtd_info structure and docg3
+ * structure @pdev: The platform device
+ * @mtd: The structure to fill
+ */
+static int powernv_flash_set_driver_info(struct device *dev,
+		struct mtd_info *mtd)
+{
+	u64 size;
+	u32 erase_size;
+	int rc;
+
+	rc = of_property_read_u32(dev->of_node, "ibm,flash-block-size",
+			&erase_size);
+	if (rc) {
+		dev_err(dev, "couldn't get resource block size information\n");
+		return rc;
+	}
+
+	rc = of_property_read_u64(dev->of_node, "reg", &size);
+	if (rc) {
+		dev_err(dev, "couldn't get resource size information\n");
+		return rc;
+	}
+
+	/*
+	 * Going to have to check what details I need to set and how to
+	 * get them
+	 */
+	mtd->name = of_get_property(dev->of_node, "name", NULL);
+	mtd->type = MTD_NORFLASH;
+	mtd->flags = MTD_WRITEABLE;
+	mtd->size = size;
+	mtd->erasesize = erase_size;
+	mtd->writebufsize = mtd->writesize = 1;
+	mtd->owner = THIS_MODULE;
+	mtd->_erase = powernv_flash_erase;
+	mtd->_read = powernv_flash_read;
+	mtd->_write = powernv_flash_write;
+	mtd->dev.parent = dev;
+	return 0;
+}
+
+/**
+ * powernv_flash_probe
+ * @pdev: platform device
+ *
+ * Returns 0 on success, -ENOMEM, -ENXIO on error
+ */
+static int powernv_flash_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct powernv_flash *data;
+	int ret;
+
+	data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
+	if (!data) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	data->mtd.priv = data;
+
+	ret = of_property_read_u32(dev->of_node, "ibm,opal-id", &(data->id));
+	if (ret) {
+		dev_err(dev, "no device property 'ibm,opal-id'\n");
+		goto out;
+	}
+
+	ret = powernv_flash_set_driver_info(dev, &data->mtd);
+	if (ret)
+		goto out;
+
+	dev_set_drvdata(dev, data);
+
+	/*
+	 * The current flash that skiboot exposes is one contiguous flash chip
+	 * with an ffs partition at the start, it should prove easier for users
+	 * to deal with partitions or not as they see fit
+	 */
+	ret = mtd_device_register(&data->mtd, NULL, 0);
+
+out:
+	return ret;
+}
+
+/**
+ * op_release - Release the driver
+ * @pdev: the platform device
+ *
+ * Returns 0
+ */
+static int powernv_flash_release(struct platform_device *pdev)
+{
+	struct powernv_flash *data = dev_get_drvdata(&(pdev->dev));
+
+	/* All resources should be freed automatically */
+	return mtd_device_unregister(&(data->mtd));
+}
+
+static const struct of_device_id powernv_flash_match[] = {
+	{ .compatible = "ibm,opal-flash" },
+	{}
+};
+
+static struct platform_driver powernv_flash_driver = {
+	.driver		= {
+		.name		= "powernv_flash",
+		.of_match_table	= powernv_flash_match,
+	},
+	.remove		= powernv_flash_release,
+	.probe		= powernv_flash_probe,
+};
+
+module_platform_driver(powernv_flash_driver);
+
+MODULE_DEVICE_TABLE(of, powernv_flash_match);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Cyril Bur <cyril.bur@au1.ibm.com>");
+MODULE_DESCRIPTION("MTD abstraction for OPAL flash");
-- 
cgit v0.10.2


From ea30e99e8eccb684490f40d011ea534ecd937e98 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:34:53 +1000
Subject: powerpc/eeh/ioda2: Use device::iommu_group to check IOMMU group

This relies on the fact that a PCI device always has an IOMMU table
which may not be the case when we get dynamic DMA windows so
let's use more reliable check for IOMMU group here.

As we do not rely on the table presence here, remove the workaround
from pnv_pci_ioda2_set_bypass(); also remove the @add_to_iommu_group
parameter from pnv_ioda_setup_bus_dma().

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Acked-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 51dcdf6..af9b597 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1410,13 +1410,11 @@ static int dev_has_iommu_table(struct device *dev, void *data)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct pci_dev **ppdev = data;
-	struct iommu_table *tbl;
 
 	if (!dev)
 		return 0;
 
-	tbl = get_iommu_table_base(dev);
-	if (tbl && tbl->it_group) {
+	if (dev->iommu_group) {
 		*ppdev = pdev;
 		return 1;
 	}
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index b284185..5491d2f 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1655,21 +1655,15 @@ static u64 pnv_pci_ioda_dma_get_required_mask(struct pnv_phb *phb,
 }
 
 static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
-				   struct pci_bus *bus,
-				   bool add_to_iommu_group)
+				   struct pci_bus *bus)
 {
 	struct pci_dev *dev;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
-		if (add_to_iommu_group)
-			set_iommu_table_base_and_group(&dev->dev,
-						       pe->tce32_table);
-		else
-			set_iommu_table_base(&dev->dev, pe->tce32_table);
+		set_iommu_table_base_and_group(&dev->dev, pe->tce32_table);
 
 		if (dev->subordinate)
-			pnv_ioda_setup_bus_dma(pe, dev->subordinate,
-					       add_to_iommu_group);
+			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
 	}
 }
 
@@ -1846,7 +1840,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
 		iommu_register_group(tbl, phb->hose->global_number,
 				     pe->pe_number);
-		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+		pnv_ioda_setup_bus_dma(pe, pe->pbus);
 	} else if (pe->flags & PNV_IODA_PE_VF) {
 		iommu_register_group(tbl, phb->hose->global_number,
 				     pe->pe_number);
@@ -1883,17 +1877,6 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
 						     window_id,
 						     pe->tce_bypass_base,
 						     0);
-
-		/*
-		 * EEH needs the mapping between IOMMU table and group
-		 * of those VFIO/KVM pass-through devices. We can postpone
-		 * resetting DMA ops until the DMA mask is configured in
-		 * host side.
-		 */
-		if (pe->pdev)
-			set_iommu_table_base(&pe->pdev->dev, tbl);
-		else
-			pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
 	}
 	if (rc)
 		pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
@@ -1985,7 +1968,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
 		iommu_register_group(tbl, phb->hose->global_number,
 				     pe->pe_number);
-		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+		pnv_ioda_setup_bus_dma(pe, pe->pbus);
 	} else if (pe->flags & PNV_IODA_PE_VF) {
 		iommu_register_group(tbl, phb->hose->global_number,
 				     pe->pe_number);
-- 
cgit v0.10.2


From 4617082ec049d92a797e8be0b4ba2ba6b59d90d1 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:34:54 +1000
Subject: powerpc/iommu/powernv: Get rid of set_iommu_table_base_and_group

The set_iommu_table_base_and_group() name suggests that the function
sets table base and add a device to an IOMMU group.

The actual purpose for table base setting is to put some reference
into a device so later iommu_add_device() can get the IOMMU group
reference and the device to the group.

At the moment a group cannot be explicitly passed to iommu_add_device()
as we want it to work from the bus notifier, we can fix it later and
remove confusing calls of set_iommu_table_base().

This replaces set_iommu_table_base_and_group() with a couple of
set_iommu_table_base() + iommu_add_device() which makes reading the code
easier.

This adds few comments why set_iommu_table_base() and iommu_add_device()
are called where they are called.

For IODA1/2, this essentially removes iommu_add_device() call from
the pnv_pci_ioda_dma_dev_setup() as it will always fail at this particular
place:
- for physical PE, the device is already attached by iommu_add_device()
in pnv_pci_ioda_setup_dma_pe();
- for virtual PE, the sysfs entries are not ready to create all symlinks
so actual adding is happening in tce_iommu_bus_notifier.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 1e27d63..8353c86 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -140,13 +140,6 @@ static inline int __init tce_iommu_bus_notifier_init(void)
 }
 #endif /* !CONFIG_IOMMU_API */
 
-static inline void set_iommu_table_base_and_group(struct device *dev,
-						  void *base)
-{
-	set_iommu_table_base(dev, base);
-	iommu_add_device(dev);
-}
-
 extern int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 			    struct scatterlist *sglist, int nelems,
 			    unsigned long mask,
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 5491d2f..fd594b2 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1598,7 +1598,13 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
 
 	pe = &phb->ioda.pe_array[pdn->pe_number];
 	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
-	set_iommu_table_base_and_group(&pdev->dev, pe->tce32_table);
+	set_iommu_table_base(&pdev->dev, pe->tce32_table);
+	/*
+	 * Note: iommu_add_device() will fail here as
+	 * for physical PE: the device is already added by now;
+	 * for virtual PE: sysfs entries are not ready yet and
+	 * tce_iommu_bus_notifier will add the device to a group later.
+	 */
 }
 
 static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
@@ -1660,7 +1666,8 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
 	struct pci_dev *dev;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
-		set_iommu_table_base_and_group(&dev->dev, pe->tce32_table);
+		set_iommu_table_base(&dev->dev, pe->tce32_table);
+		iommu_add_device(&dev->dev);
 
 		if (dev->subordinate)
 			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
@@ -1836,7 +1843,13 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	if (pe->flags & PNV_IODA_PE_DEV) {
 		iommu_register_group(tbl, phb->hose->global_number,
 				     pe->pe_number);
-		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
+		/*
+		 * Setting table base here only for carrying iommu_group
+		 * further down to let iommu_add_device() do the job.
+		 * pnv_pci_ioda_dma_dev_setup will override it later anyway.
+		 */
+		set_iommu_table_base(&pe->pdev->dev, tbl);
+		iommu_add_device(&pe->pdev->dev);
 	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
 		iommu_register_group(tbl, phb->hose->global_number,
 				     pe->pe_number);
@@ -1964,7 +1977,13 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	if (pe->flags & PNV_IODA_PE_DEV) {
 		iommu_register_group(tbl, phb->hose->global_number,
 				     pe->pe_number);
-		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
+		/*
+		 * Setting table base here only for carrying iommu_group
+		 * further down to let iommu_add_device() do the job.
+		 * pnv_pci_ioda_dma_dev_setup will override it later anyway.
+		 */
+		set_iommu_table_base(&pe->pdev->dev, tbl);
+		iommu_add_device(&pe->pdev->dev);
 	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
 		iommu_register_group(tbl, phb->hose->global_number,
 				     pe->pe_number);
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
index 7f826e8..61ae20a 100644
--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -92,7 +92,8 @@ static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
 				pci_domain_nr(phb->hose->bus), phb->opal_id);
 	}
 
-	set_iommu_table_base_and_group(&pdev->dev, &phb->p5ioc2.iommu_table);
+	set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table);
+	iommu_add_device(&pdev->dev);
 }
 
 static const struct pci_controller_ops pnv_pci_p5ioc2_controller_ops = {
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 61d5a17..05ab06d 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -688,8 +688,8 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 		iommu_table_setparms(phb, dn, tbl);
 		PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
 		iommu_register_group(tbl, pci_domain_nr(phb->bus), 0);
-		set_iommu_table_base_and_group(&dev->dev,
-					       PCI_DN(dn)->iommu_table);
+		set_iommu_table_base(&dev->dev, tbl);
+		iommu_add_device(&dev->dev);
 		return;
 	}
 
@@ -700,10 +700,10 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 	while (dn && PCI_DN(dn) && PCI_DN(dn)->iommu_table == NULL)
 		dn = dn->parent;
 
-	if (dn && PCI_DN(dn))
-		set_iommu_table_base_and_group(&dev->dev,
-					       PCI_DN(dn)->iommu_table);
-	else
+	if (dn && PCI_DN(dn)) {
+		set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
+		iommu_add_device(&dev->dev);
+	} else
 		printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
 		       pci_name(dev));
 }
@@ -1115,7 +1115,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 		pr_debug("  found DMA window, table: %p\n", pci->iommu_table);
 	}
 
-	set_iommu_table_base_and_group(&dev->dev, pci->iommu_table);
+	set_iommu_table_base(&dev->dev, pci->iommu_table);
+	iommu_add_device(&dev->dev);
 }
 
 static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
-- 
cgit v0.10.2


From c5773822c000bafd462ad02db1d61459acef0d52 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:34:55 +1000
Subject: powerpc/powernv/ioda: Clean up IOMMU group registration

The existing code has 3 calls to iommu_register_group() and
all 3 branches actually cover all possible cases.

This replaces 3 calls with one and moves the registration earlier;
the latter will make more sense when we add TCE table sharing.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index fd594b2..2b829eb 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1785,6 +1785,9 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	if (WARN_ON(pe->tce32_seg >= 0))
 		return;
 
+	tbl = pe->tce32_table;
+	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
+
 	/* Grab a 32-bit TCE table */
 	pe->tce32_seg = base;
 	pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
@@ -1819,7 +1822,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	}
 
 	/* Setup linux iommu table */
-	tbl = pe->tce32_table;
 	pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
 				  base << 28, IOMMU_PAGE_SHIFT_4K);
 
@@ -1841,8 +1843,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	iommu_init_table(tbl, phb->hose->node);
 
 	if (pe->flags & PNV_IODA_PE_DEV) {
-		iommu_register_group(tbl, phb->hose->global_number,
-				     pe->pe_number);
 		/*
 		 * Setting table base here only for carrying iommu_group
 		 * further down to let iommu_add_device() do the job.
@@ -1850,14 +1850,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 		 */
 		set_iommu_table_base(&pe->pdev->dev, tbl);
 		iommu_add_device(&pe->pdev->dev);
-	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
-		iommu_register_group(tbl, phb->hose->global_number,
-				     pe->pe_number);
+	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
 		pnv_ioda_setup_bus_dma(pe, pe->pbus);
-	} else if (pe->flags & PNV_IODA_PE_VF) {
-		iommu_register_group(tbl, phb->hose->global_number,
-				     pe->pe_number);
-	}
 
 	return;
  fail:
@@ -1924,6 +1918,9 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	if (WARN_ON(pe->tce32_seg >= 0))
 		return;
 
+	tbl = pe->tce32_table;
+	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
+
 	/* The PE will reserve all possible 32-bits space */
 	pe->tce32_seg = 0;
 	end = (1 << ilog2(phb->ioda.m32_pci_base));
@@ -1955,7 +1952,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	}
 
 	/* Setup linux iommu table */
-	tbl = pe->tce32_table;
 	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
 			IOMMU_PAGE_SHIFT_4K);
 
@@ -1975,8 +1971,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	iommu_init_table(tbl, phb->hose->node);
 
 	if (pe->flags & PNV_IODA_PE_DEV) {
-		iommu_register_group(tbl, phb->hose->global_number,
-				     pe->pe_number);
 		/*
 		 * Setting table base here only for carrying iommu_group
 		 * further down to let iommu_add_device() do the job.
@@ -1984,14 +1978,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 		 */
 		set_iommu_table_base(&pe->pdev->dev, tbl);
 		iommu_add_device(&pe->pdev->dev);
-	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
-		iommu_register_group(tbl, phb->hose->global_number,
-				     pe->pe_number);
+	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
 		pnv_ioda_setup_bus_dma(pe, pe->pbus);
-	} else if (pe->flags & PNV_IODA_PE_VF) {
-		iommu_register_group(tbl, phb->hose->global_number,
-				     pe->pe_number);
-	}
 
 	/* Also create a bypass window */
 	if (!pnv_iommu_bypass_disabled)
-- 
cgit v0.10.2


From ac9a58891a965216e9cb549a0a2012b97e3abcce Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:34:56 +1000
Subject: powerpc/iommu: Put IOMMU group explicitly

So far an iommu_table lifetime was the same as PE. Dynamic DMA windows
will change this and iommu_free_table() will not always require
the group to be released.

This moves iommu_group_put() out of iommu_free_table().

This adds a iommu_pseries_free_table() helper which does
iommu_group_put() and iommu_free_table(). Later it will be
changed to receive a table_group and we will have to change less
lines then.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index b054f33..3d47eb3 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -726,13 +726,6 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 	if (tbl->it_offset == 0)
 		clear_bit(0, tbl->it_map);
 
-#ifdef CONFIG_IOMMU_API
-	if (tbl->it_group) {
-		iommu_group_put(tbl->it_group);
-		BUG_ON(tbl->it_group);
-	}
-#endif
-
 	/* verify that table contains no entries */
 	if (!bitmap_empty(tbl->it_map, tbl->it_size))
 		pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 2b829eb..8070dc9 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -23,6 +23,7 @@
 #include <linux/io.h>
 #include <linux/msi.h>
 #include <linux/memblock.h>
+#include <linux/iommu.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -1310,6 +1311,10 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 	if (rc)
 		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
 
+	if (tbl->it_group) {
+		iommu_group_put(tbl->it_group);
+		BUG_ON(tbl->it_group);
+	}
 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
 	free_pages(addr, get_order(TCE32_TABLE_SIZE));
 	pe->tce32_table = NULL;
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 05ab06d..fe5117b 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -36,6 +36,7 @@
 #include <linux/crash_dump.h>
 #include <linux/memory.h>
 #include <linux/of.h>
+#include <linux/iommu.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/rtas.h>
@@ -51,6 +52,18 @@
 
 #include "pseries.h"
 
+static void iommu_pseries_free_table(struct iommu_table *tbl,
+		const char *node_name)
+{
+#ifdef CONFIG_IOMMU_API
+	if (tbl->it_group) {
+		iommu_group_put(tbl->it_group);
+		BUG_ON(tbl->it_group);
+	}
+#endif
+	iommu_free_table(tbl, node_name);
+}
+
 static void tce_invalidate_pSeries_sw(struct iommu_table *tbl,
 				      __be64 *startp, __be64 *endp)
 {
@@ -1271,7 +1284,8 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
 		 */
 		remove_ddw(np, false);
 		if (pci && pci->iommu_table)
-			iommu_free_table(pci->iommu_table, np->full_name);
+			iommu_pseries_free_table(pci->iommu_table,
+					np->full_name);
 
 		spin_lock(&direct_window_list_lock);
 		list_for_each_entry(window, &direct_window_list, list) {
-- 
cgit v0.10.2


From 8aca92d82d9f259e0b6b0c81e5eb9baaf471e6ad Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:34:57 +1000
Subject: powerpc/iommu: Always release iommu_table in iommu_free_table()

At the moment iommu_free_table() only releases memory if
the table was initialized for the platform code use, i.e. it had
it_map initialized (which purpose is to track DMA memory space use).

With dynamic DMA windows, we will need to be able to release
iommu_table even if it was used for VFIO in which case it_map is NULL
so does the patch.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 3d47eb3..73eb39a 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -713,9 +713,11 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 	unsigned long bitmap_sz;
 	unsigned int order;
 
-	if (!tbl || !tbl->it_map) {
-		printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
-				node_name);
+	if (!tbl)
+		return;
+
+	if (!tbl->it_map) {
+		kfree(tbl);
 		return;
 	}
 
-- 
cgit v0.10.2


From 9b14a1ff8657d3ee844f8987482bc367a716848c Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:34:58 +1000
Subject: vfio: powerpc/spapr: Move page pinning from arch code to VFIO IOMMU
 driver

This moves page pinning (get_user_pages_fast()/put_page()) code out of
the platform IOMMU code and puts it to VFIO IOMMU driver where it belongs
to as the platform code does not deal with page pinning.

This makes iommu_take_ownership()/iommu_release_ownership() deal with
the IOMMU table bitmap only.

This removes page unpinning from iommu_take_ownership() as the actual
TCE table might contain garbage and doing put_page() on it is undefined
behaviour.

Besides the last part, the rest of the patch is mechanical.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 8353c86..e94a5e3 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -194,10 +194,6 @@ extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
 		unsigned long hwaddr, enum dma_data_direction direction);
 extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
 		unsigned long entry);
-extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
-		unsigned long entry, unsigned long pages);
-extern int iommu_put_tce_user_mode(struct iommu_table *tbl,
-		unsigned long entry, unsigned long tce);
 
 extern void iommu_flush_tce(struct iommu_table *tbl);
 extern int iommu_take_ownership(struct iommu_table *tbl);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 73eb39a..0019c80 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -986,30 +986,6 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
 }
 EXPORT_SYMBOL_GPL(iommu_clear_tce);
 
-int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
-		unsigned long entry, unsigned long pages)
-{
-	unsigned long oldtce;
-	struct page *page;
-
-	for ( ; pages; --pages, ++entry) {
-		oldtce = iommu_clear_tce(tbl, entry);
-		if (!oldtce)
-			continue;
-
-		page = pfn_to_page(oldtce >> PAGE_SHIFT);
-		WARN_ON(!page);
-		if (page) {
-			if (oldtce & TCE_PCI_WRITE)
-				SetPageDirty(page);
-			put_page(page);
-		}
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages);
-
 /*
  * hwaddr is a kernel virtual address here (0xc... bazillion),
  * tce_build converts it to a physical address.
@@ -1039,35 +1015,6 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
 }
 EXPORT_SYMBOL_GPL(iommu_tce_build);
 
-int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
-		unsigned long tce)
-{
-	int ret;
-	struct page *page = NULL;
-	unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
-	enum dma_data_direction direction = iommu_tce_direction(tce);
-
-	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
-			direction != DMA_TO_DEVICE, &page);
-	if (unlikely(ret != 1)) {
-		/* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
-				tce, entry << tbl->it_page_shift, ret); */
-		return -EFAULT;
-	}
-	hwaddr = (unsigned long) page_address(page) + offset;
-
-	ret = iommu_tce_build(tbl, entry, hwaddr, direction);
-	if (ret)
-		put_page(page);
-
-	if (ret < 0)
-		pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
-			__func__, entry << tbl->it_page_shift, tce, ret);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode);
-
 int iommu_take_ownership(struct iommu_table *tbl)
 {
 	unsigned long sz = (tbl->it_size + 7) >> 3;
@@ -1081,7 +1028,6 @@ int iommu_take_ownership(struct iommu_table *tbl)
 	}
 
 	memset(tbl->it_map, 0xff, sz);
-	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
 
 	/*
 	 * Disable iommu bypass, otherwise the user can DMA to all of
@@ -1099,7 +1045,6 @@ void iommu_release_ownership(struct iommu_table *tbl)
 {
 	unsigned long sz = (tbl->it_size + 7) >> 3;
 
-	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
 	memset(tbl->it_map, 0, sz);
 
 	/* Restore bit#0 set by iommu_init_table() */
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 730b4ef..b95fa2b 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -147,6 +147,67 @@ static void tce_iommu_release(void *iommu_data)
 	kfree(container);
 }
 
+static int tce_iommu_clear(struct tce_container *container,
+		struct iommu_table *tbl,
+		unsigned long entry, unsigned long pages)
+{
+	unsigned long oldtce;
+	struct page *page;
+
+	for ( ; pages; --pages, ++entry) {
+		oldtce = iommu_clear_tce(tbl, entry);
+		if (!oldtce)
+			continue;
+
+		page = pfn_to_page(oldtce >> PAGE_SHIFT);
+		WARN_ON(!page);
+		if (page) {
+			if (oldtce & TCE_PCI_WRITE)
+				SetPageDirty(page);
+			put_page(page);
+		}
+	}
+
+	return 0;
+}
+
+static long tce_iommu_build(struct tce_container *container,
+		struct iommu_table *tbl,
+		unsigned long entry, unsigned long tce, unsigned long pages)
+{
+	long i, ret = 0;
+	struct page *page = NULL;
+	unsigned long hva;
+	enum dma_data_direction direction = iommu_tce_direction(tce);
+
+	for (i = 0; i < pages; ++i) {
+		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
+
+		ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+				direction != DMA_TO_DEVICE, &page);
+		if (unlikely(ret != 1)) {
+			ret = -EFAULT;
+			break;
+		}
+		hva = (unsigned long) page_address(page) + offset;
+
+		ret = iommu_tce_build(tbl, entry + i, hva, direction);
+		if (ret) {
+			put_page(page);
+			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
+					__func__, entry << tbl->it_page_shift,
+					tce, ret);
+			break;
+		}
+		tce += IOMMU_PAGE_SIZE_4K;
+	}
+
+	if (ret)
+		tce_iommu_clear(container, tbl, entry, i);
+
+	return ret;
+}
+
 static long tce_iommu_ioctl(void *iommu_data,
 				 unsigned int cmd, unsigned long arg)
 {
@@ -195,7 +256,7 @@ static long tce_iommu_ioctl(void *iommu_data,
 	case VFIO_IOMMU_MAP_DMA: {
 		struct vfio_iommu_type1_dma_map param;
 		struct iommu_table *tbl = container->tbl;
-		unsigned long tce, i;
+		unsigned long tce;
 
 		if (!tbl)
 			return -ENXIO;
@@ -229,17 +290,9 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (ret)
 			return ret;
 
-		for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT_4K); ++i) {
-			ret = iommu_put_tce_user_mode(tbl,
-					(param.iova >> IOMMU_PAGE_SHIFT_4K) + i,
-					tce);
-			if (ret)
-				break;
-			tce += IOMMU_PAGE_SIZE_4K;
-		}
-		if (ret)
-			iommu_clear_tces_and_put_pages(tbl,
-					param.iova >> IOMMU_PAGE_SHIFT_4K, i);
+		ret = tce_iommu_build(container, tbl,
+				param.iova >> IOMMU_PAGE_SHIFT_4K,
+				tce, param.size >> IOMMU_PAGE_SHIFT_4K);
 
 		iommu_flush_tce(tbl);
 
@@ -273,7 +326,7 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (ret)
 			return ret;
 
-		ret = iommu_clear_tces_and_put_pages(tbl,
+		ret = tce_iommu_clear(container, tbl,
 				param.iova >> IOMMU_PAGE_SHIFT_4K,
 				param.size >> IOMMU_PAGE_SHIFT_4K);
 		iommu_flush_tce(tbl);
@@ -357,6 +410,7 @@ static void tce_iommu_detach_group(void *iommu_data,
 		/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
 				iommu_group_id(iommu_group), iommu_group); */
 		container->tbl = NULL;
+		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 		iommu_release_ownership(tbl);
 	}
 	mutex_unlock(&container->lock);
-- 
cgit v0.10.2


From e432bc7e15d80288aff0bcccb2ff0f9ae572abdd Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:34:59 +1000
Subject: vfio: powerpc/spapr: Check that IOMMU page is fully contained by
 system page

This checks that the TCE table page size is not bigger that the size of
a page we just pinned and going to put its physical address to the table.

Otherwise the hardware gets unwanted access to physical memory between
the end of the actual page and the end of the aligned up TCE page.

Since compound_order() and compound_head() work correctly on non-huge
pages, there is no need for additional check whether the page is huge.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index b95fa2b..735b308 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -47,6 +47,16 @@ struct tce_container {
 	bool enabled;
 };
 
+static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+{
+	/*
+	 * Check that the TCE table granularity is not bigger than the size of
+	 * a page we just found. Otherwise the hardware can get access to
+	 * a bigger memory chunk that it should.
+	 */
+	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
+}
+
 static int tce_iommu_enable(struct tce_container *container)
 {
 	int ret = 0;
@@ -189,6 +199,12 @@ static long tce_iommu_build(struct tce_container *container,
 			ret = -EFAULT;
 			break;
 		}
+
+		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
+			ret = -EPERM;
+			break;
+		}
+
 		hva = (unsigned long) page_address(page) + offset;
 
 		ret = iommu_tce_build(tbl, entry + i, hva, direction);
-- 
cgit v0.10.2


From 00663d4ee05dafe97033003fd5479cf9efd4bf96 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:00 +1000
Subject: vfio: powerpc/spapr: Use it_page_size

This makes use of the it_page_size from the iommu_table struct
as page size can differ.

This replaces missing IOMMU_PAGE_SHIFT macro in commented debug code
as recently introduced IOMMU_PAGE_XXX macros do not include
IOMMU_PAGE_SHIFT.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 735b308..64300cc 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -91,7 +91,7 @@ static int tce_iommu_enable(struct tce_container *container)
 	 * enforcing the limit based on the max that the guest can map.
 	 */
 	down_write(&current->mm->mmap_sem);
-	npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
+	npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
 	locked = current->mm->locked_vm + npages;
 	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
@@ -120,7 +120,7 @@ static void tce_iommu_disable(struct tce_container *container)
 
 	down_write(&current->mm->mmap_sem);
 	current->mm->locked_vm -= (container->tbl->it_size <<
-			IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
+			container->tbl->it_page_shift) >> PAGE_SHIFT;
 	up_write(&current->mm->mmap_sem);
 }
 
@@ -215,7 +215,7 @@ static long tce_iommu_build(struct tce_container *container,
 					tce, ret);
 			break;
 		}
-		tce += IOMMU_PAGE_SIZE_4K;
+		tce += IOMMU_PAGE_SIZE(tbl);
 	}
 
 	if (ret)
@@ -260,8 +260,8 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (info.argsz < minsz)
 			return -EINVAL;
 
-		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT_4K;
-		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT_4K;
+		info.dma32_window_start = tbl->it_offset << tbl->it_page_shift;
+		info.dma32_window_size = tbl->it_size << tbl->it_page_shift;
 		info.flags = 0;
 
 		if (copy_to_user((void __user *)arg, &info, minsz))
@@ -291,8 +291,8 @@ static long tce_iommu_ioctl(void *iommu_data,
 				VFIO_DMA_MAP_FLAG_WRITE))
 			return -EINVAL;
 
-		if ((param.size & ~IOMMU_PAGE_MASK_4K) ||
-				(param.vaddr & ~IOMMU_PAGE_MASK_4K))
+		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
+				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
 			return -EINVAL;
 
 		/* iova is checked by the IOMMU API */
@@ -307,8 +307,8 @@ static long tce_iommu_ioctl(void *iommu_data,
 			return ret;
 
 		ret = tce_iommu_build(container, tbl,
-				param.iova >> IOMMU_PAGE_SHIFT_4K,
-				tce, param.size >> IOMMU_PAGE_SHIFT_4K);
+				param.iova >> tbl->it_page_shift,
+				tce, param.size >> tbl->it_page_shift);
 
 		iommu_flush_tce(tbl);
 
@@ -334,17 +334,17 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (param.flags)
 			return -EINVAL;
 
-		if (param.size & ~IOMMU_PAGE_MASK_4K)
+		if (param.size & ~IOMMU_PAGE_MASK(tbl))
 			return -EINVAL;
 
 		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
-				param.size >> IOMMU_PAGE_SHIFT_4K);
+				param.size >> tbl->it_page_shift);
 		if (ret)
 			return ret;
 
 		ret = tce_iommu_clear(container, tbl,
-				param.iova >> IOMMU_PAGE_SHIFT_4K,
-				param.size >> IOMMU_PAGE_SHIFT_4K);
+				param.iova >> tbl->it_page_shift,
+				param.size >> tbl->it_page_shift);
 		iommu_flush_tce(tbl);
 
 		return ret;
-- 
cgit v0.10.2


From 2d270df8f71a0fded9ef01cb9282b46fd3bc0986 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:01 +1000
Subject: vfio: powerpc/spapr: Move locked_vm accounting to helpers

There moves locked pages accounting to helpers.
Later they will be reused for Dynamic DMA windows (DDW).

This reworks debug messages to show the current value and the limit.

This stores the locked pages number in the container so when unlocking
the iommu table pointer won't be needed. This does not have an effect
now but it will with the multiple tables per container as then we will
allow attaching/detaching groups on fly and we may end up having
a container with no group attached but with the counter incremented.

While we are here, update the comment explaining why RLIMIT_MEMLOCK
might be required to be bigger than the guest RAM. This also prints
pid of the current process in pr_warn/pr_debug.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 64300cc..6e2e15f 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -29,6 +29,51 @@
 static void tce_iommu_detach_group(void *iommu_data,
 		struct iommu_group *iommu_group);
 
+static long try_increment_locked_vm(long npages)
+{
+	long ret = 0, locked, lock_limit;
+
+	if (!current || !current->mm)
+		return -ESRCH; /* process exited */
+
+	if (!npages)
+		return 0;
+
+	down_write(&current->mm->mmap_sem);
+	locked = current->mm->locked_vm + npages;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+		ret = -ENOMEM;
+	else
+		current->mm->locked_vm += npages;
+
+	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
+			npages << PAGE_SHIFT,
+			current->mm->locked_vm << PAGE_SHIFT,
+			rlimit(RLIMIT_MEMLOCK),
+			ret ? " - exceeded" : "");
+
+	up_write(&current->mm->mmap_sem);
+
+	return ret;
+}
+
+static void decrement_locked_vm(long npages)
+{
+	if (!current || !current->mm || !npages)
+		return; /* process exited */
+
+	down_write(&current->mm->mmap_sem);
+	if (WARN_ON_ONCE(npages > current->mm->locked_vm))
+		npages = current->mm->locked_vm;
+	current->mm->locked_vm -= npages;
+	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
+			npages << PAGE_SHIFT,
+			current->mm->locked_vm << PAGE_SHIFT,
+			rlimit(RLIMIT_MEMLOCK));
+	up_write(&current->mm->mmap_sem);
+}
+
 /*
  * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
  *
@@ -45,6 +90,7 @@ struct tce_container {
 	struct mutex lock;
 	struct iommu_table *tbl;
 	bool enabled;
+	unsigned long locked_pages;
 };
 
 static bool tce_page_is_contained(struct page *page, unsigned page_shift)
@@ -60,7 +106,7 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
 static int tce_iommu_enable(struct tce_container *container)
 {
 	int ret = 0;
-	unsigned long locked, lock_limit, npages;
+	unsigned long locked;
 	struct iommu_table *tbl = container->tbl;
 
 	if (!container->tbl)
@@ -89,21 +135,22 @@ static int tce_iommu_enable(struct tce_container *container)
 	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
 	 * that would effectively kill the guest at random points, much better
 	 * enforcing the limit based on the max that the guest can map.
+	 *
+	 * Unfortunately at the moment it counts whole tables, no matter how
+	 * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
+	 * each with 2GB DMA window, 8GB will be counted here. The reason for
+	 * this is that we cannot tell here the amount of RAM used by the guest
+	 * as this information is only available from KVM and VFIO is
+	 * KVM agnostic.
 	 */
-	down_write(&current->mm->mmap_sem);
-	npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
-	locked = current->mm->locked_vm + npages;
-	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
-		pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
-				rlimit(RLIMIT_MEMLOCK));
-		ret = -ENOMEM;
-	} else {
+	locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
+	ret = try_increment_locked_vm(locked);
+	if (ret)
+		return ret;
 
-		current->mm->locked_vm += npages;
-		container->enabled = true;
-	}
-	up_write(&current->mm->mmap_sem);
+	container->locked_pages = locked;
+
+	container->enabled = true;
 
 	return ret;
 }
@@ -115,13 +162,10 @@ static void tce_iommu_disable(struct tce_container *container)
 
 	container->enabled = false;
 
-	if (!container->tbl || !current->mm)
+	if (!current->mm)
 		return;
 
-	down_write(&current->mm->mmap_sem);
-	current->mm->locked_vm -= (container->tbl->it_size <<
-			container->tbl->it_page_shift) >> PAGE_SHIFT;
-	up_write(&current->mm->mmap_sem);
+	decrement_locked_vm(container->locked_pages);
 }
 
 static void *tce_iommu_open(unsigned long arg)
-- 
cgit v0.10.2


From 3c56e822f8fb0105949d04bda0e549e9d08713cd Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:02 +1000
Subject: vfio: powerpc/spapr: Disable DMA mappings on disabled container

At the moment DMA map/unmap requests are handled irrespective to
the container's state. This allows the user space to pin memory which
it might not be allowed to pin.

This adds checks to MAP/UNMAP that the container is enabled, otherwise
-EPERM is returned.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 6e2e15f..5bbdf37 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -318,6 +318,9 @@ static long tce_iommu_ioctl(void *iommu_data,
 		struct iommu_table *tbl = container->tbl;
 		unsigned long tce;
 
+		if (!container->enabled)
+			return -EPERM;
+
 		if (!tbl)
 			return -ENXIO;
 
@@ -362,6 +365,9 @@ static long tce_iommu_ioctl(void *iommu_data,
 		struct vfio_iommu_type1_dma_unmap param;
 		struct iommu_table *tbl = container->tbl;
 
+		if (!container->enabled)
+			return -EPERM;
+
 		if (WARN_ON(!tbl))
 			return -ENXIO;
 
-- 
cgit v0.10.2


From 649354b75dca1ebcb55d8db41e1d6b59ef69ac77 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:03 +1000
Subject: vfio: powerpc/spapr: Moving pinning/unpinning to helpers

This is a pretty mechanical patch to make next patches simpler.

New tce_iommu_unuse_page() helper does put_page() now but it might skip
that after the memory registering patch applied.

As we are here, this removes unnecessary checks for a value returned
by pfn_to_page() as it cannot possibly return NULL.

This moves tce_iommu_disable() later to let tce_iommu_clear() know if
the container has been enabled because if it has not been, then
put_page() must not be called on TCEs from the TCE table. This situation
is not yet possible but it will after KVM acceleration patchset is
applied.

This changes code to work with physical addresses rather than linear
mapping addresses for better code readability. Following patches will
add an xchg() callback for an IOMMU table which will accept/return
physical addresses (unlike current tce_build()) which will eliminate
redundant conversions.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 5bbdf37..cf5d4a1 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -191,69 +191,90 @@ static void tce_iommu_release(void *iommu_data)
 	struct tce_container *container = iommu_data;
 
 	WARN_ON(container->tbl && !container->tbl->it_group);
-	tce_iommu_disable(container);
 
 	if (container->tbl && container->tbl->it_group)
 		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
 
+	tce_iommu_disable(container);
 	mutex_destroy(&container->lock);
 
 	kfree(container);
 }
 
+static void tce_iommu_unuse_page(struct tce_container *container,
+		unsigned long oldtce)
+{
+	struct page *page;
+
+	if (!(oldtce & (TCE_PCI_READ | TCE_PCI_WRITE)))
+		return;
+
+	page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+	if (oldtce & TCE_PCI_WRITE)
+		SetPageDirty(page);
+
+	put_page(page);
+}
+
 static int tce_iommu_clear(struct tce_container *container,
 		struct iommu_table *tbl,
 		unsigned long entry, unsigned long pages)
 {
 	unsigned long oldtce;
-	struct page *page;
 
 	for ( ; pages; --pages, ++entry) {
 		oldtce = iommu_clear_tce(tbl, entry);
 		if (!oldtce)
 			continue;
 
-		page = pfn_to_page(oldtce >> PAGE_SHIFT);
-		WARN_ON(!page);
-		if (page) {
-			if (oldtce & TCE_PCI_WRITE)
-				SetPageDirty(page);
-			put_page(page);
-		}
+		tce_iommu_unuse_page(container, oldtce);
 	}
 
 	return 0;
 }
 
+static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
+{
+	struct page *page = NULL;
+	enum dma_data_direction direction = iommu_tce_direction(tce);
+
+	if (get_user_pages_fast(tce & PAGE_MASK, 1,
+			direction != DMA_TO_DEVICE, &page) != 1)
+		return -EFAULT;
+
+	*hpa = __pa((unsigned long) page_address(page));
+
+	return 0;
+}
+
 static long tce_iommu_build(struct tce_container *container,
 		struct iommu_table *tbl,
 		unsigned long entry, unsigned long tce, unsigned long pages)
 {
 	long i, ret = 0;
-	struct page *page = NULL;
-	unsigned long hva;
+	struct page *page;
+	unsigned long hpa;
 	enum dma_data_direction direction = iommu_tce_direction(tce);
 
 	for (i = 0; i < pages; ++i) {
 		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 
-		ret = get_user_pages_fast(tce & PAGE_MASK, 1,
-				direction != DMA_TO_DEVICE, &page);
-		if (unlikely(ret != 1)) {
-			ret = -EFAULT;
+		ret = tce_iommu_use_page(tce, &hpa);
+		if (ret)
 			break;
-		}
 
+		page = pfn_to_page(hpa >> PAGE_SHIFT);
 		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
 			ret = -EPERM;
 			break;
 		}
 
-		hva = (unsigned long) page_address(page) + offset;
-
-		ret = iommu_tce_build(tbl, entry + i, hva, direction);
+		hpa |= offset;
+		ret = iommu_tce_build(tbl, entry + i, (unsigned long) __va(hpa),
+				direction);
 		if (ret) {
-			put_page(page);
+			tce_iommu_unuse_page(container, hpa);
 			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
 					__func__, entry << tbl->it_page_shift,
 					tce, ret);
-- 
cgit v0.10.2


From 22af48596e9c92313d475306b684f844301ea4cd Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:04 +1000
Subject: vfio: powerpc/spapr: Rework groups attaching

This is to make extended ownership and multiple groups support patches
simpler for review.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index cf5d4a1..e65bc73 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -460,16 +460,21 @@ static int tce_iommu_attach_group(void *iommu_data,
 				iommu_group_id(container->tbl->it_group),
 				iommu_group_id(iommu_group));
 		ret = -EBUSY;
-	} else if (container->enabled) {
+		goto unlock_exit;
+	}
+
+	if (container->enabled) {
 		pr_err("tce_vfio: attaching group #%u to enabled container\n",
 				iommu_group_id(iommu_group));
 		ret = -EBUSY;
-	} else {
-		ret = iommu_take_ownership(tbl);
-		if (!ret)
-			container->tbl = tbl;
+		goto unlock_exit;
 	}
 
+	ret = iommu_take_ownership(tbl);
+	if (!ret)
+		container->tbl = tbl;
+
+unlock_exit:
 	mutex_unlock(&container->lock);
 
 	return ret;
@@ -487,19 +492,22 @@ static void tce_iommu_detach_group(void *iommu_data,
 		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
 				iommu_group_id(iommu_group),
 				iommu_group_id(tbl->it_group));
-	} else {
-		if (container->enabled) {
-			pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
-					iommu_group_id(tbl->it_group));
-			tce_iommu_disable(container);
-		}
+		goto unlock_exit;
+	}
 
-		/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
-				iommu_group_id(iommu_group), iommu_group); */
-		container->tbl = NULL;
-		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
-		iommu_release_ownership(tbl);
+	if (container->enabled) {
+		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
+				iommu_group_id(tbl->it_group));
+		tce_iommu_disable(container);
 	}
+
+	/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+	   iommu_group_id(iommu_group), iommu_group); */
+	container->tbl = NULL;
+	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+	iommu_release_ownership(tbl);
+
+unlock_exit:
 	mutex_unlock(&container->lock);
 }
 
-- 
cgit v0.10.2


From 10b35b2b7485c342334a48cf199063eed8b8748e Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:05 +1000
Subject: powerpc/powernv: Do not set "read" flag if direction==DMA_NONE

Normally a bitmap from the iommu_table is used to track what TCE entry
is in use. Since we are going to use iommu_table without its locks and
do xchg() instead, it becomes essential not to put bits which are not
implied in the direction flag as the old TCE value (more precisely -
the permission bits) will be used to decide whether to put the page or not.

This adds iommu_direction_to_tce_perm() (its counterpart is there already)
and uses it for powernv's pnv_tce_build().

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index e94a5e3..d91bd69 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -200,6 +200,7 @@ extern int iommu_take_ownership(struct iommu_table *tbl);
 extern void iommu_release_ownership(struct iommu_table *tbl);
 
 extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
+extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir);
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 0019c80..ac2f959 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -866,6 +866,21 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 	}
 }
 
+unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir)
+{
+	switch (dir) {
+	case DMA_BIDIRECTIONAL:
+		return TCE_PCI_READ | TCE_PCI_WRITE;
+	case DMA_FROM_DEVICE:
+		return TCE_PCI_WRITE;
+	case DMA_TO_DEVICE:
+		return TCE_PCI_READ;
+	default:
+		return 0;
+	}
+}
+EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
+
 #ifdef CONFIG_IOMMU_API
 /*
  * SPAPR TCE API
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 3c35487..1477422 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -576,15 +576,10 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 			 unsigned long uaddr, enum dma_data_direction direction,
 			 struct dma_attrs *attrs, bool rm)
 {
-	u64 proto_tce;
+	u64 proto_tce = iommu_direction_to_tce_perm(direction);
 	__be64 *tcep, *tces;
 	u64 rpn;
 
-	proto_tce = TCE_PCI_READ; // Read allowed
-
-	if (direction != DMA_TO_DEVICE)
-		proto_tce |= TCE_PCI_WRITE;
-
 	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
 	rpn = __pa(uaddr) >> tbl->it_page_shift;
 
-- 
cgit v0.10.2


From da004c3600f52e4f05017f60970e5010978006bc Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:06 +1000
Subject: powerpc/iommu: Move tce_xxx callbacks from ppc_md to iommu_table

This adds a iommu_table_ops struct and puts pointer to it into
the iommu_table struct. This moves tce_build/tce_free/tce_get/tce_flush
callbacks from ppc_md to the new struct where they really belong to.

This adds the requirement for @it_ops to be initialized before calling
iommu_init_table() to make sure that we do not leave any IOMMU table
with iommu_table_ops uninitialized. This is not a parameter of
iommu_init_table() though as there will be cases when iommu_init_table()
will not be called on TCE tables, for example - VFIO.

This does s/tce_build/set/, s/tce_free/clear/ and removes "tce_"
redundant prefixes.

This removes tce_xxx_rm handlers from ppc_md but does not add
them to iommu_table_ops as this will be done later if we decide to
support TCE hypercalls in real mode. This removes _vm callbacks as
only virtual mode is supported by now so this also removes @rm parameter.

For pSeries, this always uses tce_buildmulti_pSeriesLP/
tce_buildmulti_pSeriesLP. This changes multi callback to fall back to
tce_build_pSeriesLP/tce_free_pSeriesLP if FW_FEATURE_MULTITCE is not
present. The reason for this is we still have to support "multitce=off"
boot parameter in disable_multitce() and we do not want to walk through
all IOMMU tables in the system and replace "multi" callbacks with single
ones.

For powernv, this defines _ops per PHB type which are P5IOC2/IODA1/IODA2.
This makes the callbacks for them public. Later patches will extend
callbacks for IODA1/2.

No change in behaviour is expected.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index d91bd69..e2a45c3 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -44,6 +44,22 @@
 extern int iommu_is_off;
 extern int iommu_force_on;
 
+struct iommu_table_ops {
+	int (*set)(struct iommu_table *tbl,
+			long index, long npages,
+			unsigned long uaddr,
+			enum dma_data_direction direction,
+			struct dma_attrs *attrs);
+	void (*clear)(struct iommu_table *tbl,
+			long index, long npages);
+	unsigned long (*get)(struct iommu_table *tbl, long index);
+	void (*flush)(struct iommu_table *tbl);
+};
+
+/* These are used by VIO */
+extern struct iommu_table_ops iommu_table_lpar_multi_ops;
+extern struct iommu_table_ops iommu_table_pseries_ops;
+
 /*
  * IOMAP_MAX_ORDER defines the largest contiguous block
  * of dma space we can get.  IOMAP_MAX_ORDER = 13
@@ -78,6 +94,7 @@ struct iommu_table {
 #ifdef CONFIG_IOMMU_API
 	struct iommu_group *it_group;
 #endif
+	struct iommu_table_ops *it_ops;
 	void (*set_bypass)(struct iommu_table *tbl, bool enable);
 #ifdef CONFIG_PPC_POWERNV
 	void           *data;
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 0d387d0..952579f 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -65,31 +65,6 @@ struct machdep_calls {
 	 * destroyed as well */
 	void		(*hpte_clear_all)(void);
 
-	int		(*tce_build)(struct iommu_table *tbl,
-				     long index,
-				     long npages,
-				     unsigned long uaddr,
-				     enum dma_data_direction direction,
-				     struct dma_attrs *attrs);
-	void		(*tce_free)(struct iommu_table *tbl,
-				    long index,
-				    long npages);
-	unsigned long	(*tce_get)(struct iommu_table *tbl,
-				    long index);
-	void		(*tce_flush)(struct iommu_table *tbl);
-
-	/* _rm versions are for real mode use only */
-	int		(*tce_build_rm)(struct iommu_table *tbl,
-				     long index,
-				     long npages,
-				     unsigned long uaddr,
-				     enum dma_data_direction direction,
-				     struct dma_attrs *attrs);
-	void		(*tce_free_rm)(struct iommu_table *tbl,
-				    long index,
-				    long npages);
-	void		(*tce_flush_rm)(struct iommu_table *tbl);
-
 	void __iomem *	(*ioremap)(phys_addr_t addr, unsigned long size,
 				   unsigned long flags, void *caller);
 	void		(*iounmap)(volatile void __iomem *token);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ac2f959..c0e67e9 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -322,11 +322,11 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	ret = entry << tbl->it_page_shift;	/* Set the return dma address */
 
 	/* Put the TCEs in the HW table */
-	build_fail = ppc_md.tce_build(tbl, entry, npages,
+	build_fail = tbl->it_ops->set(tbl, entry, npages,
 				      (unsigned long)page &
 				      IOMMU_PAGE_MASK(tbl), direction, attrs);
 
-	/* ppc_md.tce_build() only returns non-zero for transient errors.
+	/* tbl->it_ops->set() only returns non-zero for transient errors.
 	 * Clean up the table bitmap in this case and return
 	 * DMA_ERROR_CODE. For all other errors the functionality is
 	 * not altered.
@@ -337,8 +337,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	}
 
 	/* Flush/invalidate TLB caches if necessary */
-	if (ppc_md.tce_flush)
-		ppc_md.tce_flush(tbl);
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
 
 	/* Make sure updates are seen by hardware */
 	mb();
@@ -408,7 +408,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	if (!iommu_free_check(tbl, dma_addr, npages))
 		return;
 
-	ppc_md.tce_free(tbl, entry, npages);
+	tbl->it_ops->clear(tbl, entry, npages);
 
 	spin_lock_irqsave(&(pool->lock), flags);
 	bitmap_clear(tbl->it_map, free_entry, npages);
@@ -424,8 +424,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	 * not do an mb() here on purpose, it is not needed on any of
 	 * the current platforms.
 	 */
-	if (ppc_md.tce_flush)
-		ppc_md.tce_flush(tbl);
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
 }
 
 int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
@@ -495,7 +495,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 			    npages, entry, dma_addr);
 
 		/* Insert into HW table */
-		build_fail = ppc_md.tce_build(tbl, entry, npages,
+		build_fail = tbl->it_ops->set(tbl, entry, npages,
 					      vaddr & IOMMU_PAGE_MASK(tbl),
 					      direction, attrs);
 		if(unlikely(build_fail))
@@ -534,8 +534,8 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 	}
 
 	/* Flush/invalidate TLB caches if necessary */
-	if (ppc_md.tce_flush)
-		ppc_md.tce_flush(tbl);
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
 
 	DBG("mapped %d elements:\n", outcount);
 
@@ -600,8 +600,8 @@ void ppc_iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 	 * do not do an mb() here, the affected platforms do not need it
 	 * when freeing.
 	 */
-	if (ppc_md.tce_flush)
-		ppc_md.tce_flush(tbl);
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
 }
 
 static void iommu_table_clear(struct iommu_table *tbl)
@@ -613,17 +613,17 @@ static void iommu_table_clear(struct iommu_table *tbl)
 	 */
 	if (!is_kdump_kernel() || is_fadump_active()) {
 		/* Clear the table in case firmware left allocations in it */
-		ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size);
+		tbl->it_ops->clear(tbl, tbl->it_offset, tbl->it_size);
 		return;
 	}
 
 #ifdef CONFIG_CRASH_DUMP
-	if (ppc_md.tce_get) {
+	if (tbl->it_ops->get) {
 		unsigned long index, tceval, tcecount = 0;
 
 		/* Reserve the existing mappings left by the first kernel. */
 		for (index = 0; index < tbl->it_size; index++) {
-			tceval = ppc_md.tce_get(tbl, index + tbl->it_offset);
+			tceval = tbl->it_ops->get(tbl, index + tbl->it_offset);
 			/*
 			 * Freed TCE entry contains 0x7fffffffffffffff on JS20
 			 */
@@ -657,6 +657,8 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 	unsigned int i;
 	struct iommu_pool *p;
 
+	BUG_ON(!tbl->it_ops);
+
 	/* number of bytes needed for the bitmap */
 	sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
 
@@ -929,8 +931,8 @@ EXPORT_SYMBOL_GPL(iommu_tce_direction);
 void iommu_flush_tce(struct iommu_table *tbl)
 {
 	/* Flush/invalidate TLB caches if necessary */
-	if (ppc_md.tce_flush)
-		ppc_md.tce_flush(tbl);
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
 
 	/* Make sure updates are seen by hardware */
 	mb();
@@ -941,7 +943,7 @@ int iommu_tce_clear_param_check(struct iommu_table *tbl,
 		unsigned long ioba, unsigned long tce_value,
 		unsigned long npages)
 {
-	/* ppc_md.tce_free() does not support any value but 0 */
+	/* tbl->it_ops->clear() does not support any value but 0 */
 	if (tce_value)
 		return -EINVAL;
 
@@ -989,9 +991,9 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
 
 	spin_lock(&(pool->lock));
 
-	oldtce = ppc_md.tce_get(tbl, entry);
+	oldtce = tbl->it_ops->get(tbl, entry);
 	if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
-		ppc_md.tce_free(tbl, entry, 1);
+		tbl->it_ops->clear(tbl, entry, 1);
 	else
 		oldtce = 0;
 
@@ -1014,10 +1016,10 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
 
 	spin_lock(&(pool->lock));
 
-	oldtce = ppc_md.tce_get(tbl, entry);
+	oldtce = tbl->it_ops->get(tbl, entry);
 	/* Add new entry if it is not busy */
 	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
-		ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, direction, NULL);
+		ret = tbl->it_ops->set(tbl, entry, 1, hwaddr, direction, NULL);
 
 	spin_unlock(&(pool->lock));
 
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index 5bfdab90..b41426c 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -1196,6 +1196,11 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
 	tbl->it_type = TCE_VB;
 	tbl->it_blocksize = 16;
 
+	if (firmware_has_feature(FW_FEATURE_LPAR))
+		tbl->it_ops = &iommu_table_lpar_multi_ops;
+	else
+		tbl->it_ops = &iommu_table_pseries_ops;
+
 	return iommu_init_table(tbl, -1);
 }
 
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index 21b5023..14a582b 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -466,6 +466,11 @@ static inline u32 cell_iommu_get_ioid(struct device_node *np)
 	return *ioid;
 }
 
+static struct iommu_table_ops cell_iommu_ops = {
+	.set = tce_build_cell,
+	.clear = tce_free_cell
+};
+
 static struct iommu_window * __init
 cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
 			unsigned long offset, unsigned long size,
@@ -492,6 +497,7 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
 	window->table.it_offset =
 		(offset >> window->table.it_page_shift) + pte_offset;
 	window->table.it_size = size >> window->table.it_page_shift;
+	window->table.it_ops = &cell_iommu_ops;
 
 	iommu_init_table(&window->table, iommu->nid);
 
@@ -1201,8 +1207,6 @@ static int __init cell_iommu_init(void)
 	/* Setup various callbacks */
 	cell_pci_controller_ops.dma_dev_setup = cell_pci_dma_dev_setup;
 	ppc_md.dma_get_required_mask = cell_dma_get_required_mask;
-	ppc_md.tce_build = tce_build_cell;
-	ppc_md.tce_free = tce_free_cell;
 
 	if (!iommu_fixed_disabled && cell_iommu_fixed_mapping_init() == 0)
 		goto bail;
diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
index b8f567b..c929644 100644
--- a/arch/powerpc/platforms/pasemi/iommu.c
+++ b/arch/powerpc/platforms/pasemi/iommu.c
@@ -134,6 +134,10 @@ static void iobmap_free(struct iommu_table *tbl, long index,
 	}
 }
 
+static struct iommu_table_ops iommu_table_iobmap_ops = {
+	.set = iobmap_build,
+	.clear  = iobmap_free
+};
 
 static void iommu_table_iobmap_setup(void)
 {
@@ -153,6 +157,7 @@ static void iommu_table_iobmap_setup(void)
 	 * Should probably be 8 (64 bytes)
 	 */
 	iommu_table_iobmap.it_blocksize = 4;
+	iommu_table_iobmap.it_ops = &iommu_table_iobmap_ops;
 	iommu_init_table(&iommu_table_iobmap, 0);
 	pr_debug(" <- %s\n", __func__);
 }
@@ -252,8 +257,6 @@ void __init iommu_init_early_pasemi(void)
 
 	pasemi_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pasemi;
 	pasemi_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pasemi;
-	ppc_md.tce_build = iobmap_build;
-	ppc_md.tce_free  = iobmap_free;
 	set_pci_dma_ops(&dma_iommu_ops);
 }
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 8070dc9..b0a14c0 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1726,6 +1726,12 @@ static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
 	 */
 }
 
+static struct iommu_table_ops pnv_ioda1_iommu_ops = {
+	.set = pnv_tce_build,
+	.clear = pnv_tce_free,
+	.get = pnv_tce_get,
+};
+
 static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
 					 struct iommu_table *tbl,
 					 __be64 *startp, __be64 *endp, bool rm)
@@ -1770,6 +1776,12 @@ void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
 		pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm);
 }
 
+static struct iommu_table_ops pnv_ioda2_iommu_ops = {
+	.set = pnv_tce_build,
+	.clear = pnv_tce_free,
+	.get = pnv_tce_get,
+};
+
 static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 				      struct pnv_ioda_pe *pe, unsigned int base,
 				      unsigned int segs)
@@ -1845,6 +1857,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 				 TCE_PCI_SWINV_FREE   |
 				 TCE_PCI_SWINV_PAIR);
 	}
+	tbl->it_ops = &pnv_ioda1_iommu_ops;
 	iommu_init_table(tbl, phb->hose->node);
 
 	if (pe->flags & PNV_IODA_PE_DEV) {
@@ -1973,6 +1986,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 				8);
 		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
 	}
+	tbl->it_ops = &pnv_ioda2_iommu_ops;
 	iommu_init_table(tbl, phb->hose->node);
 
 	if (pe->flags & PNV_IODA_PE_DEV) {
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
index 61ae20a..5a387a9 100644
--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -83,10 +83,17 @@ static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb)
 static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { }
 #endif /* CONFIG_PCI_MSI */
 
+static struct iommu_table_ops pnv_p5ioc2_iommu_ops = {
+	.set = pnv_tce_build,
+	.clear = pnv_tce_free,
+	.get = pnv_tce_get,
+};
+
 static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
 					 struct pci_dev *pdev)
 {
 	if (phb->p5ioc2.iommu_table.it_map == NULL) {
+		phb->p5ioc2.iommu_table.it_ops = &pnv_p5ioc2_iommu_ops;
 		iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node);
 		iommu_register_group(&phb->p5ioc2.iommu_table,
 				pci_domain_nr(phb->hose->bus), phb->opal_id);
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 1477422..2793b9d 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -572,9 +572,9 @@ struct pci_ops pnv_pci_ops = {
 	.write = pnv_pci_write_config,
 };
 
-static int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
-			 unsigned long uaddr, enum dma_data_direction direction,
-			 struct dma_attrs *attrs, bool rm)
+int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
+		unsigned long uaddr, enum dma_data_direction direction,
+		struct dma_attrs *attrs)
 {
 	u64 proto_tce = iommu_direction_to_tce_perm(direction);
 	__be64 *tcep, *tces;
@@ -592,22 +592,12 @@ static int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 	 * of flags if that becomes the case
 	 */
 	if (tbl->it_type & TCE_PCI_SWINV_CREATE)
-		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm);
+		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, false);
 
 	return 0;
 }
 
-static int pnv_tce_build_vm(struct iommu_table *tbl, long index, long npages,
-			    unsigned long uaddr,
-			    enum dma_data_direction direction,
-			    struct dma_attrs *attrs)
-{
-	return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs,
-			false);
-}
-
-static void pnv_tce_free(struct iommu_table *tbl, long index, long npages,
-		bool rm)
+void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
 {
 	__be64 *tcep, *tces;
 
@@ -617,32 +607,14 @@ static void pnv_tce_free(struct iommu_table *tbl, long index, long npages,
 		*(tcep++) = cpu_to_be64(0);
 
 	if (tbl->it_type & TCE_PCI_SWINV_FREE)
-		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm);
-}
-
-static void pnv_tce_free_vm(struct iommu_table *tbl, long index, long npages)
-{
-	pnv_tce_free(tbl, index, npages, false);
+		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, false);
 }
 
-static unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
+unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
 {
 	return ((u64 *)tbl->it_base)[index - tbl->it_offset];
 }
 
-static int pnv_tce_build_rm(struct iommu_table *tbl, long index, long npages,
-			    unsigned long uaddr,
-			    enum dma_data_direction direction,
-			    struct dma_attrs *attrs)
-{
-	return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs, true);
-}
-
-static void pnv_tce_free_rm(struct iommu_table *tbl, long index, long npages)
-{
-	pnv_tce_free(tbl, index, npages, true);
-}
-
 void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 			       void *tce_mem, u64 tce_size,
 			       u64 dma_offset, unsigned page_shift)
@@ -744,11 +716,6 @@ void __init pnv_pci_init(void)
 	pci_devs_phb_init();
 
 	/* Configure IOMMU DMA hooks */
-	ppc_md.tce_build = pnv_tce_build_vm;
-	ppc_md.tce_free = pnv_tce_free_vm;
-	ppc_md.tce_build_rm = pnv_tce_build_rm;
-	ppc_md.tce_free_rm = pnv_tce_free_rm;
-	ppc_md.tce_get = pnv_tce_get;
 	set_pci_dma_ops(&dma_iommu_ops);
 }
 
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 714ee3c..45a7560 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -197,6 +197,11 @@ struct pnv_phb {
 };
 
 extern struct pci_ops pnv_pci_ops;
+extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
+		unsigned long uaddr, enum dma_data_direction direction,
+		struct dma_attrs *attrs);
+extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
+extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
 
 void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
 				unsigned char *log_buff);
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index fe5117b..33f3a85 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -206,7 +206,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 	int ret = 0;
 	unsigned long flags;
 
-	if (npages == 1) {
+	if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) {
 		return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
 		                           direction, attrs);
 	}
@@ -298,6 +298,9 @@ static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long n
 {
 	u64 rc;
 
+	if (!firmware_has_feature(FW_FEATURE_MULTITCE))
+		return tce_free_pSeriesLP(tbl, tcenum, npages);
+
 	rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages);
 
 	if (rc && printk_ratelimit()) {
@@ -473,7 +476,6 @@ static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
 	return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
 }
 
-
 #ifdef CONFIG_PCI
 static void iommu_table_setparms(struct pci_controller *phb,
 				 struct device_node *dn,
@@ -559,6 +561,12 @@ static void iommu_table_setparms_lpar(struct pci_controller *phb,
 	tbl->it_size = size >> tbl->it_page_shift;
 }
 
+struct iommu_table_ops iommu_table_pseries_ops = {
+	.set = tce_build_pSeries,
+	.clear = tce_free_pSeries,
+	.get = tce_get_pseries
+};
+
 static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 {
 	struct device_node *dn;
@@ -627,6 +635,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 			   pci->phb->node);
 
 	iommu_table_setparms(pci->phb, dn, tbl);
+	tbl->it_ops = &iommu_table_pseries_ops;
 	pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
 	iommu_register_group(tbl, pci_domain_nr(bus), 0);
 
@@ -638,6 +647,11 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 	pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
 }
 
+struct iommu_table_ops iommu_table_lpar_multi_ops = {
+	.set = tce_buildmulti_pSeriesLP,
+	.clear = tce_freemulti_pSeriesLP,
+	.get = tce_get_pSeriesLP
+};
 
 static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 {
@@ -672,6 +686,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
 				   ppci->phb->node);
 		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
+		tbl->it_ops = &iommu_table_lpar_multi_ops;
 		ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
 		iommu_register_group(tbl, pci_domain_nr(bus), 0);
 		pr_debug("  created table: %p\n", ppci->iommu_table);
@@ -699,6 +714,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
 				   phb->node);
 		iommu_table_setparms(phb, dn, tbl);
+		tbl->it_ops = &iommu_table_pseries_ops;
 		PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
 		iommu_register_group(tbl, pci_domain_nr(phb->bus), 0);
 		set_iommu_table_base(&dev->dev, tbl);
@@ -1121,6 +1137,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
 				   pci->phb->node);
 		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
+		tbl->it_ops = &iommu_table_lpar_multi_ops;
 		pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
 		iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0);
 		pr_debug("  created table: %p\n", pci->iommu_table);
@@ -1315,22 +1332,11 @@ void iommu_init_early_pSeries(void)
 		return;
 
 	if (firmware_has_feature(FW_FEATURE_LPAR)) {
-		if (firmware_has_feature(FW_FEATURE_MULTITCE)) {
-			ppc_md.tce_build = tce_buildmulti_pSeriesLP;
-			ppc_md.tce_free	 = tce_freemulti_pSeriesLP;
-		} else {
-			ppc_md.tce_build = tce_build_pSeriesLP;
-			ppc_md.tce_free	 = tce_free_pSeriesLP;
-		}
-		ppc_md.tce_get   = tce_get_pSeriesLP;
 		pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
 		pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
 		ppc_md.dma_set_mask = dma_set_mask_pSeriesLP;
 		ppc_md.dma_get_required_mask = dma_get_required_mask_pSeriesLP;
 	} else {
-		ppc_md.tce_build = tce_build_pSeries;
-		ppc_md.tce_free  = tce_free_pSeries;
-		ppc_md.tce_get   = tce_get_pseries;
 		pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries;
 		pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries;
 	}
@@ -1348,8 +1354,6 @@ static int __init disable_multitce(char *str)
 	    firmware_has_feature(FW_FEATURE_LPAR) &&
 	    firmware_has_feature(FW_FEATURE_MULTITCE)) {
 		printk(KERN_INFO "Disabling MULTITCE firmware feature\n");
-		ppc_md.tce_build = tce_build_pSeriesLP;
-		ppc_md.tce_free	 = tce_free_pSeriesLP;
 		powerpc_firmware_features &= ~FW_FEATURE_MULTITCE;
 	}
 	return 1;
diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index d00a566..90bcdfe 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -286,6 +286,12 @@ static int __init dart_init(struct device_node *dart_node)
 	return 0;
 }
 
+static struct iommu_table_ops iommu_dart_ops = {
+	.set = dart_build,
+	.clear = dart_free,
+	.flush = dart_flush,
+};
+
 static void iommu_table_dart_setup(void)
 {
 	iommu_table_dart.it_busno = 0;
@@ -298,6 +304,7 @@ static void iommu_table_dart_setup(void)
 	iommu_table_dart.it_base = (unsigned long)dart_vbase;
 	iommu_table_dart.it_index = 0;
 	iommu_table_dart.it_blocksize = 1;
+	iommu_table_dart.it_ops = &iommu_dart_ops;
 	iommu_init_table(&iommu_table_dart, -1);
 
 	/* Reserve the last page of the DART to avoid possible prefetch
@@ -386,11 +393,6 @@ void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops)
 	if (dart_init(dn) != 0)
 		goto bail;
 
-	/* Setup low level TCE operations for the core IOMMU code */
-	ppc_md.tce_build = dart_build;
-	ppc_md.tce_free  = dart_free;
-	ppc_md.tce_flush = dart_flush;
-
 	/* Setup bypass if supported */
 	if (dart_is_u4)
 		ppc_md.dma_set_mask = dart_dma_set_mask;
-- 
cgit v0.10.2


From decbda25728ddfbb28b77749d2545028e892ca99 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:07 +1000
Subject: powerpc/powernv/ioda/ioda2: Rework TCE invalidation in
 tce_build()/tce_free()

The pnv_pci_ioda_tce_invalidate() helper invalidates TCE cache. It is
supposed to be called on IODA1/2 and not called on p5ioc2. It receives
start and end host addresses of TCE table.

IODA2 actually needs PCI addresses to invalidate the cache. Those
can be calculated from host addresses but since we are going
to implement multi-level TCE tables, calculating PCI address from
a host address might get either tricky or ugly as TCE table remains flat
on PCI bus but not in RAM.

This moves pnv_pci_ioda_tce_invalidate() from generic pnv_tce_build/
pnt_tce_free and defines IODA1/2-specific callbacks which call generic
ones and do PHB-model-specific TCE cache invalidation. P5IOC2 keeps
using generic callbacks as before.

This changes pnv_pci_ioda2_tce_invalidate() to receives TCE index and
number of pages which are PCI addresses shifted by IOMMU page shift.

No change in behaviour is expected.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index b0a14c0..f710729 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1679,18 +1679,19 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
 	}
 }
 
-static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
-					 struct iommu_table *tbl,
-					 __be64 *startp, __be64 *endp, bool rm)
+static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
+		unsigned long index, unsigned long npages, bool rm)
 {
+	struct pnv_ioda_pe *pe = tbl->data;
 	__be64 __iomem *invalidate = rm ?
 		(__be64 __iomem *)pe->tce_inval_reg_phys :
 		(__be64 __iomem *)tbl->it_index;
 	unsigned long start, end, inc;
 	const unsigned shift = tbl->it_page_shift;
 
-	start = __pa(startp);
-	end = __pa(endp);
+	start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
+	end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset +
+			npages - 1);
 
 	/* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */
 	if (tbl->it_busno) {
@@ -1726,16 +1727,39 @@ static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
 	 */
 }
 
+static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
+		long npages, unsigned long uaddr,
+		enum dma_data_direction direction,
+		struct dma_attrs *attrs)
+{
+	int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
+			attrs);
+
+	if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE))
+		pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
+
+	return ret;
+}
+
+static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
+		long npages)
+{
+	pnv_tce_free(tbl, index, npages);
+
+	if (tbl->it_type & TCE_PCI_SWINV_FREE)
+		pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
+}
+
 static struct iommu_table_ops pnv_ioda1_iommu_ops = {
-	.set = pnv_tce_build,
-	.clear = pnv_tce_free,
+	.set = pnv_ioda1_tce_build,
+	.clear = pnv_ioda1_tce_free,
 	.get = pnv_tce_get,
 };
 
-static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
-					 struct iommu_table *tbl,
-					 __be64 *startp, __be64 *endp, bool rm)
+static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
+		unsigned long index, unsigned long npages, bool rm)
 {
+	struct pnv_ioda_pe *pe = tbl->data;
 	unsigned long start, end, inc;
 	__be64 __iomem *invalidate = rm ?
 		(__be64 __iomem *)pe->tce_inval_reg_phys :
@@ -1748,10 +1772,8 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
 	end = start;
 
 	/* Figure out the start, end and step */
-	inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64));
-	start |= (inc << shift);
-	inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64));
-	end |= (inc << shift);
+	start |= (index << shift);
+	end |= ((index + npages - 1) << shift);
 	inc = (0x1ull << shift);
 	mb();
 
@@ -1764,21 +1786,32 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
 	}
 }
 
-void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
-				 __be64 *startp, __be64 *endp, bool rm)
+static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
+		long npages, unsigned long uaddr,
+		enum dma_data_direction direction,
+		struct dma_attrs *attrs)
 {
-	struct pnv_ioda_pe *pe = tbl->data;
-	struct pnv_phb *phb = pe->phb;
+	int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
+			attrs);
 
-	if (phb->type == PNV_PHB_IODA1)
-		pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm);
-	else
-		pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm);
+	if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE))
+		pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
+
+	return ret;
+}
+
+static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
+		long npages)
+{
+	pnv_tce_free(tbl, index, npages);
+
+	if (tbl->it_type & TCE_PCI_SWINV_FREE)
+		pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
 }
 
 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
-	.set = pnv_tce_build,
-	.clear = pnv_tce_free,
+	.set = pnv_ioda2_tce_build,
+	.clear = pnv_ioda2_tce_free,
 	.get = pnv_tce_get,
 };
 
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 2793b9d..f72fc6e 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -577,37 +577,28 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 		struct dma_attrs *attrs)
 {
 	u64 proto_tce = iommu_direction_to_tce_perm(direction);
-	__be64 *tcep, *tces;
+	__be64 *tcep;
 	u64 rpn;
 
-	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
+	tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
 	rpn = __pa(uaddr) >> tbl->it_page_shift;
 
 	while (npages--)
 		*(tcep++) = cpu_to_be64(proto_tce |
 				(rpn++ << tbl->it_page_shift));
 
-	/* Some implementations won't cache invalid TCEs and thus may not
-	 * need that flush. We'll probably turn it_type into a bit mask
-	 * of flags if that becomes the case
-	 */
-	if (tbl->it_type & TCE_PCI_SWINV_CREATE)
-		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, false);
 
 	return 0;
 }
 
 void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
 {
-	__be64 *tcep, *tces;
+	__be64 *tcep;
 
-	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
+	tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
 
 	while (npages--)
 		*(tcep++) = cpu_to_be64(0);
-
-	if (tbl->it_type & TCE_PCI_SWINV_FREE)
-		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, false);
 }
 
 unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
-- 
cgit v0.10.2


From b348aa65297659c310943221ac1d3f4b4491ea44 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:08 +1000
Subject: powerpc/spapr: vfio: Replace iommu_table with iommu_table_group

Modern IBM POWERPC systems support multiple (currently two) TCE tables
per IOMMU group (a.k.a. PE). This adds a iommu_table_group container
for TCE tables. Right now just one table is supported.

This defines iommu_table_group struct which stores pointers to
iommu_group and iommu_table(s). This replaces iommu_table with
iommu_table_group where iommu_table was used to identify a group:
- iommu_register_group();
- iommudata of generic iommu_group;

This removes @data from iommu_table as it_table_group provides
same access to pnv_ioda_pe.

For IODA, instead of embedding iommu_table, the new iommu_table_group
keeps pointers to those. The iommu_table structs are allocated
dynamically.

For P5IOC2, both iommu_table_group and iommu_table are embedded into
PE struct. As there is no EEH and SRIOV support for P5IOC2,
iommu_free_table() should not be called on iommu_table struct pointers
so we can keep it embedded in pnv_phb::p5ioc2.

For pSeries, this replaces multiple calls of kzalloc_node() with a new
iommu_pseries_alloc_group() helper and stores the table group struct
pointer into the pci_dn struct. For release, a iommu_table_free_group()
helper is added.

This moves iommu_table struct allocation from SR-IOV code to
the generic DMA initialization code in pnv_pci_ioda_setup_dma_pe and
pnv_pci_ioda2_setup_dma_pe as this is where DMA is actually initialized.
This change is here because those lines had to be changed anyway.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index e2a45c3..5a7267f 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -91,14 +91,9 @@ struct iommu_table {
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
 	unsigned long  it_page_shift;/* table iommu page size */
-#ifdef CONFIG_IOMMU_API
-	struct iommu_group *it_group;
-#endif
+	struct iommu_table_group *it_table_group;
 	struct iommu_table_ops *it_ops;
 	void (*set_bypass)(struct iommu_table *tbl, bool enable);
-#ifdef CONFIG_PPC_POWERNV
-	void           *data;
-#endif
 };
 
 /* Pure 2^n version of get_order */
@@ -129,14 +124,22 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
 					    int nid);
+#define IOMMU_TABLE_GROUP_MAX_TABLES	1
+
+struct iommu_table_group {
+	struct iommu_group *group;
+	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+};
+
 #ifdef CONFIG_IOMMU_API
-extern void iommu_register_group(struct iommu_table *tbl,
+
+extern void iommu_register_group(struct iommu_table_group *table_group,
 				 int pci_domain_number, unsigned long pe_num);
 extern int iommu_add_device(struct device *dev);
 extern void iommu_del_device(struct device *dev);
 extern int __init tce_iommu_bus_notifier_init(void);
 #else
-static inline void iommu_register_group(struct iommu_table *tbl,
+static inline void iommu_register_group(struct iommu_table_group *table_group,
 					int pci_domain_number,
 					unsigned long pe_num)
 {
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index b76cd56..712add5 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -199,7 +199,7 @@ struct pci_dn {
 
 	struct  pci_dn *parent;
 	struct  pci_controller *phb;	/* for pci devices */
-	struct	iommu_table *iommu_table;	/* for phb's or bridges */
+	struct	iommu_table_group *table_group;	/* for phb's or bridges */
 	struct	device_node *node;	/* back-pointer to the device_node */
 
 	int	pci_ext_config_space;	/* for pci devices */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index c0e67e9..719f048 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -889,11 +889,12 @@ EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
  */
 static void group_release(void *iommu_data)
 {
-	struct iommu_table *tbl = iommu_data;
-	tbl->it_group = NULL;
+	struct iommu_table_group *table_group = iommu_data;
+
+	table_group->group = NULL;
 }
 
-void iommu_register_group(struct iommu_table *tbl,
+void iommu_register_group(struct iommu_table_group *table_group,
 		int pci_domain_number, unsigned long pe_num)
 {
 	struct iommu_group *grp;
@@ -905,8 +906,8 @@ void iommu_register_group(struct iommu_table *tbl,
 				PTR_ERR(grp));
 		return;
 	}
-	tbl->it_group = grp;
-	iommu_group_set_iommudata(grp, tbl, group_release);
+	table_group->group = grp;
+	iommu_group_set_iommudata(grp, table_group, group_release);
 	name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
 			pci_domain_number, pe_num);
 	if (!name)
@@ -1094,7 +1095,7 @@ int iommu_add_device(struct device *dev)
 	}
 
 	tbl = get_iommu_table_base(dev);
-	if (!tbl || !tbl->it_group) {
+	if (!tbl || !tbl->it_table_group || !tbl->it_table_group->group) {
 		pr_debug("%s: Skipping device %s with no tbl\n",
 			 __func__, dev_name(dev));
 		return 0;
@@ -1102,7 +1103,7 @@ int iommu_add_device(struct device *dev)
 
 	pr_debug("%s: Adding %s to iommu group %d\n",
 		 __func__, dev_name(dev),
-		 iommu_group_id(tbl->it_group));
+		 iommu_group_id(tbl->it_table_group->group));
 
 	if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) {
 		pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n",
@@ -1111,7 +1112,7 @@ int iommu_add_device(struct device *dev)
 		return -EINVAL;
 	}
 
-	return iommu_group_add_device(tbl->it_group, dev);
+	return iommu_group_add_device(tbl->it_table_group->group, dev);
 }
 EXPORT_SYMBOL_GPL(iommu_add_device);
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index f710729..279dadf 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1087,10 +1087,6 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
 		return;
 	}
 
-	pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
-			GFP_KERNEL, hose->node);
-	pe->tce32_table->data = pe;
-
 	/* Associate it with all child devices */
 	pnv_ioda_setup_same_PE(bus, pe);
 
@@ -1292,11 +1288,12 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 	struct iommu_table    *tbl;
 	unsigned long         addr;
 	int64_t               rc;
+	struct iommu_table_group *table_group;
 
 	bus = dev->bus;
 	hose = pci_bus_to_host(bus);
 	phb = hose->private_data;
-	tbl = pe->tce32_table;
+	tbl = pe->table_group.tables[0];
 	addr = tbl->it_base;
 
 	opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
@@ -1311,13 +1308,14 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 	if (rc)
 		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
 
-	if (tbl->it_group) {
-		iommu_group_put(tbl->it_group);
-		BUG_ON(tbl->it_group);
+	table_group = tbl->it_table_group;
+	if (table_group->group) {
+		iommu_group_put(table_group->group);
+		BUG_ON(table_group->group);
 	}
 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
 	free_pages(addr, get_order(TCE32_TABLE_SIZE));
-	pe->tce32_table = NULL;
+	pe->table_group.tables[0] = NULL;
 }
 
 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
@@ -1465,10 +1463,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 			continue;
 		}
 
-		pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
-				GFP_KERNEL, hose->node);
-		pe->tce32_table->data = pe;
-
 		/* Put PE to the list */
 		mutex_lock(&phb->ioda.pe_list_mutex);
 		list_add_tail(&pe->list, &phb->ioda.pe_list);
@@ -1603,7 +1597,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
 
 	pe = &phb->ioda.pe_array[pdn->pe_number];
 	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
-	set_iommu_table_base(&pdev->dev, pe->tce32_table);
+	set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
 	/*
 	 * Note: iommu_add_device() will fail here as
 	 * for physical PE: the device is already added by now;
@@ -1637,7 +1631,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
 	} else {
 		dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
 		set_dma_ops(&pdev->dev, &dma_iommu_ops);
-		set_iommu_table_base(&pdev->dev, pe->tce32_table);
+		set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
 	}
 	*pdev->dev.dma_mask = dma_mask;
 	return 0;
@@ -1671,7 +1665,7 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
 	struct pci_dev *dev;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
-		set_iommu_table_base(&dev->dev, pe->tce32_table);
+		set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
 		iommu_add_device(&dev->dev);
 
 		if (dev->subordinate)
@@ -1682,7 +1676,8 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
 static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
 		unsigned long index, unsigned long npages, bool rm)
 {
-	struct pnv_ioda_pe *pe = tbl->data;
+	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
+			struct pnv_ioda_pe, table_group);
 	__be64 __iomem *invalidate = rm ?
 		(__be64 __iomem *)pe->tce_inval_reg_phys :
 		(__be64 __iomem *)tbl->it_index;
@@ -1759,7 +1754,8 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
 static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
 		unsigned long index, unsigned long npages, bool rm)
 {
-	struct pnv_ioda_pe *pe = tbl->data;
+	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
+			struct pnv_ioda_pe, table_group);
 	unsigned long start, end, inc;
 	__be64 __iomem *invalidate = rm ?
 		(__be64 __iomem *)pe->tce_inval_reg_phys :
@@ -1835,8 +1831,12 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	if (WARN_ON(pe->tce32_seg >= 0))
 		return;
 
-	tbl = pe->tce32_table;
-	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
+	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
+			phb->hose->node);
+	tbl->it_table_group = &pe->table_group;
+	pe->table_group.tables[0] = tbl;
+	iommu_register_group(&pe->table_group, phb->hose->global_number,
+			pe->pe_number);
 
 	/* Grab a 32-bit TCE table */
 	pe->tce32_seg = base;
@@ -1915,7 +1915,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 
 static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
 {
-	struct pnv_ioda_pe *pe = tbl->data;
+	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
+			struct pnv_ioda_pe, table_group);
 	uint16_t window_id = (pe->pe_number << 1 ) + 1;
 	int64_t rc;
 
@@ -1949,10 +1950,10 @@ static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb,
 	pe->tce_bypass_base = 1ull << 59;
 
 	/* Install set_bypass callback for VFIO */
-	pe->tce32_table->set_bypass = pnv_pci_ioda2_set_bypass;
+	pe->table_group.tables[0]->set_bypass = pnv_pci_ioda2_set_bypass;
 
 	/* Enable bypass by default */
-	pnv_pci_ioda2_set_bypass(pe->tce32_table, true);
+	pnv_pci_ioda2_set_bypass(pe->table_group.tables[0], true);
 }
 
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
@@ -1969,8 +1970,12 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	if (WARN_ON(pe->tce32_seg >= 0))
 		return;
 
-	tbl = pe->tce32_table;
-	iommu_register_group(tbl, phb->hose->global_number, pe->pe_number);
+	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
+			phb->hose->node);
+	tbl->it_table_group = &pe->table_group;
+	pe->table_group.tables[0] = tbl;
+	iommu_register_group(&pe->table_group, phb->hose->global_number,
+			pe->pe_number);
 
 	/* The PE will reserve all possible 32-bits space */
 	pe->tce32_seg = 0;
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
index 5a387a9..6e00104 100644
--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -92,14 +92,16 @@ static struct iommu_table_ops pnv_p5ioc2_iommu_ops = {
 static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
 					 struct pci_dev *pdev)
 {
-	if (phb->p5ioc2.iommu_table.it_map == NULL) {
-		phb->p5ioc2.iommu_table.it_ops = &pnv_p5ioc2_iommu_ops;
-		iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node);
-		iommu_register_group(&phb->p5ioc2.iommu_table,
+	struct iommu_table *tbl = phb->p5ioc2.table_group.tables[0];
+
+	if (!tbl->it_map) {
+		tbl->it_ops = &pnv_p5ioc2_iommu_ops;
+		iommu_init_table(tbl, phb->hose->node);
+		iommu_register_group(&phb->p5ioc2.table_group,
 				pci_domain_nr(phb->hose->bus), phb->opal_id);
 	}
 
-	set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table);
+	set_iommu_table_base(&pdev->dev, tbl);
 	iommu_add_device(&pdev->dev);
 }
 
@@ -188,6 +190,12 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
 	pnv_pci_setup_iommu_table(&phb->p5ioc2.iommu_table,
 				  tce_mem, tce_size, 0,
 				  IOMMU_PAGE_SHIFT_4K);
+	/*
+	 * We do not allocate iommu_table as we do not support
+	 * hotplug or SRIOV on P5IOC2 and therefore iommu_free_table()
+	 * should not be called for phb->p5ioc2.table_group.tables[0] ever.
+	 */
+	phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table;
 }
 
 void __init pnv_pci_init_p5ioc2_hub(struct device_node *np)
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 45a7560..d8ba34d 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -57,7 +57,7 @@ struct pnv_ioda_pe {
 	/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
 	int			tce32_seg;
 	int			tce32_segcount;
-	struct iommu_table	*tce32_table;
+	struct iommu_table_group table_group;
 	phys_addr_t		tce_inval_reg_phys;
 
 	/* 64-bit TCE bypass region */
@@ -120,6 +120,7 @@ struct pnv_phb {
 	union {
 		struct {
 			struct iommu_table iommu_table;
+			struct iommu_table_group table_group;
 		} p5ioc2;
 
 		struct {
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 33f3a85..307d704 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -52,16 +52,51 @@
 
 #include "pseries.h"
 
-static void iommu_pseries_free_table(struct iommu_table *tbl,
+static struct iommu_table_group *iommu_pseries_alloc_group(int node)
+{
+	struct iommu_table_group *table_group = NULL;
+	struct iommu_table *tbl = NULL;
+
+	table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
+			   node);
+	if (!table_group)
+		goto fail_exit;
+
+	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
+	if (!tbl)
+		goto fail_exit;
+
+	tbl->it_table_group = table_group;
+	table_group->tables[0] = tbl;
+
+	return table_group;
+
+fail_exit:
+	kfree(table_group);
+	kfree(tbl);
+
+	return NULL;
+}
+
+static void iommu_pseries_free_group(struct iommu_table_group *table_group,
 		const char *node_name)
 {
+	struct iommu_table *tbl;
+
+	if (!table_group)
+		return;
+
 #ifdef CONFIG_IOMMU_API
-	if (tbl->it_group) {
-		iommu_group_put(tbl->it_group);
-		BUG_ON(tbl->it_group);
+	if (table_group->group) {
+		iommu_group_put(table_group->group);
+		BUG_ON(table_group->group);
 	}
 #endif
+
+	tbl = table_group->tables[0];
 	iommu_free_table(tbl, node_name);
+
+	kfree(table_group);
 }
 
 static void tce_invalidate_pSeries_sw(struct iommu_table *tbl,
@@ -631,13 +666,13 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 	pci->phb->dma_window_size = 0x8000000ul;
 	pci->phb->dma_window_base_cur = 0x8000000ul;
 
-	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-			   pci->phb->node);
+	pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
+	tbl = pci->table_group->tables[0];
 
 	iommu_table_setparms(pci->phb, dn, tbl);
 	tbl->it_ops = &iommu_table_pseries_ops;
-	pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
-	iommu_register_group(tbl, pci_domain_nr(bus), 0);
+	iommu_init_table(tbl, pci->phb->node);
+	iommu_register_group(pci->table_group, pci_domain_nr(bus), 0);
 
 	/* Divide the rest (1.75GB) among the children */
 	pci->phb->dma_window_size = 0x80000000ul;
@@ -680,16 +715,17 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 	ppci = PCI_DN(pdn);
 
 	pr_debug("  parent is %s, iommu_table: 0x%p\n",
-		 pdn->full_name, ppci->iommu_table);
+		 pdn->full_name, ppci->table_group);
 
-	if (!ppci->iommu_table) {
-		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-				   ppci->phb->node);
+	if (!ppci->table_group) {
+		ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
+		tbl = ppci->table_group->tables[0];
 		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
 		tbl->it_ops = &iommu_table_lpar_multi_ops;
-		ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
-		iommu_register_group(tbl, pci_domain_nr(bus), 0);
-		pr_debug("  created table: %p\n", ppci->iommu_table);
+		iommu_init_table(tbl, ppci->phb->node);
+		iommu_register_group(ppci->table_group,
+				pci_domain_nr(bus), 0);
+		pr_debug("  created table: %p\n", ppci->table_group);
 	}
 }
 
@@ -711,12 +747,13 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 		struct pci_controller *phb = PCI_DN(dn)->phb;
 
 		pr_debug(" --> first child, no bridge. Allocating iommu table.\n");
-		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-				   phb->node);
+		PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node);
+		tbl = PCI_DN(dn)->table_group->tables[0];
 		iommu_table_setparms(phb, dn, tbl);
 		tbl->it_ops = &iommu_table_pseries_ops;
-		PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
-		iommu_register_group(tbl, pci_domain_nr(phb->bus), 0);
+		iommu_init_table(tbl, phb->node);
+		iommu_register_group(PCI_DN(dn)->table_group,
+				pci_domain_nr(phb->bus), 0);
 		set_iommu_table_base(&dev->dev, tbl);
 		iommu_add_device(&dev->dev);
 		return;
@@ -726,11 +763,12 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 	 * an already allocated iommu table is found and use that.
 	 */
 
-	while (dn && PCI_DN(dn) && PCI_DN(dn)->iommu_table == NULL)
+	while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL)
 		dn = dn->parent;
 
 	if (dn && PCI_DN(dn)) {
-		set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
+		set_iommu_table_base(&dev->dev,
+				PCI_DN(dn)->table_group->tables[0]);
 		iommu_add_device(&dev->dev);
 	} else
 		printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
@@ -1117,7 +1155,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 	dn = pci_device_to_OF_node(dev);
 	pr_debug("  node is %s\n", dn->full_name);
 
-	for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
+	for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
 	     pdn = pdn->parent) {
 		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
 		if (dma_window)
@@ -1133,19 +1171,20 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 	pr_debug("  parent is %s\n", pdn->full_name);
 
 	pci = PCI_DN(pdn);
-	if (!pci->iommu_table) {
-		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-				   pci->phb->node);
+	if (!pci->table_group) {
+		pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
+		tbl = pci->table_group->tables[0];
 		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
 		tbl->it_ops = &iommu_table_lpar_multi_ops;
-		pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
-		iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0);
-		pr_debug("  created table: %p\n", pci->iommu_table);
+		iommu_init_table(tbl, pci->phb->node);
+		iommu_register_group(pci->table_group,
+				pci_domain_nr(pci->phb->bus), 0);
+		pr_debug("  created table: %p\n", pci->table_group);
 	} else {
-		pr_debug("  found DMA window, table: %p\n", pci->iommu_table);
+		pr_debug("  found DMA window, table: %p\n", pci->table_group);
 	}
 
-	set_iommu_table_base(&dev->dev, pci->iommu_table);
+	set_iommu_table_base(&dev->dev, pci->table_group->tables[0]);
 	iommu_add_device(&dev->dev);
 }
 
@@ -1176,7 +1215,7 @@ static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
 		 * search upwards in the tree until we either hit a dma-window
 		 * property, OR find a parent with a table already allocated.
 		 */
-		for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
+		for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
 				pdn = pdn->parent) {
 			dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
 			if (dma_window)
@@ -1220,7 +1259,7 @@ static u64 dma_get_required_mask_pSeriesLP(struct device *dev)
 		dn = pci_device_to_OF_node(pdev);
 
 		/* search upwards for ibm,dma-window */
-		for (; dn && PCI_DN(dn) && !PCI_DN(dn)->iommu_table;
+		for (; dn && PCI_DN(dn) && !PCI_DN(dn)->table_group;
 				dn = dn->parent)
 			if (of_get_property(dn, "ibm,dma-window", NULL))
 				break;
@@ -1300,8 +1339,8 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
 		 * the device node.
 		 */
 		remove_ddw(np, false);
-		if (pci && pci->iommu_table)
-			iommu_pseries_free_table(pci->iommu_table,
+		if (pci && pci->table_group)
+			iommu_pseries_free_group(pci->table_group,
 					np->full_name);
 
 		spin_lock(&direct_window_list_lock);
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index e65bc73..c4bc345 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -190,10 +190,11 @@ static void tce_iommu_release(void *iommu_data)
 {
 	struct tce_container *container = iommu_data;
 
-	WARN_ON(container->tbl && !container->tbl->it_group);
+	WARN_ON(container->tbl && !container->tbl->it_table_group->group);
 
-	if (container->tbl && container->tbl->it_group)
-		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
+	if (container->tbl && container->tbl->it_table_group->group)
+		tce_iommu_detach_group(iommu_data,
+				container->tbl->it_table_group->group);
 
 	tce_iommu_disable(container);
 	mutex_destroy(&container->lock);
@@ -345,7 +346,7 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (!tbl)
 			return -ENXIO;
 
-		BUG_ON(!tbl->it_group);
+		BUG_ON(!tbl->it_table_group->group);
 
 		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 
@@ -433,11 +434,12 @@ static long tce_iommu_ioctl(void *iommu_data,
 		mutex_unlock(&container->lock);
 		return 0;
 	case VFIO_EEH_PE_OP:
-		if (!container->tbl || !container->tbl->it_group)
+		if (!container->tbl || !container->tbl->it_table_group->group)
 			return -ENODEV;
 
-		return vfio_spapr_iommu_eeh_ioctl(container->tbl->it_group,
-						  cmd, arg);
+		return vfio_spapr_iommu_eeh_ioctl(
+				container->tbl->it_table_group->group,
+				cmd, arg);
 	}
 
 	return -ENOTTY;
@@ -457,7 +459,8 @@ static int tce_iommu_attach_group(void *iommu_data,
 			iommu_group_id(iommu_group), iommu_group); */
 	if (container->tbl) {
 		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
-				iommu_group_id(container->tbl->it_group),
+				iommu_group_id(container->tbl->
+						it_table_group->group),
 				iommu_group_id(iommu_group));
 		ret = -EBUSY;
 		goto unlock_exit;
@@ -491,13 +494,13 @@ static void tce_iommu_detach_group(void *iommu_data,
 	if (tbl != container->tbl) {
 		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
 				iommu_group_id(iommu_group),
-				iommu_group_id(tbl->it_group));
+				iommu_group_id(tbl->it_table_group->group));
 		goto unlock_exit;
 	}
 
 	if (container->enabled) {
 		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
-				iommu_group_id(tbl->it_group));
+				iommu_group_id(tbl->it_table_group->group));
 		tce_iommu_disable(container);
 	}
 
-- 
cgit v0.10.2


From 0eaf4defc7c44ed5dd33a03cab12a5f88c9b4b86 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:09 +1000
Subject: powerpc/spapr: vfio: Switch from iommu_table to new iommu_table_group

So far one TCE table could only be used by one IOMMU group. However
IODA2 hardware allows programming the same TCE table address to
multiple PE allowing sharing tables.

This replaces a single pointer to a group in a iommu_table struct
with a linked list of groups which provides the way of invalidating
TCE cache for every PE when an actual TCE table is updated. This adds
pnv_pci_link_table_and_group() and pnv_pci_unlink_table_and_group()
helpers to manage the list. However without VFIO, it is still going
to be a single IOMMU group per iommu_table.

This changes iommu_add_device() to add a device to a first group
from the group list of a table as it is only called from the platform
init code or PCI bus notifier and at these moments there is only
one group per table.

This does not change TCE invalidation code to loop through all
attached groups in order to simplify this patch and because
it is not really needed in most cases. IODA2 is fixed in a later
patch.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 5a7267f..44a20cc 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -91,7 +91,7 @@ struct iommu_table {
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
 	unsigned long  it_page_shift;/* table iommu page size */
-	struct iommu_table_group *it_table_group;
+	struct list_head it_group_list;/* List of iommu_table_group_link */
 	struct iommu_table_ops *it_ops;
 	void (*set_bypass)(struct iommu_table *tbl, bool enable);
 };
@@ -126,6 +126,12 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
 					    int nid);
 #define IOMMU_TABLE_GROUP_MAX_TABLES	1
 
+struct iommu_table_group_link {
+	struct list_head next;
+	struct rcu_head rcu;
+	struct iommu_table_group *table_group;
+};
+
 struct iommu_table_group {
 	struct iommu_group *group;
 	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 719f048..be258b2 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1078,6 +1078,7 @@ EXPORT_SYMBOL_GPL(iommu_release_ownership);
 int iommu_add_device(struct device *dev)
 {
 	struct iommu_table *tbl;
+	struct iommu_table_group_link *tgl;
 
 	/*
 	 * The sysfs entries should be populated before
@@ -1095,15 +1096,22 @@ int iommu_add_device(struct device *dev)
 	}
 
 	tbl = get_iommu_table_base(dev);
-	if (!tbl || !tbl->it_table_group || !tbl->it_table_group->group) {
+	if (!tbl) {
 		pr_debug("%s: Skipping device %s with no tbl\n",
 			 __func__, dev_name(dev));
 		return 0;
 	}
 
+	tgl = list_first_entry_or_null(&tbl->it_group_list,
+			struct iommu_table_group_link, next);
+	if (!tgl) {
+		pr_debug("%s: Skipping device %s with no group\n",
+			 __func__, dev_name(dev));
+		return 0;
+	}
 	pr_debug("%s: Adding %s to iommu group %d\n",
 		 __func__, dev_name(dev),
-		 iommu_group_id(tbl->it_table_group->group));
+		 iommu_group_id(tgl->table_group->group));
 
 	if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) {
 		pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n",
@@ -1112,7 +1120,7 @@ int iommu_add_device(struct device *dev)
 		return -EINVAL;
 	}
 
-	return iommu_group_add_device(tbl->it_table_group->group, dev);
+	return iommu_group_add_device(tgl->table_group->group, dev);
 }
 EXPORT_SYMBOL_GPL(iommu_add_device);
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 279dadf..3b41306 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1288,7 +1288,6 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 	struct iommu_table    *tbl;
 	unsigned long         addr;
 	int64_t               rc;
-	struct iommu_table_group *table_group;
 
 	bus = dev->bus;
 	hose = pci_bus_to_host(bus);
@@ -1308,14 +1307,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 	if (rc)
 		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
 
-	table_group = tbl->it_table_group;
-	if (table_group->group) {
-		iommu_group_put(table_group->group);
-		BUG_ON(table_group->group);
+	pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
+	if (pe->table_group.group) {
+		iommu_group_put(pe->table_group.group);
+		BUG_ON(pe->table_group.group);
 	}
 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
 	free_pages(addr, get_order(TCE32_TABLE_SIZE));
-	pe->table_group.tables[0] = NULL;
 }
 
 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
@@ -1676,7 +1674,10 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
 static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
 		unsigned long index, unsigned long npages, bool rm)
 {
-	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
+	struct iommu_table_group_link *tgl = list_first_entry_or_null(
+			&tbl->it_group_list, struct iommu_table_group_link,
+			next);
+	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
 			struct pnv_ioda_pe, table_group);
 	__be64 __iomem *invalidate = rm ?
 		(__be64 __iomem *)pe->tce_inval_reg_phys :
@@ -1754,7 +1755,10 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
 static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
 		unsigned long index, unsigned long npages, bool rm)
 {
-	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
+	struct iommu_table_group_link *tgl = list_first_entry_or_null(
+			&tbl->it_group_list, struct iommu_table_group_link,
+			next);
+	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
 			struct pnv_ioda_pe, table_group);
 	unsigned long start, end, inc;
 	__be64 __iomem *invalidate = rm ?
@@ -1831,12 +1835,10 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	if (WARN_ON(pe->tce32_seg >= 0))
 		return;
 
-	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-			phb->hose->node);
-	tbl->it_table_group = &pe->table_group;
-	pe->table_group.tables[0] = tbl;
+	tbl = pnv_pci_table_alloc(phb->hose->node);
 	iommu_register_group(&pe->table_group, phb->hose->global_number,
 			pe->pe_number);
+	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
 
 	/* Grab a 32-bit TCE table */
 	pe->tce32_seg = base;
@@ -1911,11 +1913,18 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 		pe->tce32_seg = -1;
 	if (tce_mem)
 		__free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
+	if (tbl) {
+		pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
+		iommu_free_table(tbl, "pnv");
+	}
 }
 
 static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
 {
-	struct pnv_ioda_pe *pe = container_of(tbl->it_table_group,
+	struct iommu_table_group_link *tgl = list_first_entry_or_null(
+			&tbl->it_group_list, struct iommu_table_group_link,
+			next);
+	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
 			struct pnv_ioda_pe, table_group);
 	uint16_t window_id = (pe->pe_number << 1 ) + 1;
 	int64_t rc;
@@ -1970,12 +1979,10 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	if (WARN_ON(pe->tce32_seg >= 0))
 		return;
 
-	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-			phb->hose->node);
-	tbl->it_table_group = &pe->table_group;
-	pe->table_group.tables[0] = tbl;
+	tbl = pnv_pci_table_alloc(phb->hose->node);
 	iommu_register_group(&pe->table_group, phb->hose->global_number,
 			pe->pe_number);
+	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
 
 	/* The PE will reserve all possible 32-bits space */
 	pe->tce32_seg = 0;
@@ -2048,6 +2055,10 @@ fail:
 		pe->tce32_seg = -1;
 	if (tce_mem)
 		__free_pages(tce_mem, get_order(tce_table_size));
+	if (tbl) {
+		pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
+		iommu_free_table(tbl, "pnv");
+	}
 }
 
 static void pnv_ioda_setup_dma(struct pnv_phb *phb)
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
index 6e00104..f80e5a1 100644
--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -99,6 +99,9 @@ static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
 		iommu_init_table(tbl, phb->hose->node);
 		iommu_register_group(&phb->p5ioc2.table_group,
 				pci_domain_nr(phb->hose->bus), phb->opal_id);
+		INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+		pnv_pci_link_table_and_group(phb->hose->node, 0,
+				tbl, &phb->p5ioc2.table_group);
 	}
 
 	set_iommu_table_base(&pdev->dev, tbl);
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index f72fc6e..00f1abd 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -606,6 +606,81 @@ unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
 	return ((u64 *)tbl->it_base)[index - tbl->it_offset];
 }
 
+struct iommu_table *pnv_pci_table_alloc(int nid)
+{
+	struct iommu_table *tbl;
+
+	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid);
+	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+
+	return tbl;
+}
+
+long pnv_pci_link_table_and_group(int node, int num,
+		struct iommu_table *tbl,
+		struct iommu_table_group *table_group)
+{
+	struct iommu_table_group_link *tgl = NULL;
+
+	if (WARN_ON(!tbl || !table_group))
+		return -EINVAL;
+
+	tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL,
+			node);
+	if (!tgl)
+		return -ENOMEM;
+
+	tgl->table_group = table_group;
+	list_add_rcu(&tgl->next, &tbl->it_group_list);
+
+	table_group->tables[num] = tbl;
+
+	return 0;
+}
+
+static void pnv_iommu_table_group_link_free(struct rcu_head *head)
+{
+	struct iommu_table_group_link *tgl = container_of(head,
+			struct iommu_table_group_link, rcu);
+
+	kfree(tgl);
+}
+
+void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
+		struct iommu_table_group *table_group)
+{
+	long i;
+	bool found;
+	struct iommu_table_group_link *tgl;
+
+	if (!tbl || !table_group)
+		return;
+
+	/* Remove link to a group from table's list of attached groups */
+	found = false;
+	list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
+		if (tgl->table_group == table_group) {
+			list_del_rcu(&tgl->next);
+			call_rcu(&tgl->rcu, pnv_iommu_table_group_link_free);
+			found = true;
+			break;
+		}
+	}
+	if (WARN_ON(!found))
+		return;
+
+	/* Clean a pointer to iommu_table in iommu_table_group::tables[] */
+	found = false;
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		if (table_group->tables[i] == tbl) {
+			table_group->tables[i] = NULL;
+			found = true;
+			break;
+		}
+	}
+	WARN_ON(!found);
+}
+
 void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 			       void *tce_mem, u64 tce_size,
 			       u64 dma_offset, unsigned page_shift)
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index d8ba34d..d37cb4d 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -210,6 +210,13 @@ int pnv_pci_cfg_read(struct pci_dn *pdn,
 		     int where, int size, u32 *val);
 int pnv_pci_cfg_write(struct pci_dn *pdn,
 		      int where, int size, u32 val);
+extern struct iommu_table *pnv_pci_table_alloc(int nid);
+
+extern long pnv_pci_link_table_and_group(int node, int num,
+		struct iommu_table *tbl,
+		struct iommu_table_group *table_group);
+extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
+		struct iommu_table_group *table_group);
 extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 				      void *tce_mem, u64 tce_size,
 				      u64 dma_offset, unsigned page_shift);
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 307d704..10510de 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -37,6 +37,7 @@
 #include <linux/memory.h>
 #include <linux/of.h>
 #include <linux/iommu.h>
+#include <linux/rculist.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/rtas.h>
@@ -56,6 +57,7 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 {
 	struct iommu_table_group *table_group = NULL;
 	struct iommu_table *tbl = NULL;
+	struct iommu_table_group_link *tgl = NULL;
 
 	table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
 			   node);
@@ -66,12 +68,21 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 	if (!tbl)
 		goto fail_exit;
 
-	tbl->it_table_group = table_group;
+	tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL,
+			node);
+	if (!tgl)
+		goto fail_exit;
+
+	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+	tgl->table_group = table_group;
+	list_add_rcu(&tgl->next, &tbl->it_group_list);
+
 	table_group->tables[0] = tbl;
 
 	return table_group;
 
 fail_exit:
+	kfree(tgl);
 	kfree(table_group);
 	kfree(tbl);
 
@@ -82,18 +93,28 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group,
 		const char *node_name)
 {
 	struct iommu_table *tbl;
+#ifdef CONFIG_IOMMU_API
+	struct iommu_table_group_link *tgl;
+#endif
 
 	if (!table_group)
 		return;
 
+	tbl = table_group->tables[0];
 #ifdef CONFIG_IOMMU_API
+	tgl = list_first_entry_or_null(&tbl->it_group_list,
+			struct iommu_table_group_link, next);
+
+	WARN_ON_ONCE(!tgl);
+	if (tgl) {
+		list_del_rcu(&tgl->next);
+		kfree(tgl);
+	}
 	if (table_group->group) {
 		iommu_group_put(table_group->group);
 		BUG_ON(table_group->group);
 	}
 #endif
-
-	tbl = table_group->tables[0];
 	iommu_free_table(tbl, node_name);
 
 	kfree(table_group);
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index c4bc345..ffc634a 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -88,7 +88,7 @@ static void decrement_locked_vm(long npages)
  */
 struct tce_container {
 	struct mutex lock;
-	struct iommu_table *tbl;
+	struct iommu_group *grp;
 	bool enabled;
 	unsigned long locked_pages;
 };
@@ -103,13 +103,42 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
 	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
 }
 
+static long tce_iommu_find_table(struct tce_container *container,
+		phys_addr_t ioba, struct iommu_table **ptbl)
+{
+	long i;
+	struct iommu_table_group *table_group;
+
+	table_group = iommu_group_get_iommudata(container->grp);
+	if (!table_group)
+		return -1;
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbl = table_group->tables[i];
+
+		if (tbl) {
+			unsigned long entry = ioba >> tbl->it_page_shift;
+			unsigned long start = tbl->it_offset;
+			unsigned long end = start + tbl->it_size;
+
+			if ((start <= entry) && (entry < end)) {
+				*ptbl = tbl;
+				return i;
+			}
+		}
+	}
+
+	return -1;
+}
+
 static int tce_iommu_enable(struct tce_container *container)
 {
 	int ret = 0;
 	unsigned long locked;
-	struct iommu_table *tbl = container->tbl;
+	struct iommu_table *tbl;
+	struct iommu_table_group *table_group;
 
-	if (!container->tbl)
+	if (!container->grp)
 		return -ENXIO;
 
 	if (!current->mm)
@@ -143,6 +172,11 @@ static int tce_iommu_enable(struct tce_container *container)
 	 * as this information is only available from KVM and VFIO is
 	 * KVM agnostic.
 	 */
+	table_group = iommu_group_get_iommudata(container->grp);
+	if (!table_group)
+		return -ENODEV;
+
+	tbl = table_group->tables[0];
 	locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
 	ret = try_increment_locked_vm(locked);
 	if (ret)
@@ -190,11 +224,10 @@ static void tce_iommu_release(void *iommu_data)
 {
 	struct tce_container *container = iommu_data;
 
-	WARN_ON(container->tbl && !container->tbl->it_table_group->group);
+	WARN_ON(container->grp);
 
-	if (container->tbl && container->tbl->it_table_group->group)
-		tce_iommu_detach_group(iommu_data,
-				container->tbl->it_table_group->group);
+	if (container->grp)
+		tce_iommu_detach_group(iommu_data, container->grp);
 
 	tce_iommu_disable(container);
 	mutex_destroy(&container->lock);
@@ -312,9 +345,16 @@ static long tce_iommu_ioctl(void *iommu_data,
 
 	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
 		struct vfio_iommu_spapr_tce_info info;
-		struct iommu_table *tbl = container->tbl;
+		struct iommu_table *tbl;
+		struct iommu_table_group *table_group;
+
+		if (WARN_ON(!container->grp))
+			return -ENXIO;
+
+		table_group = iommu_group_get_iommudata(container->grp);
 
-		if (WARN_ON(!tbl))
+		tbl = table_group->tables[0];
+		if (WARN_ON_ONCE(!tbl))
 			return -ENXIO;
 
 		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
@@ -337,17 +377,13 @@ static long tce_iommu_ioctl(void *iommu_data,
 	}
 	case VFIO_IOMMU_MAP_DMA: {
 		struct vfio_iommu_type1_dma_map param;
-		struct iommu_table *tbl = container->tbl;
+		struct iommu_table *tbl = NULL;
 		unsigned long tce;
+		long num;
 
 		if (!container->enabled)
 			return -EPERM;
 
-		if (!tbl)
-			return -ENXIO;
-
-		BUG_ON(!tbl->it_table_group->group);
-
 		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 
 		if (copy_from_user(&param, (void __user *)arg, minsz))
@@ -360,6 +396,10 @@ static long tce_iommu_ioctl(void *iommu_data,
 				VFIO_DMA_MAP_FLAG_WRITE))
 			return -EINVAL;
 
+		num = tce_iommu_find_table(container, param.iova, &tbl);
+		if (num < 0)
+			return -ENXIO;
+
 		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
 				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
 			return -EINVAL;
@@ -385,14 +425,12 @@ static long tce_iommu_ioctl(void *iommu_data,
 	}
 	case VFIO_IOMMU_UNMAP_DMA: {
 		struct vfio_iommu_type1_dma_unmap param;
-		struct iommu_table *tbl = container->tbl;
+		struct iommu_table *tbl = NULL;
+		long num;
 
 		if (!container->enabled)
 			return -EPERM;
 
-		if (WARN_ON(!tbl))
-			return -ENXIO;
-
 		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
 				size);
 
@@ -406,6 +444,10 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (param.flags)
 			return -EINVAL;
 
+		num = tce_iommu_find_table(container, param.iova, &tbl);
+		if (num < 0)
+			return -ENXIO;
+
 		if (param.size & ~IOMMU_PAGE_MASK(tbl))
 			return -EINVAL;
 
@@ -434,12 +476,11 @@ static long tce_iommu_ioctl(void *iommu_data,
 		mutex_unlock(&container->lock);
 		return 0;
 	case VFIO_EEH_PE_OP:
-		if (!container->tbl || !container->tbl->it_table_group->group)
+		if (!container->grp)
 			return -ENODEV;
 
-		return vfio_spapr_iommu_eeh_ioctl(
-				container->tbl->it_table_group->group,
-				cmd, arg);
+		return vfio_spapr_iommu_eeh_ioctl(container->grp,
+						  cmd, arg);
 	}
 
 	return -ENOTTY;
@@ -450,17 +491,15 @@ static int tce_iommu_attach_group(void *iommu_data,
 {
 	int ret;
 	struct tce_container *container = iommu_data;
-	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+	struct iommu_table_group *table_group;
 
-	BUG_ON(!tbl);
 	mutex_lock(&container->lock);
 
 	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
 			iommu_group_id(iommu_group), iommu_group); */
-	if (container->tbl) {
+	if (container->grp) {
 		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
-				iommu_group_id(container->tbl->
-						it_table_group->group),
+				iommu_group_id(container->grp),
 				iommu_group_id(iommu_group));
 		ret = -EBUSY;
 		goto unlock_exit;
@@ -473,9 +512,15 @@ static int tce_iommu_attach_group(void *iommu_data,
 		goto unlock_exit;
 	}
 
-	ret = iommu_take_ownership(tbl);
+	table_group = iommu_group_get_iommudata(iommu_group);
+	if (!table_group) {
+		ret = -ENXIO;
+		goto unlock_exit;
+	}
+
+	ret = iommu_take_ownership(table_group->tables[0]);
 	if (!ret)
-		container->tbl = tbl;
+		container->grp = iommu_group;
 
 unlock_exit:
 	mutex_unlock(&container->lock);
@@ -487,26 +532,31 @@ static void tce_iommu_detach_group(void *iommu_data,
 		struct iommu_group *iommu_group)
 {
 	struct tce_container *container = iommu_data;
-	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+	struct iommu_table_group *table_group;
+	struct iommu_table *tbl;
 
-	BUG_ON(!tbl);
 	mutex_lock(&container->lock);
-	if (tbl != container->tbl) {
+	if (iommu_group != container->grp) {
 		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
 				iommu_group_id(iommu_group),
-				iommu_group_id(tbl->it_table_group->group));
+				iommu_group_id(container->grp));
 		goto unlock_exit;
 	}
 
 	if (container->enabled) {
 		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
-				iommu_group_id(tbl->it_table_group->group));
+				iommu_group_id(container->grp));
 		tce_iommu_disable(container);
 	}
 
 	/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
 	   iommu_group_id(iommu_group), iommu_group); */
-	container->tbl = NULL;
+	container->grp = NULL;
+
+	table_group = iommu_group_get_iommudata(iommu_group);
+	BUG_ON(!table_group);
+
+	tbl = table_group->tables[0];
 	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 	iommu_release_ownership(tbl);
 
-- 
cgit v0.10.2


From f87a88642e660edd8912ad39fe77848c6f9927a2 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:10 +1000
Subject: vfio: powerpc/spapr/iommu/powernv/ioda2: Rework IOMMU ownership
 control

This adds tce_iommu_take_ownership() and tce_iommu_release_ownership
which call in a loop iommu_take_ownership()/iommu_release_ownership()
for every table on the group. As there is just one now, no change in
behaviour is expected.

At the moment the iommu_table struct has a set_bypass() which enables/
disables DMA bypass on IODA2 PHB. This is exposed to POWERPC IOMMU code
which calls this callback when external IOMMU users such as VFIO are
about to get over a PHB.

The set_bypass() callback is not really an iommu_table function but
IOMMU/PE function. This introduces a iommu_table_group_ops struct and
adds take_ownership()/release_ownership() callbacks to it which are
called when an external user takes/releases control over the IOMMU.

This replaces set_bypass() with ownership callbacks as it is not
necessarily just bypass enabling, it can be something else/more
so let's give it more generic name.

The callbacks is implemented for IODA2 only. Other platforms (P5IOC2,
IODA1) will use the old iommu_take_ownership/iommu_release_ownership API.
The following patches will replace iommu_take_ownership/
iommu_release_ownership calls in IODA2 with full IOMMU table release/
create.

As we here and touching bypass control, this removes
pnv_pci_ioda2_setup_bypass_pe() as it does not do much
more compared to pnv_pci_ioda2_set_bypass. This moves tce_bypass_base
initialization to pnv_pci_ioda2_setup_dma_pe.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 44a20cc..489133c 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -93,7 +93,6 @@ struct iommu_table {
 	unsigned long  it_page_shift;/* table iommu page size */
 	struct list_head it_group_list;/* List of iommu_table_group_link */
 	struct iommu_table_ops *it_ops;
-	void (*set_bypass)(struct iommu_table *tbl, bool enable);
 };
 
 /* Pure 2^n version of get_order */
@@ -126,6 +125,15 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
 					    int nid);
 #define IOMMU_TABLE_GROUP_MAX_TABLES	1
 
+struct iommu_table_group;
+
+struct iommu_table_group_ops {
+	/* Switch ownership from platform code to external user (e.g. VFIO) */
+	void (*take_ownership)(struct iommu_table_group *table_group);
+	/* Switch ownership from external user (e.g. VFIO) back to core */
+	void (*release_ownership)(struct iommu_table_group *table_group);
+};
+
 struct iommu_table_group_link {
 	struct list_head next;
 	struct rcu_head rcu;
@@ -135,6 +143,7 @@ struct iommu_table_group_link {
 struct iommu_table_group {
 	struct iommu_group *group;
 	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+	struct iommu_table_group_ops *ops;
 };
 
 #ifdef CONFIG_IOMMU_API
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index be258b2..e7f81b7 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1047,14 +1047,6 @@ int iommu_take_ownership(struct iommu_table *tbl)
 
 	memset(tbl->it_map, 0xff, sz);
 
-	/*
-	 * Disable iommu bypass, otherwise the user can DMA to all of
-	 * our physical memory via the bypass window instead of just
-	 * the pages that has been explicitly mapped into the iommu
-	 */
-	if (tbl->set_bypass)
-		tbl->set_bypass(tbl, false);
-
 	return 0;
 }
 EXPORT_SYMBOL_GPL(iommu_take_ownership);
@@ -1068,10 +1060,6 @@ void iommu_release_ownership(struct iommu_table *tbl)
 	/* Restore bit#0 set by iommu_init_table() */
 	if (tbl->it_offset == 0)
 		set_bit(0, tbl->it_map);
-
-	/* The kernel owns the device now, we can restore the iommu bypass */
-	if (tbl->set_bypass)
-		tbl->set_bypass(tbl, true);
 }
 EXPORT_SYMBOL_GPL(iommu_release_ownership);
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3b41306..17a7752 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1919,13 +1919,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	}
 }
 
-static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
 {
-	struct iommu_table_group_link *tgl = list_first_entry_or_null(
-			&tbl->it_group_list, struct iommu_table_group_link,
-			next);
-	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
-			struct pnv_ioda_pe, table_group);
 	uint16_t window_id = (pe->pe_number << 1 ) + 1;
 	int64_t rc;
 
@@ -1952,19 +1947,31 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
 		pe->tce_bypass_enabled = enable;
 }
 
-static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb,
-					  struct pnv_ioda_pe *pe)
+#ifdef CONFIG_IOMMU_API
+static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
 {
-	/* TVE #1 is selected by PCI address bit 59 */
-	pe->tce_bypass_base = 1ull << 59;
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+						table_group);
 
-	/* Install set_bypass callback for VFIO */
-	pe->table_group.tables[0]->set_bypass = pnv_pci_ioda2_set_bypass;
+	iommu_take_ownership(table_group->tables[0]);
+	pnv_pci_ioda2_set_bypass(pe, false);
+}
 
-	/* Enable bypass by default */
-	pnv_pci_ioda2_set_bypass(pe->table_group.tables[0], true);
+static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
+{
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+						table_group);
+
+	iommu_release_ownership(table_group->tables[0]);
+	pnv_pci_ioda2_set_bypass(pe, true);
 }
 
+static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
+	.take_ownership = pnv_ioda2_take_ownership,
+	.release_ownership = pnv_ioda2_release_ownership,
+};
+#endif
+
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 				       struct pnv_ioda_pe *pe)
 {
@@ -1979,6 +1986,9 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	if (WARN_ON(pe->tce32_seg >= 0))
 		return;
 
+	/* TVE #1 is selected by PCI address bit 59 */
+	pe->tce_bypass_base = 1ull << 59;
+
 	tbl = pnv_pci_table_alloc(phb->hose->node);
 	iommu_register_group(&pe->table_group, phb->hose->global_number,
 			pe->pe_number);
@@ -2033,6 +2043,9 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	}
 	tbl->it_ops = &pnv_ioda2_iommu_ops;
 	iommu_init_table(tbl, phb->hose->node);
+#ifdef CONFIG_IOMMU_API
+	pe->table_group.ops = &pnv_pci_ioda2_ops;
+#endif
 
 	if (pe->flags & PNV_IODA_PE_DEV) {
 		/*
@@ -2047,7 +2060,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 
 	/* Also create a bypass window */
 	if (!pnv_iommu_bypass_disabled)
-		pnv_pci_ioda2_setup_bypass_pe(phb, pe);
+		pnv_pci_ioda2_set_bypass(pe, true);
 
 	return;
 fail:
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index ffc634a..9c720de 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -486,6 +486,61 @@ static long tce_iommu_ioctl(void *iommu_data,
 	return -ENOTTY;
 }
 
+static void tce_iommu_release_ownership(struct tce_container *container,
+		struct iommu_table_group *table_group)
+{
+	int i;
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbl = table_group->tables[i];
+
+		if (!tbl)
+			continue;
+
+		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+		if (tbl->it_map)
+			iommu_release_ownership(tbl);
+	}
+}
+
+static int tce_iommu_take_ownership(struct tce_container *container,
+		struct iommu_table_group *table_group)
+{
+	int i, j, rc = 0;
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbl = table_group->tables[i];
+
+		if (!tbl || !tbl->it_map)
+			continue;
+
+		rc = iommu_take_ownership(tbl);
+		if (rc) {
+			for (j = 0; j < i; ++j)
+				iommu_release_ownership(
+						table_group->tables[j]);
+
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
+static void tce_iommu_release_ownership_ddw(struct tce_container *container,
+		struct iommu_table_group *table_group)
+{
+	table_group->ops->release_ownership(table_group);
+}
+
+static long tce_iommu_take_ownership_ddw(struct tce_container *container,
+		struct iommu_table_group *table_group)
+{
+	table_group->ops->take_ownership(table_group);
+
+	return 0;
+}
+
 static int tce_iommu_attach_group(void *iommu_data,
 		struct iommu_group *iommu_group)
 {
@@ -518,7 +573,12 @@ static int tce_iommu_attach_group(void *iommu_data,
 		goto unlock_exit;
 	}
 
-	ret = iommu_take_ownership(table_group->tables[0]);
+	if (!table_group->ops || !table_group->ops->take_ownership ||
+			!table_group->ops->release_ownership)
+		ret = tce_iommu_take_ownership(container, table_group);
+	else
+		ret = tce_iommu_take_ownership_ddw(container, table_group);
+
 	if (!ret)
 		container->grp = iommu_group;
 
@@ -533,7 +593,6 @@ static void tce_iommu_detach_group(void *iommu_data,
 {
 	struct tce_container *container = iommu_data;
 	struct iommu_table_group *table_group;
-	struct iommu_table *tbl;
 
 	mutex_lock(&container->lock);
 	if (iommu_group != container->grp) {
@@ -556,9 +615,10 @@ static void tce_iommu_detach_group(void *iommu_data,
 	table_group = iommu_group_get_iommudata(iommu_group);
 	BUG_ON(!table_group);
 
-	tbl = table_group->tables[0];
-	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
-	iommu_release_ownership(tbl);
+	if (!table_group->ops || !table_group->ops->release_ownership)
+		tce_iommu_release_ownership(container, table_group);
+	else
+		tce_iommu_release_ownership_ddw(container, table_group);
 
 unlock_exit:
 	mutex_unlock(&container->lock);
-- 
cgit v0.10.2


From b82c75bfbeb5b4c7c23ee09dd6f295c7f06aeb25 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:11 +1000
Subject: powerpc/iommu: Fix IOMMU ownership control functions

This adds missing locks in iommu_take_ownership()/
iommu_release_ownership().

This marks all pages busy in iommu_table::it_map in order to catch
errors if there is an attempt to use this table while ownership over it
is taken.

This only clears TCE content if there is no page marked busy in it_map.
Clearing must be done outside of the table locks as iommu_clear_tce()
called from iommu_clear_tces_and_put_pages() does this.

In order to use bitmap_empty(), the existing code clears bit#0 which
is set even in an empty table if it is bus-mapped at 0 as
iommu_init_table() reserves page#0 to prevent buggy drivers
from crashing when allocated page is bus-mapped at zero
(which is correct). This restores the bit in the case of failure
to bring the it_map to the state it was in when we called
iommu_take_ownership().

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index e7f81b7..0fb8800 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1035,31 +1035,51 @@ EXPORT_SYMBOL_GPL(iommu_tce_build);
 
 int iommu_take_ownership(struct iommu_table *tbl)
 {
-	unsigned long sz = (tbl->it_size + 7) >> 3;
+	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
+	int ret = 0;
+
+	spin_lock_irqsave(&tbl->large_pool.lock, flags);
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_lock(&tbl->pools[i].lock);
 
 	if (tbl->it_offset == 0)
 		clear_bit(0, tbl->it_map);
 
 	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
 		pr_err("iommu_tce: it_map is not empty");
-		return -EBUSY;
+		ret = -EBUSY;
+		/* Restore bit#0 set by iommu_init_table() */
+		if (tbl->it_offset == 0)
+			set_bit(0, tbl->it_map);
+	} else {
+		memset(tbl->it_map, 0xff, sz);
 	}
 
-	memset(tbl->it_map, 0xff, sz);
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_unlock(&tbl->pools[i].lock);
+	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
 
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_take_ownership);
 
 void iommu_release_ownership(struct iommu_table *tbl)
 {
-	unsigned long sz = (tbl->it_size + 7) >> 3;
+	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
+
+	spin_lock_irqsave(&tbl->large_pool.lock, flags);
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_lock(&tbl->pools[i].lock);
 
 	memset(tbl->it_map, 0, sz);
 
 	/* Restore bit#0 set by iommu_init_table() */
 	if (tbl->it_offset == 0)
 		set_bit(0, tbl->it_map);
+
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_unlock(&tbl->pools[i].lock);
+	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
 }
 EXPORT_SYMBOL_GPL(iommu_release_ownership);
 
-- 
cgit v0.10.2


From 5780fb04263c16fdb9affd1012d5eb956e0cbcd9 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:12 +1000
Subject: powerpc/powernv/ioda2: Move TCE kill register address to PE

At the moment the DMA setup code looks for the "ibm,opal-tce-kill"
property which contains the TCE kill register address. Writing to
this register invalidates TCE cache on IODA/IODA2 hub.

This moves the register address from iommu_table to pnv_pnb as this
register belongs to PHB and invalidates TCE cache for all tables of
all attached PEs.

This moves the property reading/remapping code to a helper which is
called when DMA is being configured for PE and which does DMA setup
for both IODA1 and IODA2.

This adds a new pnv_pci_ioda2_tce_invalidate_entire() helper which
invalidates cache for the entire table. It should be called after
every call to opal_pci_map_pe_dma_window(). It was not required before
because there was just a single TCE table and 64bit DMA was handled via
bypass window (which has no table so no cache was used) but this is going
to change with Dynamic DMA windows (DDW).

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 17a7752..6bbb2bb 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1680,8 +1680,8 @@ static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
 	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
 			struct pnv_ioda_pe, table_group);
 	__be64 __iomem *invalidate = rm ?
-		(__be64 __iomem *)pe->tce_inval_reg_phys :
-		(__be64 __iomem *)tbl->it_index;
+		(__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
+		pe->phb->ioda.tce_inval_reg;
 	unsigned long start, end, inc;
 	const unsigned shift = tbl->it_page_shift;
 
@@ -1752,6 +1752,19 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
 	.get = pnv_tce_get,
 };
 
+static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe)
+{
+	/* 01xb - invalidate TCEs that match the specified PE# */
+	unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF);
+	struct pnv_phb *phb = pe->phb;
+
+	if (!phb->ioda.tce_inval_reg)
+		return;
+
+	mb(); /* Ensure above stores are visible */
+	__raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg);
+}
+
 static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
 		unsigned long index, unsigned long npages, bool rm)
 {
@@ -1762,8 +1775,8 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
 			struct pnv_ioda_pe, table_group);
 	unsigned long start, end, inc;
 	__be64 __iomem *invalidate = rm ?
-		(__be64 __iomem *)pe->tce_inval_reg_phys :
-		(__be64 __iomem *)tbl->it_index;
+		(__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
+		pe->phb->ioda.tce_inval_reg;
 	const unsigned shift = tbl->it_page_shift;
 
 	/* We'll invalidate DMA address in PE scope */
@@ -1821,7 +1834,6 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 {
 
 	struct page *tce_mem = NULL;
-	const __be64 *swinvp;
 	struct iommu_table *tbl;
 	unsigned int i;
 	int64_t rc;
@@ -1878,20 +1890,11 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 				  base << 28, IOMMU_PAGE_SHIFT_4K);
 
 	/* OPAL variant of P7IOC SW invalidated TCEs */
-	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
-	if (swinvp) {
-		/* We need a couple more fields -- an address and a data
-		 * to or.  Since the bus is only printed out on table free
-		 * errors, and on the first pass the data will be a relative
-		 * bus number, print that out instead.
-		 */
-		pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
-		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
-				8);
+	if (phb->ioda.tce_inval_reg)
 		tbl->it_type |= (TCE_PCI_SWINV_CREATE |
 				 TCE_PCI_SWINV_FREE   |
 				 TCE_PCI_SWINV_PAIR);
-	}
+
 	tbl->it_ops = &pnv_ioda1_iommu_ops;
 	iommu_init_table(tbl, phb->hose->node);
 
@@ -1972,12 +1975,24 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
 };
 #endif
 
+static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb)
+{
+	const __be64 *swinvp;
+
+	/* OPAL variant of PHB3 invalidated TCEs */
+	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
+	if (!swinvp)
+		return;
+
+	phb->ioda.tce_inval_reg_phys = be64_to_cpup(swinvp);
+	phb->ioda.tce_inval_reg = ioremap(phb->ioda.tce_inval_reg_phys, 8);
+}
+
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 				       struct pnv_ioda_pe *pe)
 {
 	struct page *tce_mem = NULL;
 	void *addr;
-	const __be64 *swinvp;
 	struct iommu_table *tbl;
 	unsigned int tce_table_size, end;
 	int64_t rc;
@@ -2024,23 +2039,16 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 		goto fail;
 	}
 
+	pnv_pci_ioda2_tce_invalidate_entire(pe);
+
 	/* Setup linux iommu table */
 	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
 			IOMMU_PAGE_SHIFT_4K);
 
 	/* OPAL variant of PHB3 invalidated TCEs */
-	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
-	if (swinvp) {
-		/* We need a couple more fields -- an address and a data
-		 * to or.  Since the bus is only printed out on table free
-		 * errors, and on the first pass the data will be a relative
-		 * bus number, print that out instead.
-		 */
-		pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
-		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
-				8);
+	if (phb->ioda.tce_inval_reg)
 		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
-	}
+
 	tbl->it_ops = &pnv_ioda2_iommu_ops;
 	iommu_init_table(tbl, phb->hose->node);
 #ifdef CONFIG_IOMMU_API
@@ -2096,6 +2104,8 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb)
 	pr_info("PCI: %d PE# for a total weight of %d\n",
 		phb->ioda.dma_pe_count, phb->ioda.dma_weight);
 
+	pnv_pci_ioda_setup_opal_tce_kill(phb);
+
 	/* Walk our PE list and configure their DMA segments, hand them
 	 * out one base segment plus any residual segments based on
 	 * weight
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index d37cb4d..792a723 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -58,7 +58,6 @@ struct pnv_ioda_pe {
 	int			tce32_seg;
 	int			tce32_segcount;
 	struct iommu_table_group table_group;
-	phys_addr_t		tce_inval_reg_phys;
 
 	/* 64-bit TCE bypass region */
 	bool			tce_bypass_enabled;
@@ -184,6 +183,12 @@ struct pnv_phb {
 			 * boot for resource allocation purposes
 			 */
 			struct list_head	pe_dma_list;
+
+			/* TCE cache invalidate registers (physical and
+			 * remapped)
+			 */
+			phys_addr_t		tce_inval_reg_phys;
+			__be64 __iomem		*tce_inval_reg;
 		} ioda;
 	};
 
-- 
cgit v0.10.2


From e57080f17da5d50f7b3b9d6c045f28632d71309d Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:13 +1000
Subject: powerpc/powernv/ioda2: Add TCE invalidation for all attached groups

The iommu_table struct keeps a list of IOMMU groups it is used for.
At the moment there is just a single group attached but further
patches will add TCE table sharing. When sharing is enabled, TCE cache
in each PE needs to be invalidated so does the patch.

This does not change pnv_pci_ioda1_tce_invalidate() as there is no plan
to enable TCE table sharing on PHBs older than IODA2.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 6bbb2bb..3908636 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -24,6 +24,7 @@
 #include <linux/msi.h>
 #include <linux/memblock.h>
 #include <linux/iommu.h>
+#include <linux/rculist.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -1765,23 +1766,15 @@ static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe)
 	__raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg);
 }
 
-static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
-		unsigned long index, unsigned long npages, bool rm)
+static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm,
+		__be64 __iomem *invalidate, unsigned shift,
+		unsigned long index, unsigned long npages)
 {
-	struct iommu_table_group_link *tgl = list_first_entry_or_null(
-			&tbl->it_group_list, struct iommu_table_group_link,
-			next);
-	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
-			struct pnv_ioda_pe, table_group);
 	unsigned long start, end, inc;
-	__be64 __iomem *invalidate = rm ?
-		(__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
-		pe->phb->ioda.tce_inval_reg;
-	const unsigned shift = tbl->it_page_shift;
 
 	/* We'll invalidate DMA address in PE scope */
 	start = 0x2ull << 60;
-	start |= (pe->pe_number & 0xFF);
+	start |= (pe_number & 0xFF);
 	end = start;
 
 	/* Figure out the start, end and step */
@@ -1799,6 +1792,24 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
 	}
 }
 
+static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
+		unsigned long index, unsigned long npages, bool rm)
+{
+	struct iommu_table_group_link *tgl;
+
+	list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
+		struct pnv_ioda_pe *pe = container_of(tgl->table_group,
+				struct pnv_ioda_pe, table_group);
+		__be64 __iomem *invalidate = rm ?
+			(__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
+			pe->phb->ioda.tce_inval_reg;
+
+		pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm,
+			invalidate, tbl->it_page_shift,
+			index, npages);
+	}
+}
+
 static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
 		long npages, unsigned long uaddr,
 		enum dma_data_direction direction,
-- 
cgit v0.10.2


From c5bb44edee19b2c19221a0b5a68add37ea5733c5 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:14 +1000
Subject: powerpc/powernv: Implement accessor to TCE entry

This replaces direct accesses to TCE table with a helper which
returns an TCE entry address. This does not make difference now but will
when multi-level TCE tables get introduces.

No change in behavior is expected.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 00f1abd..a07b832 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -572,38 +572,46 @@ struct pci_ops pnv_pci_ops = {
 	.write = pnv_pci_write_config,
 };
 
+static __be64 *pnv_tce(struct iommu_table *tbl, long idx)
+{
+	__be64 *tmp = ((__be64 *)tbl->it_base);
+
+	return tmp + idx;
+}
+
 int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 		unsigned long uaddr, enum dma_data_direction direction,
 		struct dma_attrs *attrs)
 {
 	u64 proto_tce = iommu_direction_to_tce_perm(direction);
-	__be64 *tcep;
-	u64 rpn;
-
-	tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
-	rpn = __pa(uaddr) >> tbl->it_page_shift;
+	u64 rpn = __pa(uaddr) >> tbl->it_page_shift;
+	long i;
 
-	while (npages--)
-		*(tcep++) = cpu_to_be64(proto_tce |
-				(rpn++ << tbl->it_page_shift));
+	for (i = 0; i < npages; i++) {
+		unsigned long newtce = proto_tce |
+			((rpn + i) << tbl->it_page_shift);
+		unsigned long idx = index - tbl->it_offset + i;
 
+		*(pnv_tce(tbl, idx)) = cpu_to_be64(newtce);
+	}
 
 	return 0;
 }
 
 void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
 {
-	__be64 *tcep;
+	long i;
 
-	tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
+	for (i = 0; i < npages; i++) {
+		unsigned long idx = index - tbl->it_offset + i;
 
-	while (npages--)
-		*(tcep++) = cpu_to_be64(0);
+		*(pnv_tce(tbl, idx)) = cpu_to_be64(0);
+	}
 }
 
 unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
 {
-	return ((u64 *)tbl->it_base)[index - tbl->it_offset];
+	return *(pnv_tce(tbl, index - tbl->it_offset));
 }
 
 struct iommu_table *pnv_pci_table_alloc(int nid)
-- 
cgit v0.10.2


From 05c6cfb9dce0d13d37e9d007ee6a4af36f1c0a58 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:15 +1000
Subject: powerpc/iommu/powernv: Release replaced TCE

At the moment writing new TCE value to the IOMMU table fails with EBUSY
if there is a valid entry already. However PAPR specification allows
the guest to write new TCE value without clearing it first.

Another problem this patch is addressing is the use of pool locks for
external IOMMU users such as VFIO. The pool locks are to protect
DMA page allocator rather than entries and since the host kernel does
not control what pages are in use, there is no point in pool locks and
exchange()+put_page(oldtce) is sufficient to avoid possible races.

This adds an exchange() callback to iommu_table_ops which does the same
thing as set() plus it returns replaced TCE and DMA direction so
the caller can release the pages afterwards. The exchange() receives
a physical address unlike set() which receives linear mapping address;
and returns a physical address as the clear() does.

This implements exchange() for P5IOC2/IODA/IODA2. This adds a requirement
for a platform to have exchange() implemented in order to support VFIO.

This replaces iommu_tce_build() and iommu_clear_tce() with
a single iommu_tce_xchg().

This makes sure that TCE permission bits are not set in TCE passed to
IOMMU API as those are to be calculated by platform code from
DMA direction.

This moves SetPageDirty() to the IOMMU code to make it work for both
VFIO ioctl interface in in-kernel TCE acceleration (when it becomes
available later).

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 489133c..4636734 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -45,13 +45,29 @@ extern int iommu_is_off;
 extern int iommu_force_on;
 
 struct iommu_table_ops {
+	/*
+	 * When called with direction==DMA_NONE, it is equal to clear().
+	 * uaddr is a linear map address.
+	 */
 	int (*set)(struct iommu_table *tbl,
 			long index, long npages,
 			unsigned long uaddr,
 			enum dma_data_direction direction,
 			struct dma_attrs *attrs);
+#ifdef CONFIG_IOMMU_API
+	/*
+	 * Exchanges existing TCE with new TCE plus direction bits;
+	 * returns old TCE and DMA direction mask.
+	 * @tce is a physical address.
+	 */
+	int (*exchange)(struct iommu_table *tbl,
+			long index,
+			unsigned long *hpa,
+			enum dma_data_direction *direction);
+#endif
 	void (*clear)(struct iommu_table *tbl,
 			long index, long npages);
+	/* get() returns a physical address */
 	unsigned long (*get)(struct iommu_table *tbl, long index);
 	void (*flush)(struct iommu_table *tbl);
 };
@@ -153,6 +169,8 @@ extern void iommu_register_group(struct iommu_table_group *table_group,
 extern int iommu_add_device(struct device *dev);
 extern void iommu_del_device(struct device *dev);
 extern int __init tce_iommu_bus_notifier_init(void);
+extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
+		unsigned long *hpa, enum dma_data_direction *direction);
 #else
 static inline void iommu_register_group(struct iommu_table_group *table_group,
 					int pci_domain_number,
@@ -225,10 +243,6 @@ extern int iommu_tce_clear_param_check(struct iommu_table *tbl,
 		unsigned long npages);
 extern int iommu_tce_put_param_check(struct iommu_table *tbl,
 		unsigned long ioba, unsigned long tce);
-extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
-		unsigned long hwaddr, enum dma_data_direction direction);
-extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
-		unsigned long entry);
 
 extern void iommu_flush_tce(struct iommu_table *tbl);
 extern int iommu_take_ownership(struct iommu_table *tbl);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 0fb8800..a8e3490 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -965,10 +965,7 @@ EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check);
 int iommu_tce_put_param_check(struct iommu_table *tbl,
 		unsigned long ioba, unsigned long tce)
 {
-	if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ)))
-		return -EINVAL;
-
-	if (tce & ~(IOMMU_PAGE_MASK(tbl) | TCE_PCI_WRITE | TCE_PCI_READ))
+	if (tce & ~IOMMU_PAGE_MASK(tbl))
 		return -EINVAL;
 
 	if (ioba & ~IOMMU_PAGE_MASK(tbl))
@@ -985,44 +982,16 @@ int iommu_tce_put_param_check(struct iommu_table *tbl,
 }
 EXPORT_SYMBOL_GPL(iommu_tce_put_param_check);
 
-unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
+long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
+		unsigned long *hpa, enum dma_data_direction *direction)
 {
-	unsigned long oldtce;
-	struct iommu_pool *pool = get_pool(tbl, entry);
+	long ret;
 
-	spin_lock(&(pool->lock));
+	ret = tbl->it_ops->exchange(tbl, entry, hpa, direction);
 
-	oldtce = tbl->it_ops->get(tbl, entry);
-	if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
-		tbl->it_ops->clear(tbl, entry, 1);
-	else
-		oldtce = 0;
-
-	spin_unlock(&(pool->lock));
-
-	return oldtce;
-}
-EXPORT_SYMBOL_GPL(iommu_clear_tce);
-
-/*
- * hwaddr is a kernel virtual address here (0xc... bazillion),
- * tce_build converts it to a physical address.
- */
-int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
-		unsigned long hwaddr, enum dma_data_direction direction)
-{
-	int ret = -EBUSY;
-	unsigned long oldtce;
-	struct iommu_pool *pool = get_pool(tbl, entry);
-
-	spin_lock(&(pool->lock));
-
-	oldtce = tbl->it_ops->get(tbl, entry);
-	/* Add new entry if it is not busy */
-	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
-		ret = tbl->it_ops->set(tbl, entry, 1, hwaddr, direction, NULL);
-
-	spin_unlock(&(pool->lock));
+	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
+			(*direction == DMA_BIDIRECTIONAL)))
+		SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));
 
 	/* if (unlikely(ret))
 		pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
@@ -1031,13 +1000,23 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(iommu_tce_build);
+EXPORT_SYMBOL_GPL(iommu_tce_xchg);
 
 int iommu_take_ownership(struct iommu_table *tbl)
 {
 	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
 	int ret = 0;
 
+	/*
+	 * VFIO does not control TCE entries allocation and the guest
+	 * can write new TCEs on top of existing ones so iommu_tce_build()
+	 * must be able to release old pages. This functionality
+	 * requires exchange() callback defined so if it is not
+	 * implemented, we disallow taking ownership over the table.
+	 */
+	if (!tbl->it_ops->exchange)
+		return -EINVAL;
+
 	spin_lock_irqsave(&tbl->large_pool.lock, flags);
 	for (i = 0; i < tbl->nr_pools; i++)
 		spin_lock(&tbl->pools[i].lock);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3908636..ecbc071 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1738,6 +1738,20 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
 	return ret;
 }
 
+#ifdef CONFIG_IOMMU_API
+static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction)
+{
+	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
+
+	if (!ret && (tbl->it_type &
+			(TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE)))
+		pnv_pci_ioda1_tce_invalidate(tbl, index, 1, false);
+
+	return ret;
+}
+#endif
+
 static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
 		long npages)
 {
@@ -1749,6 +1763,9 @@ static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
 
 static struct iommu_table_ops pnv_ioda1_iommu_ops = {
 	.set = pnv_ioda1_tce_build,
+#ifdef CONFIG_IOMMU_API
+	.exchange = pnv_ioda1_tce_xchg,
+#endif
 	.clear = pnv_ioda1_tce_free,
 	.get = pnv_tce_get,
 };
@@ -1824,6 +1841,20 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
 	return ret;
 }
 
+#ifdef CONFIG_IOMMU_API
+static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction)
+{
+	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
+
+	if (!ret && (tbl->it_type &
+			(TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE)))
+		pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
+
+	return ret;
+}
+#endif
+
 static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
 		long npages)
 {
@@ -1835,6 +1866,9 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
 
 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
 	.set = pnv_ioda2_tce_build,
+#ifdef CONFIG_IOMMU_API
+	.exchange = pnv_ioda2_tce_xchg,
+#endif
 	.clear = pnv_ioda2_tce_free,
 	.get = pnv_tce_get,
 };
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
index f80e5a1..eaec85b 100644
--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -85,6 +85,9 @@ static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { }
 
 static struct iommu_table_ops pnv_p5ioc2_iommu_ops = {
 	.set = pnv_tce_build,
+#ifdef CONFIG_IOMMU_API
+	.exchange = pnv_tce_xchg,
+#endif
 	.clear = pnv_tce_free,
 	.get = pnv_tce_get,
 };
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index a07b832..d465b9c 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -598,6 +598,24 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 	return 0;
 }
 
+#ifdef CONFIG_IOMMU_API
+int pnv_tce_xchg(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction)
+{
+	u64 proto_tce = iommu_direction_to_tce_perm(*direction);
+	unsigned long newtce = *hpa | proto_tce, oldtce;
+	unsigned long idx = index - tbl->it_offset;
+
+	BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl));
+
+	oldtce = xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce));
+	*hpa = be64_to_cpu(oldtce) & ~(TCE_PCI_READ | TCE_PCI_WRITE);
+	*direction = iommu_tce_direction(oldtce);
+
+	return 0;
+}
+#endif
+
 void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
 {
 	long i;
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 792a723..8ef2d28 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -207,6 +207,8 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 		unsigned long uaddr, enum dma_data_direction direction,
 		struct dma_attrs *attrs);
 extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
+extern int pnv_tce_xchg(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction);
 extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
 
 void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 9c720de..a9e2d13 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -236,18 +236,11 @@ static void tce_iommu_release(void *iommu_data)
 }
 
 static void tce_iommu_unuse_page(struct tce_container *container,
-		unsigned long oldtce)
+		unsigned long hpa)
 {
 	struct page *page;
 
-	if (!(oldtce & (TCE_PCI_READ | TCE_PCI_WRITE)))
-		return;
-
-	page = pfn_to_page(oldtce >> PAGE_SHIFT);
-
-	if (oldtce & TCE_PCI_WRITE)
-		SetPageDirty(page);
-
+	page = pfn_to_page(hpa >> PAGE_SHIFT);
 	put_page(page);
 }
 
@@ -255,14 +248,21 @@ static int tce_iommu_clear(struct tce_container *container,
 		struct iommu_table *tbl,
 		unsigned long entry, unsigned long pages)
 {
-	unsigned long oldtce;
+	unsigned long oldhpa;
+	long ret;
+	enum dma_data_direction direction;
 
 	for ( ; pages; --pages, ++entry) {
-		oldtce = iommu_clear_tce(tbl, entry);
-		if (!oldtce)
+		direction = DMA_NONE;
+		oldhpa = 0;
+		ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
+		if (ret)
+			continue;
+
+		if (direction == DMA_NONE)
 			continue;
 
-		tce_iommu_unuse_page(container, oldtce);
+		tce_iommu_unuse_page(container, oldhpa);
 	}
 
 	return 0;
@@ -284,12 +284,13 @@ static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
 
 static long tce_iommu_build(struct tce_container *container,
 		struct iommu_table *tbl,
-		unsigned long entry, unsigned long tce, unsigned long pages)
+		unsigned long entry, unsigned long tce, unsigned long pages,
+		enum dma_data_direction direction)
 {
 	long i, ret = 0;
 	struct page *page;
 	unsigned long hpa;
-	enum dma_data_direction direction = iommu_tce_direction(tce);
+	enum dma_data_direction dirtmp;
 
 	for (i = 0; i < pages; ++i) {
 		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
@@ -305,8 +306,8 @@ static long tce_iommu_build(struct tce_container *container,
 		}
 
 		hpa |= offset;
-		ret = iommu_tce_build(tbl, entry + i, (unsigned long) __va(hpa),
-				direction);
+		dirtmp = direction;
+		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
 		if (ret) {
 			tce_iommu_unuse_page(container, hpa);
 			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
@@ -314,6 +315,10 @@ static long tce_iommu_build(struct tce_container *container,
 					tce, ret);
 			break;
 		}
+
+		if (dirtmp != DMA_NONE)
+			tce_iommu_unuse_page(container, hpa);
+
 		tce += IOMMU_PAGE_SIZE(tbl);
 	}
 
@@ -378,8 +383,8 @@ static long tce_iommu_ioctl(void *iommu_data,
 	case VFIO_IOMMU_MAP_DMA: {
 		struct vfio_iommu_type1_dma_map param;
 		struct iommu_table *tbl = NULL;
-		unsigned long tce;
 		long num;
+		enum dma_data_direction direction;
 
 		if (!container->enabled)
 			return -EPERM;
@@ -405,19 +410,27 @@ static long tce_iommu_ioctl(void *iommu_data,
 			return -EINVAL;
 
 		/* iova is checked by the IOMMU API */
-		tce = param.vaddr;
-		if (param.flags & VFIO_DMA_MAP_FLAG_READ)
-			tce |= TCE_PCI_READ;
-		if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
-			tce |= TCE_PCI_WRITE;
+		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
+			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+				direction = DMA_BIDIRECTIONAL;
+			else
+				direction = DMA_TO_DEVICE;
+		} else {
+			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+				direction = DMA_FROM_DEVICE;
+			else
+				return -EINVAL;
+		}
 
-		ret = iommu_tce_put_param_check(tbl, param.iova, tce);
+		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
 		if (ret)
 			return ret;
 
 		ret = tce_iommu_build(container, tbl,
 				param.iova >> tbl->it_page_shift,
-				tce, param.size >> tbl->it_page_shift);
+				param.vaddr,
+				param.size >> tbl->it_page_shift,
+				direction);
 
 		iommu_flush_tce(tbl);
 
-- 
cgit v0.10.2


From e5aad1e678746af663e3e3acc7f3501309997f51 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:16 +1000
Subject: powerpc/powernv/ioda2: Rework iommu_table creation

This moves iommu_table creation to the beginning to make following changes
easier to review. This starts using table parameters from the iommu_table
struct.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index ecbc071..feaba5e 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2071,13 +2071,23 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	addr = page_address(tce_mem);
 	memset(addr, 0, tce_table_size);
 
+	/* Setup linux iommu table */
+	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
+			IOMMU_PAGE_SHIFT_4K);
+
+	tbl->it_ops = &pnv_ioda2_iommu_ops;
+	iommu_init_table(tbl, phb->hose->node);
+#ifdef CONFIG_IOMMU_API
+	pe->table_group.ops = &pnv_pci_ioda2_ops;
+#endif
+
 	/*
 	 * Map TCE table through TVT. The TVE index is the PE number
 	 * shifted by 1 bit for 32-bits DMA space.
 	 */
 	rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
-					pe->pe_number << 1, 1, __pa(addr),
-					tce_table_size, 0x1000);
+			pe->pe_number << 1, 1, __pa(tbl->it_base),
+			tbl->it_size << 3, 1ULL << tbl->it_page_shift);
 	if (rc) {
 		pe_err(pe, "Failed to configure 32-bit TCE table,"
 		       " err %ld\n", rc);
@@ -2086,20 +2096,10 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 
 	pnv_pci_ioda2_tce_invalidate_entire(pe);
 
-	/* Setup linux iommu table */
-	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
-			IOMMU_PAGE_SHIFT_4K);
-
 	/* OPAL variant of PHB3 invalidated TCEs */
 	if (phb->ioda.tce_inval_reg)
 		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
 
-	tbl->it_ops = &pnv_ioda2_iommu_ops;
-	iommu_init_table(tbl, phb->hose->node);
-#ifdef CONFIG_IOMMU_API
-	pe->table_group.ops = &pnv_pci_ioda2_ops;
-#endif
-
 	if (pe->flags & PNV_IODA_PE_DEV) {
 		/*
 		 * Setting table base here only for carrying iommu_group
-- 
cgit v0.10.2


From aca6913f555176363cda21518ff79da7469ebd64 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:17 +1000
Subject: powerpc/powernv/ioda2: Introduce helpers to allocate TCE pages

This is a part of moving TCE table allocation into an iommu_ops
callback to support multiple IOMMU groups per one VFIO container.

This moves the code which allocates the actual TCE tables to helpers:
pnv_pci_ioda2_table_alloc_pages() and pnv_pci_ioda2_table_free_pages().
These do not allocate/free the iommu_table struct.

This enforces window size to be a power of two.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index feaba5e..e61610c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -40,6 +40,7 @@
 #include <asm/debug.h>
 #include <asm/firmware.h>
 #include <asm/pnv-pci.h>
+#include <asm/mmzone.h>
 
 #include <misc/cxl-base.h>
 
@@ -49,6 +50,8 @@
 /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
 #define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
 
+static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
+
 static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 			    const char *fmt, ...)
 {
@@ -1313,8 +1316,8 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 		iommu_group_put(pe->table_group.group);
 		BUG_ON(pe->table_group.group);
 	}
+	pnv_pci_ioda2_table_free_pages(tbl);
 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
-	free_pages(addr, get_order(TCE32_TABLE_SIZE));
 }
 
 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
@@ -2033,13 +2036,62 @@ static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb)
 	phb->ioda.tce_inval_reg = ioremap(phb->ioda.tce_inval_reg_phys, 8);
 }
 
-static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
-				       struct pnv_ioda_pe *pe)
+static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift)
 {
 	struct page *tce_mem = NULL;
+	__be64 *addr;
+	unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
+
+	tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
+	if (!tce_mem) {
+		pr_err("Failed to allocate a TCE memory, order=%d\n", order);
+		return NULL;
+	}
+	addr = page_address(tce_mem);
+	memset(addr, 0, 1UL << (order + PAGE_SHIFT));
+
+	return addr;
+}
+
+static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
+		__u32 page_shift, __u64 window_size, struct iommu_table *tbl)
+{
 	void *addr;
+	const unsigned window_shift = ilog2(window_size);
+	unsigned entries_shift = window_shift - page_shift;
+	unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT);
+	const unsigned long tce_table_size = 1UL << table_shift;
+
+	if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
+		return -EINVAL;
+
+	/* Allocate TCE table */
+	addr = pnv_pci_ioda2_table_do_alloc_pages(nid, table_shift);
+	if (!addr)
+		return -ENOMEM;
+
+	/* Setup linux iommu table */
+	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset,
+			page_shift);
+
+	pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n",
+			window_size, tce_table_size, bus_offset);
+
+	return 0;
+}
+
+static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
+{
+	if (!tbl->it_size)
+		return;
+
+	free_pages(tbl->it_base, get_order(tbl->it_size << 3));
+}
+
+static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+				       struct pnv_ioda_pe *pe)
+{
 	struct iommu_table *tbl;
-	unsigned int tce_table_size, end;
 	int64_t rc;
 
 	/* We shouldn't already have a 32-bit DMA associated */
@@ -2056,24 +2108,16 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 
 	/* The PE will reserve all possible 32-bits space */
 	pe->tce32_seg = 0;
-	end = (1 << ilog2(phb->ioda.m32_pci_base));
-	tce_table_size = (end / 0x1000) * 8;
 	pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
-		end);
+		phb->ioda.m32_pci_base);
 
-	/* Allocate TCE table */
-	tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
-				   get_order(tce_table_size));
-	if (!tce_mem) {
-		pe_err(pe, "Failed to allocate a 32-bit TCE memory\n");
+	/* Setup linux iommu table */
+	rc = pnv_pci_ioda2_table_alloc_pages(pe->phb->hose->node,
+			0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base, tbl);
+	if (rc) {
+		pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc);
 		goto fail;
 	}
-	addr = page_address(tce_mem);
-	memset(addr, 0, tce_table_size);
-
-	/* Setup linux iommu table */
-	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
-			IOMMU_PAGE_SHIFT_4K);
 
 	tbl->it_ops = &pnv_ioda2_iommu_ops;
 	iommu_init_table(tbl, phb->hose->node);
@@ -2119,9 +2163,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 fail:
 	if (pe->tce32_seg >= 0)
 		pe->tce32_seg = -1;
-	if (tce_mem)
-		__free_pages(tce_mem, get_order(tce_table_size));
 	if (tbl) {
+		pnv_pci_ioda2_table_free_pages(tbl);
 		pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
 		iommu_free_table(tbl, "pnv");
 	}
-- 
cgit v0.10.2


From 43cb60ab7f8c91b9bf5988edc318dee99ec93b9b Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:18 +1000
Subject: powerpc/powernv/ioda2: Introduce pnv_pci_ioda2_set_window

This is a part of moving DMA window programming to an iommu_ops
callback. pnv_pci_ioda2_set_window() takes an iommu_table_group as
a first parameter (not pnv_ioda_pe) as it is going to be used as
a callback for VFIO DDW code.

This should cause no behavioural change.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index e61610c..552f6ac 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1970,6 +1970,43 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	}
 }
 
+static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
+		int num, struct iommu_table *tbl)
+{
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
+	struct pnv_phb *phb = pe->phb;
+	int64_t rc;
+	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
+	const __u64 win_size = tbl->it_size << tbl->it_page_shift;
+
+	pe_info(pe, "Setting up window %llx..%llx pg=%x\n",
+			start_addr, start_addr + win_size - 1,
+			IOMMU_PAGE_SIZE(tbl));
+
+	/*
+	 * Map TCE table through TVT. The TVE index is the PE number
+	 * shifted by 1 bit for 32-bits DMA space.
+	 */
+	rc = opal_pci_map_pe_dma_window(phb->opal_id,
+			pe->pe_number,
+			pe->pe_number << 1,
+			1,
+			__pa(tbl->it_base),
+			tbl->it_size << 3,
+			IOMMU_PAGE_SIZE(tbl));
+	if (rc) {
+		pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
+		return rc;
+	}
+
+	pnv_pci_link_table_and_group(phb->hose->node, num,
+			tbl, &pe->table_group);
+	pnv_pci_ioda2_tce_invalidate_entire(pe);
+
+	return 0;
+}
+
 static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
 {
 	uint16_t window_id = (pe->pe_number << 1 ) + 1;
@@ -2125,21 +2162,13 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	pe->table_group.ops = &pnv_pci_ioda2_ops;
 #endif
 
-	/*
-	 * Map TCE table through TVT. The TVE index is the PE number
-	 * shifted by 1 bit for 32-bits DMA space.
-	 */
-	rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
-			pe->pe_number << 1, 1, __pa(tbl->it_base),
-			tbl->it_size << 3, 1ULL << tbl->it_page_shift);
+	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
 	if (rc) {
 		pe_err(pe, "Failed to configure 32-bit TCE table,"
 		       " err %ld\n", rc);
 		goto fail;
 	}
 
-	pnv_pci_ioda2_tce_invalidate_entire(pe);
-
 	/* OPAL variant of PHB3 invalidated TCEs */
 	if (phb->ioda.tce_inval_reg)
 		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
-- 
cgit v0.10.2


From bbb845c4bac88d8feffa8945dd28b50849984e30 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:19 +1000
Subject: powerpc/powernv: Implement multilevel TCE tables

TCE tables might get too big in case of 4K IOMMU pages and DDW enabled
on huge guests (hundreds of GB of RAM) so the kernel might be unable to
allocate contiguous chunk of physical memory to store the TCE table.

To address this, POWER8 CPU (actually, IODA2) supports multi-level
TCE tables, up to 5 levels which splits the table into a tree of
smaller subtables.

This adds multi-level TCE tables support to
pnv_pci_ioda2_table_alloc_pages() and pnv_pci_ioda2_table_free_pages()
helpers.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 4636734..706cfc0 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -96,6 +96,8 @@ struct iommu_pool {
 struct iommu_table {
 	unsigned long  it_busno;     /* Bus number this table belongs to */
 	unsigned long  it_size;      /* Size of iommu table in entries */
+	unsigned long  it_indirect_levels;
+	unsigned long  it_level_size;
 	unsigned long  it_offset;    /* Offset into global table */
 	unsigned long  it_base;      /* mapped address of tce table */
 	unsigned long  it_index;     /* which iommu table this is */
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 552f6ac..21d8c61 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -50,6 +50,9 @@
 /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
 #define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
 
+#define POWERNV_IOMMU_DEFAULT_LEVELS	1
+#define POWERNV_IOMMU_MAX_LEVELS	5
+
 static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
 
 static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
@@ -1977,6 +1980,8 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
 			table_group);
 	struct pnv_phb *phb = pe->phb;
 	int64_t rc;
+	const unsigned long size = tbl->it_indirect_levels ?
+			tbl->it_level_size : tbl->it_size;
 	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
 	const __u64 win_size = tbl->it_size << tbl->it_page_shift;
 
@@ -1991,9 +1996,9 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
 	rc = opal_pci_map_pe_dma_window(phb->opal_id,
 			pe->pe_number,
 			pe->pe_number << 1,
-			1,
+			tbl->it_indirect_levels + 1,
 			__pa(tbl->it_base),
-			tbl->it_size << 3,
+			size << 3,
 			IOMMU_PAGE_SIZE(tbl));
 	if (rc) {
 		pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
@@ -2073,11 +2078,16 @@ static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb)
 	phb->ioda.tce_inval_reg = ioremap(phb->ioda.tce_inval_reg_phys, 8);
 }
 
-static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift)
+static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift,
+		unsigned levels, unsigned long limit,
+		unsigned long *current_offset)
 {
 	struct page *tce_mem = NULL;
-	__be64 *addr;
+	__be64 *addr, *tmp;
 	unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
+	unsigned long allocated = 1UL << (order + PAGE_SHIFT);
+	unsigned entries = 1UL << (shift - 3);
+	long i;
 
 	tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
 	if (!tce_mem) {
@@ -2085,31 +2095,79 @@ static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift)
 		return NULL;
 	}
 	addr = page_address(tce_mem);
-	memset(addr, 0, 1UL << (order + PAGE_SHIFT));
+	memset(addr, 0, allocated);
+
+	--levels;
+	if (!levels) {
+		*current_offset += allocated;
+		return addr;
+	}
+
+	for (i = 0; i < entries; ++i) {
+		tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift,
+				levels, limit, current_offset);
+		if (!tmp)
+			break;
+
+		addr[i] = cpu_to_be64(__pa(tmp) |
+				TCE_PCI_READ | TCE_PCI_WRITE);
+
+		if (*current_offset >= limit)
+			break;
+	}
 
 	return addr;
 }
 
+static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
+		unsigned long size, unsigned level);
+
 static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
-		__u32 page_shift, __u64 window_size, struct iommu_table *tbl)
+		__u32 page_shift, __u64 window_size, __u32 levels,
+		struct iommu_table *tbl)
 {
 	void *addr;
+	unsigned long offset = 0, level_shift;
 	const unsigned window_shift = ilog2(window_size);
 	unsigned entries_shift = window_shift - page_shift;
 	unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT);
 	const unsigned long tce_table_size = 1UL << table_shift;
 
+	if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
+		return -EINVAL;
+
 	if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
 		return -EINVAL;
 
+	/* Adjust direct table size from window_size and levels */
+	entries_shift = (entries_shift + levels - 1) / levels;
+	level_shift = entries_shift + 3;
+	level_shift = max_t(unsigned, level_shift, PAGE_SHIFT);
+
 	/* Allocate TCE table */
-	addr = pnv_pci_ioda2_table_do_alloc_pages(nid, table_shift);
+	addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
+			levels, tce_table_size, &offset);
+
+	/* addr==NULL means that the first level allocation failed */
 	if (!addr)
 		return -ENOMEM;
 
+	/*
+	 * First level was allocated but some lower level failed as
+	 * we did not allocate as much as we wanted,
+	 * release partially allocated table.
+	 */
+	if (offset < tce_table_size) {
+		pnv_pci_ioda2_table_do_free_pages(addr,
+				1ULL << (level_shift - 3), levels - 1);
+		return -ENOMEM;
+	}
+
 	/* Setup linux iommu table */
 	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset,
 			page_shift);
+	tbl->it_level_size = 1ULL << (level_shift - 3);
+	tbl->it_indirect_levels = levels - 1;
 
 	pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n",
 			window_size, tce_table_size, bus_offset);
@@ -2117,12 +2175,40 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	return 0;
 }
 
+static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
+		unsigned long size, unsigned level)
+{
+	const unsigned long addr_ul = (unsigned long) addr &
+			~(TCE_PCI_READ | TCE_PCI_WRITE);
+
+	if (level) {
+		long i;
+		u64 *tmp = (u64 *) addr_ul;
+
+		for (i = 0; i < size; ++i) {
+			unsigned long hpa = be64_to_cpu(tmp[i]);
+
+			if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE)))
+				continue;
+
+			pnv_pci_ioda2_table_do_free_pages(__va(hpa), size,
+					level - 1);
+		}
+	}
+
+	free_pages(addr_ul, get_order(size << 3));
+}
+
 static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
 {
+	const unsigned long size = tbl->it_indirect_levels ?
+			tbl->it_level_size : tbl->it_size;
+
 	if (!tbl->it_size)
 		return;
 
-	free_pages(tbl->it_base, get_order(tbl->it_size << 3));
+	pnv_pci_ioda2_table_do_free_pages((__be64 *)tbl->it_base, size,
+			tbl->it_indirect_levels);
 }
 
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
@@ -2150,7 +2236,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 
 	/* Setup linux iommu table */
 	rc = pnv_pci_ioda2_table_alloc_pages(pe->phb->hose->node,
-			0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base, tbl);
+			0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base,
+			POWERNV_IOMMU_DEFAULT_LEVELS, tbl);
 	if (rc) {
 		pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc);
 		goto fail;
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index d465b9c..765d8ed 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -575,6 +575,19 @@ struct pci_ops pnv_pci_ops = {
 static __be64 *pnv_tce(struct iommu_table *tbl, long idx)
 {
 	__be64 *tmp = ((__be64 *)tbl->it_base);
+	int  level = tbl->it_indirect_levels;
+	const long shift = ilog2(tbl->it_level_size);
+	unsigned long mask = (tbl->it_level_size - 1) << (level * shift);
+
+	while (level) {
+		int n = (idx & mask) >> (level * shift);
+		unsigned long tce = be64_to_cpu(tmp[n]);
+
+		tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
+		idx &= ~mask;
+		mask >>= shift;
+		--level;
+	}
 
 	return tmp + idx;
 }
-- 
cgit v0.10.2


From 4793d65d1ac056d92b594d05c6aab3c040d913dd Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:20 +1000
Subject: vfio: powerpc/spapr: powerpc/powernv/ioda: Define and implement DMA
 windows API

This extends iommu_table_group_ops by a set of callbacks to support
dynamic DMA windows management.

create_table() creates a TCE table with specific parameters.
it receives iommu_table_group to know nodeid in order to allocate
TCE table memory closer to the PHB. The exact format of allocated
multi-level table might be also specific to the PHB model (not
the case now though).
This callback calculated the DMA window offset on a PCI bus from @num
and stores it in a just created table.

set_window() sets the window at specified TVT index + @num on PHB.

unset_window() unsets the window from specified TVT.

This adds a free() callback to iommu_table_ops to free the memory
(potentially a tree of tables) allocated for the TCE table.

create_table() and free() are supposed to be called once per
VFIO container and set_window()/unset_window() are supposed to be
called for every group in a container.

This adds IOMMU capabilities to iommu_table_group such as default
32bit window parameters and others. This makes use of new values in
vfio_iommu_spapr_tce. IODA1/P5IOC2 do not support DDW so they do not
advertise pagemasks to the userspace.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 706cfc0..e554175 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -70,6 +70,7 @@ struct iommu_table_ops {
 	/* get() returns a physical address */
 	unsigned long (*get)(struct iommu_table *tbl, long index);
 	void (*flush)(struct iommu_table *tbl);
+	void (*free)(struct iommu_table *tbl);
 };
 
 /* These are used by VIO */
@@ -146,6 +147,17 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
 struct iommu_table_group;
 
 struct iommu_table_group_ops {
+	long (*create_table)(struct iommu_table_group *table_group,
+			int num,
+			__u32 page_shift,
+			__u64 window_size,
+			__u32 levels,
+			struct iommu_table **ptbl);
+	long (*set_window)(struct iommu_table_group *table_group,
+			int num,
+			struct iommu_table *tblnew);
+	long (*unset_window)(struct iommu_table_group *table_group,
+			int num);
 	/* Switch ownership from platform code to external user (e.g. VFIO) */
 	void (*take_ownership)(struct iommu_table_group *table_group);
 	/* Switch ownership from external user (e.g. VFIO) back to core */
@@ -159,6 +171,13 @@ struct iommu_table_group_link {
 };
 
 struct iommu_table_group {
+	/* IOMMU properties */
+	__u32 tce32_start;
+	__u32 tce32_size;
+	__u64 pgsizes; /* Bitmap of supported page sizes */
+	__u32 max_dynamic_windows_supported;
+	__u32 max_levels;
+
 	struct iommu_group *group;
 	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
 	struct iommu_table_group_ops *ops;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 21d8c61..ee3e643 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -25,6 +25,7 @@
 #include <linux/memblock.h>
 #include <linux/iommu.h>
 #include <linux/rculist.h>
+#include <linux/sizes.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -1870,6 +1871,12 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
 		pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
 }
 
+static void pnv_ioda2_table_free(struct iommu_table *tbl)
+{
+	pnv_pci_ioda2_table_free_pages(tbl);
+	iommu_free_table(tbl, "pnv");
+}
+
 static struct iommu_table_ops pnv_ioda2_iommu_ops = {
 	.set = pnv_ioda2_tce_build,
 #ifdef CONFIG_IOMMU_API
@@ -1877,6 +1884,7 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = {
 #endif
 	.clear = pnv_ioda2_tce_free,
 	.get = pnv_tce_get,
+	.free = pnv_ioda2_table_free,
 };
 
 static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
@@ -1947,6 +1955,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 				 TCE_PCI_SWINV_PAIR);
 
 	tbl->it_ops = &pnv_ioda1_iommu_ops;
+	pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
+	pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
 	iommu_init_table(tbl, phb->hose->node);
 
 	if (pe->flags & PNV_IODA_PE_DEV) {
@@ -1985,7 +1995,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
 	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
 	const __u64 win_size = tbl->it_size << tbl->it_page_shift;
 
-	pe_info(pe, "Setting up window %llx..%llx pg=%x\n",
+	pe_info(pe, "Setting up window#%d %llx..%llx pg=%x\n", num,
 			start_addr, start_addr + win_size - 1,
 			IOMMU_PAGE_SIZE(tbl));
 
@@ -1995,7 +2005,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
 	 */
 	rc = opal_pci_map_pe_dma_window(phb->opal_id,
 			pe->pe_number,
-			pe->pe_number << 1,
+			(pe->pe_number << 1) + num,
 			tbl->it_indirect_levels + 1,
 			__pa(tbl->it_base),
 			size << 3,
@@ -2040,7 +2050,67 @@ static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
 		pe->tce_bypass_enabled = enable;
 }
 
+static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
+		__u32 page_shift, __u64 window_size, __u32 levels,
+		struct iommu_table *tbl);
+
+static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
+		int num, __u32 page_shift, __u64 window_size, __u32 levels,
+		struct iommu_table **ptbl)
+{
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
+	int nid = pe->phb->hose->node;
+	__u64 bus_offset = num ? pe->tce_bypass_base : table_group->tce32_start;
+	long ret;
+	struct iommu_table *tbl;
+
+	tbl = pnv_pci_table_alloc(nid);
+	if (!tbl)
+		return -ENOMEM;
+
+	ret = pnv_pci_ioda2_table_alloc_pages(nid,
+			bus_offset, page_shift, window_size,
+			levels, tbl);
+	if (ret) {
+		iommu_free_table(tbl, "pnv");
+		return ret;
+	}
+
+	tbl->it_ops = &pnv_ioda2_iommu_ops;
+	if (pe->phb->ioda.tce_inval_reg)
+		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
+
+	*ptbl = tbl;
+
+	return 0;
+}
+
 #ifdef CONFIG_IOMMU_API
+static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
+		int num)
+{
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
+	struct pnv_phb *phb = pe->phb;
+	long ret;
+
+	pe_info(pe, "Removing DMA window #%d\n", num);
+
+	ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
+			(pe->pe_number << 1) + num,
+			0/* levels */, 0/* table address */,
+			0/* table size */, 0/* page size */);
+	if (ret)
+		pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
+	else
+		pnv_pci_ioda2_tce_invalidate_entire(pe);
+
+	pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
+
+	return ret;
+}
+
 static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
 {
 	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
@@ -2060,6 +2130,9 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
 }
 
 static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
+	.create_table = pnv_pci_ioda2_create_table,
+	.set_window = pnv_pci_ioda2_set_window,
+	.unset_window = pnv_pci_ioda2_unset_window,
 	.take_ownership = pnv_ioda2_take_ownership,
 	.release_ownership = pnv_ioda2_release_ownership,
 };
@@ -2214,7 +2287,7 @@ static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 				       struct pnv_ioda_pe *pe)
 {
-	struct iommu_table *tbl;
+	struct iommu_table *tbl = NULL;
 	int64_t rc;
 
 	/* We shouldn't already have a 32-bit DMA associated */
@@ -2224,10 +2297,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	/* TVE #1 is selected by PCI address bit 59 */
 	pe->tce_bypass_base = 1ull << 59;
 
-	tbl = pnv_pci_table_alloc(phb->hose->node);
 	iommu_register_group(&pe->table_group, phb->hose->global_number,
 			pe->pe_number);
-	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
 
 	/* The PE will reserve all possible 32-bits space */
 	pe->tce32_seg = 0;
@@ -2235,13 +2306,22 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 		phb->ioda.m32_pci_base);
 
 	/* Setup linux iommu table */
-	rc = pnv_pci_ioda2_table_alloc_pages(pe->phb->hose->node,
-			0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base,
-			POWERNV_IOMMU_DEFAULT_LEVELS, tbl);
+	pe->table_group.tce32_start = 0;
+	pe->table_group.tce32_size = phb->ioda.m32_pci_base;
+	pe->table_group.max_dynamic_windows_supported =
+			IOMMU_TABLE_GROUP_MAX_TABLES;
+	pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
+	pe->table_group.pgsizes = SZ_4K | SZ_64K | SZ_16M;
+
+	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
+			IOMMU_PAGE_SHIFT_4K,
+			pe->table_group.tce32_size,
+			POWERNV_IOMMU_DEFAULT_LEVELS, &tbl);
 	if (rc) {
 		pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc);
 		goto fail;
 	}
+	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
 
 	tbl->it_ops = &pnv_ioda2_iommu_ops;
 	iommu_init_table(tbl, phb->hose->node);
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
index eaec85b..f2bdfea 100644
--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -127,6 +127,8 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
 	u64 phb_id;
 	int64_t rc;
 	static int primary = 1;
+	struct iommu_table_group *table_group;
+	struct iommu_table *tbl;
 
 	pr_info(" Initializing p5ioc2 PHB %s\n", np->full_name);
 
@@ -201,7 +203,10 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
 	 * hotplug or SRIOV on P5IOC2 and therefore iommu_free_table()
 	 * should not be called for phb->p5ioc2.table_group.tables[0] ever.
 	 */
-	phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table;
+	tbl = phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table;
+	table_group = &phb->p5ioc2.table_group;
+	table_group->tce32_start = tbl->it_offset << tbl->it_page_shift;
+	table_group->tce32_size = tbl->it_size << tbl->it_page_shift;
 }
 
 void __init pnv_pci_init_p5ioc2_hub(struct device_node *np)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index a9e2d13..6d919eb 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -135,7 +135,6 @@ static int tce_iommu_enable(struct tce_container *container)
 {
 	int ret = 0;
 	unsigned long locked;
-	struct iommu_table *tbl;
 	struct iommu_table_group *table_group;
 
 	if (!container->grp)
@@ -171,13 +170,19 @@ static int tce_iommu_enable(struct tce_container *container)
 	 * this is that we cannot tell here the amount of RAM used by the guest
 	 * as this information is only available from KVM and VFIO is
 	 * KVM agnostic.
+	 *
+	 * So we do not allow enabling a container without a group attached
+	 * as there is no way to know how much we should increment
+	 * the locked_vm counter.
 	 */
 	table_group = iommu_group_get_iommudata(container->grp);
 	if (!table_group)
 		return -ENODEV;
 
-	tbl = table_group->tables[0];
-	locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
+	if (!table_group->tce32_size)
+		return -EPERM;
+
+	locked = table_group->tce32_size >> PAGE_SHIFT;
 	ret = try_increment_locked_vm(locked);
 	if (ret)
 		return ret;
@@ -350,7 +355,6 @@ static long tce_iommu_ioctl(void *iommu_data,
 
 	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
 		struct vfio_iommu_spapr_tce_info info;
-		struct iommu_table *tbl;
 		struct iommu_table_group *table_group;
 
 		if (WARN_ON(!container->grp))
@@ -358,8 +362,7 @@ static long tce_iommu_ioctl(void *iommu_data,
 
 		table_group = iommu_group_get_iommudata(container->grp);
 
-		tbl = table_group->tables[0];
-		if (WARN_ON_ONCE(!tbl))
+		if (!table_group)
 			return -ENXIO;
 
 		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
@@ -371,8 +374,8 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (info.argsz < minsz)
 			return -EINVAL;
 
-		info.dma32_window_start = tbl->it_offset << tbl->it_page_shift;
-		info.dma32_window_size = tbl->it_size << tbl->it_page_shift;
+		info.dma32_window_start = table_group->tce32_start;
+		info.dma32_window_size = table_group->tce32_size;
 		info.flags = 0;
 
 		if (copy_to_user((void __user *)arg, &info, minsz))
-- 
cgit v0.10.2


From c035e37b58c75ca216bfd1d5de3c1080ac0022b9 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:21 +1000
Subject: powerpc/powernv/ioda2: Use new helpers to do proper cleanup on PE
 release

The existing code programmed TVT#0 with some address and then
immediately released that memory.

This makes use of pnv_pci_ioda2_unset_window() and
pnv_pci_ioda2_set_bypass() which do correct resource release and
TVT update.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index ee3e643..510bb9c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1288,34 +1288,21 @@ m64_failed:
 	return -EBUSY;
 }
 
+static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
+		int num);
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+
 static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
 {
-	struct pci_bus        *bus;
-	struct pci_controller *hose;
-	struct pnv_phb        *phb;
 	struct iommu_table    *tbl;
-	unsigned long         addr;
 	int64_t               rc;
 
-	bus = dev->bus;
-	hose = pci_bus_to_host(bus);
-	phb = hose->private_data;
 	tbl = pe->table_group.tables[0];
-	addr = tbl->it_base;
-
-	opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
-				   pe->pe_number << 1, 1, __pa(addr),
-				   0, 0x1000);
-
-	rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
-				        pe->pe_number,
-				        (pe->pe_number << 1) + 1,
-				        pe->tce_bypass_base,
-				        0);
+	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
 	if (rc)
 		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
 
-	pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
+	pnv_pci_ioda2_set_bypass(pe, false);
 	if (pe->table_group.group) {
 		iommu_group_put(pe->table_group.group);
 		BUG_ON(pe->table_group.group);
-- 
cgit v0.10.2


From 0054719386d96984153ad31d714a8be4ec7eba80 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:22 +1000
Subject: powerpc/iommu/ioda2: Add get_table_size() to calculate the size of
 future table

This adds a way for the IOMMU user to know how much a new table will
use so it can be accounted in the locked_vm limit before allocation
happens.

This stores the allocated table size in pnv_pci_ioda2_get_table_size()
so the locked_vm counter can be updated correctly when a table is
being disposed.

This defines an iommu_table_group_ops callback to let VFIO know
how much memory will be locked if a table is created.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index e554175..9d37492 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -99,6 +99,7 @@ struct iommu_table {
 	unsigned long  it_size;      /* Size of iommu table in entries */
 	unsigned long  it_indirect_levels;
 	unsigned long  it_level_size;
+	unsigned long  it_allocated_size;
 	unsigned long  it_offset;    /* Offset into global table */
 	unsigned long  it_base;      /* mapped address of tce table */
 	unsigned long  it_index;     /* which iommu table this is */
@@ -147,6 +148,10 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
 struct iommu_table_group;
 
 struct iommu_table_group_ops {
+	unsigned long (*get_table_size)(
+			__u32 page_shift,
+			__u64 window_size,
+			__u32 levels);
 	long (*create_table)(struct iommu_table_group *table_group,
 			int num,
 			__u32 page_shift,
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 510bb9c..a7e098d 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2074,6 +2074,38 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
 }
 
 #ifdef CONFIG_IOMMU_API
+static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
+		__u64 window_size, __u32 levels)
+{
+	unsigned long bytes = 0;
+	const unsigned window_shift = ilog2(window_size);
+	unsigned entries_shift = window_shift - page_shift;
+	unsigned table_shift = entries_shift + 3;
+	unsigned long tce_table_size = max(0x1000UL, 1UL << table_shift);
+	unsigned long direct_table_size;
+
+	if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS) ||
+			(window_size > memory_hotplug_max()) ||
+			!is_power_of_2(window_size))
+		return 0;
+
+	/* Calculate a direct table size from window_size and levels */
+	entries_shift = (entries_shift + levels - 1) / levels;
+	table_shift = entries_shift + 3;
+	table_shift = max_t(unsigned, table_shift, PAGE_SHIFT);
+	direct_table_size =  1UL << table_shift;
+
+	for ( ; levels; --levels) {
+		bytes += _ALIGN_UP(tce_table_size, direct_table_size);
+
+		tce_table_size /= direct_table_size;
+		tce_table_size <<= 3;
+		tce_table_size = _ALIGN_UP(tce_table_size, direct_table_size);
+	}
+
+	return bytes;
+}
+
 static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
 		int num)
 {
@@ -2117,6 +2149,7 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
 }
 
 static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
+	.get_table_size = pnv_pci_ioda2_get_table_size,
 	.create_table = pnv_pci_ioda2_create_table,
 	.set_window = pnv_pci_ioda2_set_window,
 	.unset_window = pnv_pci_ioda2_unset_window,
@@ -2228,6 +2261,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 			page_shift);
 	tbl->it_level_size = 1ULL << (level_shift - 3);
 	tbl->it_indirect_levels = levels - 1;
+	tbl->it_allocated_size = offset;
 
 	pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n",
 			window_size, tce_table_size, bus_offset);
-- 
cgit v0.10.2


From 46d3e1e16294c587a74093b1f5474c1b33b72381 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:23 +1000
Subject: vfio: powerpc/spapr: powerpc/powernv/ioda2: Use DMA windows API in
 ownership control

Before the IOMMU user (VFIO) would take control over the IOMMU table
belonging to a specific IOMMU group. This approach did not allow sharing
tables between IOMMU groups attached to the same container.

This introduces a new IOMMU ownership flavour when the user can not
just control the existing IOMMU table but remove/create tables on demand.
If an IOMMU implements take/release_ownership() callbacks, this lets
the user have full control over the IOMMU group. When the ownership
is taken, the platform code removes all the windows so the caller must
create them.
Before returning the ownership back to the platform code, VFIO
unprograms and removes all the tables it created.

This changes IODA2's onwership handler to remove the existing table
rather than manipulating with the existing one. From now on,
iommu_take_ownership() and iommu_release_ownership() are only called
from the vfio_iommu_spapr_tce driver.

Old-style ownership is still supported allowing VFIO to run on older
P5IOC2 and IODA IO controllers.

No change in userspace-visible behaviour is expected. Since it recreates
TCE tables on each ownership change, related kernel traces will appear
more often.

This adds a pnv_pci_ioda2_setup_default_config() which is called
when PE is being configured at boot time and when the ownership is
passed from VFIO to the platform code.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index a7e098d..b9f0f43 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2073,6 +2073,49 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
 	return 0;
 }
 
+static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
+{
+	struct iommu_table *tbl = NULL;
+	long rc;
+
+	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
+			IOMMU_PAGE_SHIFT_4K,
+			pe->table_group.tce32_size,
+			POWERNV_IOMMU_DEFAULT_LEVELS, &tbl);
+	if (rc) {
+		pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
+				rc);
+		return rc;
+	}
+
+	iommu_init_table(tbl, pe->phb->hose->node);
+
+	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
+	if (rc) {
+		pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
+				rc);
+		pnv_ioda2_table_free(tbl);
+		return rc;
+	}
+
+	if (!pnv_iommu_bypass_disabled)
+		pnv_pci_ioda2_set_bypass(pe, true);
+
+	/* OPAL variant of PHB3 invalidated TCEs */
+	if (pe->phb->ioda.tce_inval_reg)
+		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
+
+	/*
+	 * Setting table base here only for carrying iommu_group
+	 * further down to let iommu_add_device() do the job.
+	 * pnv_pci_ioda_dma_dev_setup will override it later anyway.
+	 */
+	if (pe->flags & PNV_IODA_PE_DEV)
+		set_iommu_table_base(&pe->pdev->dev, tbl);
+
+	return 0;
+}
+
 #ifdef CONFIG_IOMMU_API
 static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
 		__u64 window_size, __u32 levels)
@@ -2134,9 +2177,12 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
 {
 	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
 						table_group);
+	/* Store @tbl as pnv_pci_ioda2_unset_window() resets it */
+	struct iommu_table *tbl = pe->table_group.tables[0];
 
-	iommu_take_ownership(table_group->tables[0]);
 	pnv_pci_ioda2_set_bypass(pe, false);
+	pnv_pci_ioda2_unset_window(&pe->table_group, 0);
+	pnv_ioda2_table_free(tbl);
 }
 
 static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
@@ -2144,8 +2190,7 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
 	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
 						table_group);
 
-	iommu_release_ownership(table_group->tables[0]);
-	pnv_pci_ioda2_set_bypass(pe, true);
+	pnv_pci_ioda2_setup_default_config(pe);
 }
 
 static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
@@ -2308,7 +2353,6 @@ static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 				       struct pnv_ioda_pe *pe)
 {
-	struct iommu_table *tbl = NULL;
 	int64_t rc;
 
 	/* We shouldn't already have a 32-bit DMA associated */
@@ -2333,58 +2377,21 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 			IOMMU_TABLE_GROUP_MAX_TABLES;
 	pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
 	pe->table_group.pgsizes = SZ_4K | SZ_64K | SZ_16M;
-
-	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
-			IOMMU_PAGE_SHIFT_4K,
-			pe->table_group.tce32_size,
-			POWERNV_IOMMU_DEFAULT_LEVELS, &tbl);
-	if (rc) {
-		pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc);
-		goto fail;
-	}
-	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
-
-	tbl->it_ops = &pnv_ioda2_iommu_ops;
-	iommu_init_table(tbl, phb->hose->node);
 #ifdef CONFIG_IOMMU_API
 	pe->table_group.ops = &pnv_pci_ioda2_ops;
 #endif
 
-	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
+	rc = pnv_pci_ioda2_setup_default_config(pe);
 	if (rc) {
-		pe_err(pe, "Failed to configure 32-bit TCE table,"
-		       " err %ld\n", rc);
-		goto fail;
+		if (pe->tce32_seg >= 0)
+			pe->tce32_seg = -1;
+		return;
 	}
 
-	/* OPAL variant of PHB3 invalidated TCEs */
-	if (phb->ioda.tce_inval_reg)
-		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
-
-	if (pe->flags & PNV_IODA_PE_DEV) {
-		/*
-		 * Setting table base here only for carrying iommu_group
-		 * further down to let iommu_add_device() do the job.
-		 * pnv_pci_ioda_dma_dev_setup will override it later anyway.
-		 */
-		set_iommu_table_base(&pe->pdev->dev, tbl);
+	if (pe->flags & PNV_IODA_PE_DEV)
 		iommu_add_device(&pe->pdev->dev);
-	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
+	else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
 		pnv_ioda_setup_bus_dma(pe, pe->pbus);
-
-	/* Also create a bypass window */
-	if (!pnv_iommu_bypass_disabled)
-		pnv_pci_ioda2_set_bypass(pe, true);
-
-	return;
-fail:
-	if (pe->tce32_seg >= 0)
-		pe->tce32_seg = -1;
-	if (tbl) {
-		pnv_pci_ioda2_table_free_pages(tbl);
-		pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
-		iommu_free_table(tbl, "pnv");
-	}
 }
 
 static void pnv_ioda_setup_dma(struct pnv_phb *phb)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 6d919eb..203caac 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -333,6 +333,45 @@ static long tce_iommu_build(struct tce_container *container,
 	return ret;
 }
 
+static long tce_iommu_create_table(struct tce_container *container,
+			struct iommu_table_group *table_group,
+			int num,
+			__u32 page_shift,
+			__u64 window_size,
+			__u32 levels,
+			struct iommu_table **ptbl)
+{
+	long ret, table_size;
+
+	table_size = table_group->ops->get_table_size(page_shift, window_size,
+			levels);
+	if (!table_size)
+		return -EINVAL;
+
+	ret = try_increment_locked_vm(table_size >> PAGE_SHIFT);
+	if (ret)
+		return ret;
+
+	ret = table_group->ops->create_table(table_group, num,
+			page_shift, window_size, levels, ptbl);
+
+	WARN_ON(!ret && !(*ptbl)->it_ops->free);
+	WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));
+
+	if (ret)
+		decrement_locked_vm(table_size >> PAGE_SHIFT);
+
+	return ret;
+}
+
+static void tce_iommu_free_table(struct iommu_table *tbl)
+{
+	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
+
+	tbl->it_ops->free(tbl);
+	decrement_locked_vm(pages);
+}
+
 static long tce_iommu_ioctl(void *iommu_data,
 				 unsigned int cmd, unsigned long arg)
 {
@@ -546,15 +585,62 @@ static int tce_iommu_take_ownership(struct tce_container *container,
 static void tce_iommu_release_ownership_ddw(struct tce_container *container,
 		struct iommu_table_group *table_group)
 {
+	long i;
+
+	if (!table_group->ops->unset_window) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		/* Store table pointer as unset_window resets it */
+		struct iommu_table *tbl = table_group->tables[i];
+
+		if (!tbl)
+			continue;
+
+		table_group->ops->unset_window(table_group, i);
+		tce_iommu_clear(container, tbl,
+				tbl->it_offset, tbl->it_size);
+		tce_iommu_free_table(tbl);
+	}
+
 	table_group->ops->release_ownership(table_group);
 }
 
 static long tce_iommu_take_ownership_ddw(struct tce_container *container,
 		struct iommu_table_group *table_group)
 {
+	long ret;
+	struct iommu_table *tbl = NULL;
+
+	if (!table_group->ops->create_table || !table_group->ops->set_window ||
+			!table_group->ops->release_ownership) {
+		WARN_ON_ONCE(1);
+		return -EFAULT;
+	}
+
 	table_group->ops->take_ownership(table_group);
 
-	return 0;
+	ret = tce_iommu_create_table(container,
+			table_group,
+			0, /* window number */
+			IOMMU_PAGE_SHIFT_4K,
+			table_group->tce32_size,
+			1, /* default levels */
+			&tbl);
+	if (!ret) {
+		ret = table_group->ops->set_window(table_group, 0, tbl);
+		if (ret)
+			tce_iommu_free_table(tbl);
+		else
+			table_group->tables[0] = tbl;
+	}
+
+	if (ret)
+		table_group->ops->release_ownership(table_group);
+
+	return ret;
 }
 
 static int tce_iommu_attach_group(void *iommu_data,
-- 
cgit v0.10.2


From 15b244a88e1b2895605be4300b40b575345bcf50 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:24 +1000
Subject: powerpc/mmu: Add userspace-to-physical addresses translation cache

We are adding support for DMA memory pre-registration to be used in
conjunction with VFIO. The idea is that the userspace which is going to
run a guest may want to pre-register a user space memory region so
it all gets pinned once and never goes away. Having this done,
a hypervisor will not have to pin/unpin pages on every DMA map/unmap
request. This is going to help with multiple pinning of the same memory.

Another use of it is in-kernel real mode (mmu off) acceleration of
DMA requests where real time translation of guest physical to host
physical addresses is non-trivial and may fail as linux ptes may be
temporarily invalid. Also, having cached host physical addresses
(compared to just pinning at the start and then walking the page table
again on every H_PUT_TCE), we can be sure that the addresses which we put
into TCE table are the ones we already pinned.

This adds a list of memory regions to mm_context_t. Each region consists
of a header and a list of physical addresses. This adds API to:
1. register/unregister memory regions;
2. do final cleanup (which puts all pre-registered pages);
3. do userspace to physical address translation;
4. manage usage counters; multiple registration of the same memory
is allowed (once per container).

This implements 2 counters per registered memory region:
- @mapped: incremented on every DMA mapping; decremented on unmapping;
initialized to 1 when a region is just registered; once it becomes zero,
no more mappings allowe;
- @used: incremented on every "register" ioctl; decremented on
"unregister"; unregistration is allowed for DMA mapped regions unless
it is the very last reference. For the very last reference this checks
that the region is still mapped and returns -EBUSY so the userspace
gets to know that memory is still pinned and unregistration needs to
be retried; @used remains 1.

Host physical addresses are stored in vmalloc'ed array. In order to
access these in the real mode (mmu off), there is a real_vmalloc_addr()
helper. In-kernel acceleration patchset will move it from KVM to MMU code.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 1da6a81..a82f534 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -536,6 +536,9 @@ typedef struct {
 	/* for 4K PTE fragment support */
 	void *pte_frag;
 #endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	struct list_head iommu_group_mem_list;
+#endif
 } mm_context_t;
 
 
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 73382eb..3e51842 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -16,6 +16,24 @@
  */
 extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
 extern void destroy_context(struct mm_struct *mm);
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+struct mm_iommu_table_group_mem_t;
+
+extern bool mm_iommu_preregistered(void);
+extern long mm_iommu_get(unsigned long ua, unsigned long entries,
+		struct mm_iommu_table_group_mem_t **pmem);
+extern long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem);
+extern void mm_iommu_init(mm_context_t *ctx);
+extern void mm_iommu_cleanup(mm_context_t *ctx);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
+		unsigned long size);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_find(unsigned long ua,
+		unsigned long entries);
+extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
+		unsigned long ua, unsigned long *hpa);
+extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
+extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
+#endif
 
 extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next);
 extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 9fe3dcd..bdcbb71 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -687,6 +687,9 @@ void __init setup_arch(char **cmdline_p)
 #ifdef CONFIG_PPC_64K_PAGES
 	init_mm.context.pte_frag = NULL;
 #endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	mm_iommu_init(&init_mm.context);
+#endif
 	irqstack_early_init();
 	exc_lvl_early_init();
 	emergency_stack_init();
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 9c8770b..3eb73a3 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -36,3 +36,4 @@ obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_HIGHMEM)		+= highmem.o
 obj-$(CONFIG_PPC_COPRO_BASE)	+= copro_fault.o
+obj-$(CONFIG_SPAPR_TCE_IOMMU)	+= mmu_context_iommu.o
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
index 178876ae..4e4efbc 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -89,6 +89,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 #ifdef CONFIG_PPC_64K_PAGES
 	mm->context.pte_frag = NULL;
 #endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	mm_iommu_init(&mm->context);
+#endif
 	return 0;
 }
 
@@ -132,6 +135,9 @@ static inline void destroy_pagetable_page(struct mm_struct *mm)
 
 void destroy_context(struct mm_struct *mm)
 {
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	mm_iommu_cleanup(&mm->context);
+#endif
 
 #ifdef CONFIG_PPC_ICSWX
 	drop_cop(mm->context.acop, mm);
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
new file mode 100644
index 0000000..da6a216
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -0,0 +1,316 @@
+/*
+ *  IOMMU helpers in MMU context.
+ *
+ *  Copyright (C) 2015 IBM Corp. <aik@ozlabs.ru>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
+#include <asm/mmu_context.h>
+
+static DEFINE_MUTEX(mem_list_mutex);
+
+struct mm_iommu_table_group_mem_t {
+	struct list_head next;
+	struct rcu_head rcu;
+	unsigned long used;
+	atomic64_t mapped;
+	u64 ua;			/* userspace address */
+	u64 entries;		/* number of entries in hpas[] */
+	u64 *hpas;		/* vmalloc'ed */
+};
+
+static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
+		unsigned long npages, bool incr)
+{
+	long ret = 0, locked, lock_limit;
+
+	if (!npages)
+		return 0;
+
+	down_write(&mm->mmap_sem);
+
+	if (incr) {
+		locked = mm->locked_vm + npages;
+		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+			ret = -ENOMEM;
+		else
+			mm->locked_vm += npages;
+	} else {
+		if (WARN_ON_ONCE(npages > mm->locked_vm))
+			npages = mm->locked_vm;
+		mm->locked_vm -= npages;
+	}
+
+	pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
+			current->pid,
+			incr ? '+' : '-',
+			npages << PAGE_SHIFT,
+			mm->locked_vm << PAGE_SHIFT,
+			rlimit(RLIMIT_MEMLOCK));
+	up_write(&mm->mmap_sem);
+
+	return ret;
+}
+
+bool mm_iommu_preregistered(void)
+{
+	if (!current || !current->mm)
+		return false;
+
+	return !list_empty(&current->mm->context.iommu_group_mem_list);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
+
+long mm_iommu_get(unsigned long ua, unsigned long entries,
+		struct mm_iommu_table_group_mem_t **pmem)
+{
+	struct mm_iommu_table_group_mem_t *mem;
+	long i, j, ret = 0, locked_entries = 0;
+	struct page *page = NULL;
+
+	if (!current || !current->mm)
+		return -ESRCH; /* process exited */
+
+	mutex_lock(&mem_list_mutex);
+
+	list_for_each_entry_rcu(mem, &current->mm->context.iommu_group_mem_list,
+			next) {
+		if ((mem->ua == ua) && (mem->entries == entries)) {
+			++mem->used;
+			*pmem = mem;
+			goto unlock_exit;
+		}
+
+		/* Overlap? */
+		if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
+				(ua < (mem->ua +
+				       (mem->entries << PAGE_SHIFT)))) {
+			ret = -EINVAL;
+			goto unlock_exit;
+		}
+
+	}
+
+	ret = mm_iommu_adjust_locked_vm(current->mm, entries, true);
+	if (ret)
+		goto unlock_exit;
+
+	locked_entries = entries;
+
+	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+	if (!mem) {
+		ret = -ENOMEM;
+		goto unlock_exit;
+	}
+
+	mem->hpas = vzalloc(entries * sizeof(mem->hpas[0]));
+	if (!mem->hpas) {
+		kfree(mem);
+		ret = -ENOMEM;
+		goto unlock_exit;
+	}
+
+	for (i = 0; i < entries; ++i) {
+		if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
+					1/* pages */, 1/* iswrite */, &page)) {
+			for (j = 0; j < i; ++j)
+				put_page(pfn_to_page(
+						mem->hpas[j] >> PAGE_SHIFT));
+			vfree(mem->hpas);
+			kfree(mem);
+			ret = -EFAULT;
+			goto unlock_exit;
+		}
+
+		mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
+	}
+
+	atomic64_set(&mem->mapped, 1);
+	mem->used = 1;
+	mem->ua = ua;
+	mem->entries = entries;
+	*pmem = mem;
+
+	list_add_rcu(&mem->next, &current->mm->context.iommu_group_mem_list);
+
+unlock_exit:
+	if (locked_entries && ret)
+		mm_iommu_adjust_locked_vm(current->mm, locked_entries, false);
+
+	mutex_unlock(&mem_list_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_get);
+
+static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
+{
+	long i;
+	struct page *page = NULL;
+
+	for (i = 0; i < mem->entries; ++i) {
+		if (!mem->hpas[i])
+			continue;
+
+		page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT);
+		if (!page)
+			continue;
+
+		put_page(page);
+		mem->hpas[i] = 0;
+	}
+}
+
+static void mm_iommu_do_free(struct mm_iommu_table_group_mem_t *mem)
+{
+
+	mm_iommu_unpin(mem);
+	vfree(mem->hpas);
+	kfree(mem);
+}
+
+static void mm_iommu_free(struct rcu_head *head)
+{
+	struct mm_iommu_table_group_mem_t *mem = container_of(head,
+			struct mm_iommu_table_group_mem_t, rcu);
+
+	mm_iommu_do_free(mem);
+}
+
+static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem)
+{
+	list_del_rcu(&mem->next);
+	mm_iommu_adjust_locked_vm(current->mm, mem->entries, false);
+	call_rcu(&mem->rcu, mm_iommu_free);
+}
+
+long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem)
+{
+	long ret = 0;
+
+	if (!current || !current->mm)
+		return -ESRCH; /* process exited */
+
+	mutex_lock(&mem_list_mutex);
+
+	if (mem->used == 0) {
+		ret = -ENOENT;
+		goto unlock_exit;
+	}
+
+	--mem->used;
+	/* There are still users, exit */
+	if (mem->used)
+		goto unlock_exit;
+
+	/* Are there still mappings? */
+	if (atomic_cmpxchg(&mem->mapped, 1, 0) != 1) {
+		++mem->used;
+		ret = -EBUSY;
+		goto unlock_exit;
+	}
+
+	/* @mapped became 0 so now mappings are disabled, release the region */
+	mm_iommu_release(mem);
+
+unlock_exit:
+	mutex_unlock(&mem_list_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_put);
+
+struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
+		unsigned long size)
+{
+	struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+	list_for_each_entry_rcu(mem,
+			&current->mm->context.iommu_group_mem_list,
+			next) {
+		if ((mem->ua <= ua) &&
+				(ua + size <= mem->ua +
+				 (mem->entries << PAGE_SHIFT))) {
+			ret = mem;
+			break;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_lookup);
+
+struct mm_iommu_table_group_mem_t *mm_iommu_find(unsigned long ua,
+		unsigned long entries)
+{
+	struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+	list_for_each_entry_rcu(mem,
+			&current->mm->context.iommu_group_mem_list,
+			next) {
+		if ((mem->ua == ua) && (mem->entries == entries)) {
+			ret = mem;
+			break;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_find);
+
+long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
+		unsigned long ua, unsigned long *hpa)
+{
+	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
+	u64 *va = &mem->hpas[entry];
+
+	if (entry >= mem->entries)
+		return -EFAULT;
+
+	*hpa = *va | (ua & ~PAGE_MASK);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
+
+long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
+{
+	if (atomic64_inc_not_zero(&mem->mapped))
+		return 0;
+
+	/* Last mm_iommu_put() has been called, no more mappings allowed() */
+	return -ENXIO;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc);
+
+void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem)
+{
+	atomic64_add_unless(&mem->mapped, -1, 1);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec);
+
+void mm_iommu_init(mm_context_t *ctx)
+{
+	INIT_LIST_HEAD_RCU(&ctx->iommu_group_mem_list);
+}
+
+void mm_iommu_cleanup(mm_context_t *ctx)
+{
+	struct mm_iommu_table_group_mem_t *mem, *tmp;
+
+	list_for_each_entry_safe(mem, tmp, &ctx->iommu_group_mem_list, next) {
+		list_del_rcu(&mem->next);
+		mm_iommu_do_free(mem);
+	}
+}
-- 
cgit v0.10.2


From 2157e7b82f3b81f57bd80cd67cef09ef26e5f74c Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:25 +1000
Subject: vfio: powerpc/spapr: Register memory and define IOMMU v2

The existing implementation accounts the whole DMA window in
the locked_vm counter. This is going to be worse with multiple
containers and huge DMA windows. Also, real-time accounting would requite
additional tracking of accounted pages due to the page size difference -
IOMMU uses 4K pages and system uses 4K or 64K pages.

Another issue is that actual pages pinning/unpinning happens on every
DMA map/unmap request. This does not affect the performance much now as
we spend way too much time now on switching context between
guest/userspace/host but this will start to matter when we add in-kernel
DMA map/unmap acceleration.

This introduces a new IOMMU type for SPAPR - VFIO_SPAPR_TCE_v2_IOMMU.
New IOMMU deprecates VFIO_IOMMU_ENABLE/VFIO_IOMMU_DISABLE and introduces
2 new ioctls to register/unregister DMA memory -
VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY -
which receive user space address and size of a memory region which
needs to be pinned/unpinned and counted in locked_vm.
New IOMMU splits physical pages pinning and TCE table update
into 2 different operations. It requires:
1) guest pages to be registered first
2) consequent map/unmap requests to work only with pre-registered memory.
For the default single window case this means that the entire guest
(instead of 2GB) needs to be pinned before using VFIO.
When a huge DMA window is added, no additional pinning will be
required, otherwise it would be guest RAM + 2GB.

The new memory registration ioctls are not supported by
VFIO_SPAPR_TCE_IOMMU. Dynamic DMA window and in-kernel acceleration
will require memory to be preregistered in order to work.

The accounting is done per the user process.

This advertises v2 SPAPR TCE IOMMU and restricts what the userspace
can do with v1 or v2 IOMMUs.

In order to support memory pre-registration, we need a way to track
the use of every registered memory region and only allow unregistration
if a region is not in use anymore. So we need a way to tell from what
region the just cleared TCE was from.

This adds a userspace view of the TCE table into iommu_table struct.
It contains userspace address, one per TCE entry. The table is only
allocated when the ownership over an IOMMU group is taken which means
it is only used from outside of the powernv code (such as VFIO).

As v2 IOMMU supports IODA2 and pre-IODA2 IOMMUs (which do not support
DDW API), this creates a default DMA window for IODA2 for consistency.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 4c746a7..dcc37e1 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -289,10 +289,12 @@ PPC64 sPAPR implementation note
 
 This implementation has some specifics:
 
-1) Only one IOMMU group per container is supported as an IOMMU group
-represents the minimal entity which isolation can be guaranteed for and
-groups are allocated statically, one per a Partitionable Endpoint (PE)
+1) On older systems (POWER7 with P5IOC2/IODA1) only one IOMMU group per
+container is supported as an IOMMU table is allocated at the boot time,
+one table per a IOMMU group which is a Partitionable Endpoint (PE)
 (PE is often a PCI domain but not always).
+Newer systems (POWER8 with IODA2) have improved hardware design which allows
+to remove this limitation and have multiple IOMMU groups per a VFIO container.
 
 2) The hardware supports so called DMA windows - the PCI address range
 within which DMA transfer is allowed, any attempt to access address space
@@ -439,6 +441,29 @@ The code flow from the example above should be slightly changed:
 
 	....
 
+5) There is v2 of SPAPR TCE IOMMU. It deprecates VFIO_IOMMU_ENABLE/
+VFIO_IOMMU_DISABLE and implements 2 new ioctls:
+VFIO_IOMMU_SPAPR_REGISTER_MEMORY and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY
+(which are unsupported in v1 IOMMU).
+
+PPC64 paravirtualized guests generate a lot of map/unmap requests,
+and the handling of those includes pinning/unpinning pages and updating
+mm::locked_vm counter to make sure we do not exceed the rlimit.
+The v2 IOMMU splits accounting and pinning into separate operations:
+
+- VFIO_IOMMU_SPAPR_REGISTER_MEMORY/VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY ioctls
+receive a user space address and size of the block to be pinned.
+Bisecting is not supported and VFIO_IOMMU_UNREGISTER_MEMORY is expected to
+be called with the exact address and size used for registering
+the memory block. The userspace is not expected to call these often.
+The ranges are stored in a linked list in a VFIO container.
+
+- VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA ioctls only update the actual
+IOMMU table and do not do pinning; instead these check that the userspace
+address is from pre-registered range.
+
+This separation helps in optimizing DMA for guests.
+
 -------------------------------------------------------------------------------
 
 [1] VFIO was originally an acronym for "Virtual Function I/O" in its
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 9d37492..f9957eb 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -112,9 +112,15 @@ struct iommu_table {
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
 	unsigned long  it_page_shift;/* table iommu page size */
 	struct list_head it_group_list;/* List of iommu_table_group_link */
+	unsigned long *it_userspace; /* userspace view of the table */
 	struct iommu_table_ops *it_ops;
 };
 
+#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
+		((tbl)->it_userspace ? \
+			&((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \
+			NULL)
+
 /* Pure 2^n version of get_order */
 static inline __attribute_const__
 int get_iommu_order(unsigned long size, struct iommu_table *tbl)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 203caac..91a3223 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -19,8 +19,10 @@
 #include <linux/uaccess.h>
 #include <linux/err.h>
 #include <linux/vfio.h>
+#include <linux/vmalloc.h>
 #include <asm/iommu.h>
 #include <asm/tce.h>
+#include <asm/mmu_context.h>
 
 #define DRIVER_VERSION  "0.1"
 #define DRIVER_AUTHOR   "aik@ozlabs.ru"
@@ -81,6 +83,11 @@ static void decrement_locked_vm(long npages)
  * into DMA'ble space using the IOMMU
  */
 
+struct tce_iommu_group {
+	struct list_head next;
+	struct iommu_group *grp;
+};
+
 /*
  * The container descriptor supports only a single group per container.
  * Required by the API as the container is not supplied with the IOMMU group
@@ -88,11 +95,84 @@ static void decrement_locked_vm(long npages)
  */
 struct tce_container {
 	struct mutex lock;
-	struct iommu_group *grp;
 	bool enabled;
+	bool v2;
 	unsigned long locked_pages;
+	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+	struct list_head group_list;
 };
 
+static long tce_iommu_unregister_pages(struct tce_container *container,
+		__u64 vaddr, __u64 size)
+{
+	struct mm_iommu_table_group_mem_t *mem;
+
+	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
+		return -EINVAL;
+
+	mem = mm_iommu_find(vaddr, size >> PAGE_SHIFT);
+	if (!mem)
+		return -ENOENT;
+
+	return mm_iommu_put(mem);
+}
+
+static long tce_iommu_register_pages(struct tce_container *container,
+		__u64 vaddr, __u64 size)
+{
+	long ret = 0;
+	struct mm_iommu_table_group_mem_t *mem = NULL;
+	unsigned long entries = size >> PAGE_SHIFT;
+
+	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
+			((vaddr + size) < vaddr))
+		return -EINVAL;
+
+	ret = mm_iommu_get(vaddr, entries, &mem);
+	if (ret)
+		return ret;
+
+	container->enabled = true;
+
+	return 0;
+}
+
+static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl)
+{
+	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
+			tbl->it_size, PAGE_SIZE);
+	unsigned long *uas;
+	long ret;
+
+	BUG_ON(tbl->it_userspace);
+
+	ret = try_increment_locked_vm(cb >> PAGE_SHIFT);
+	if (ret)
+		return ret;
+
+	uas = vzalloc(cb);
+	if (!uas) {
+		decrement_locked_vm(cb >> PAGE_SHIFT);
+		return -ENOMEM;
+	}
+	tbl->it_userspace = uas;
+
+	return 0;
+}
+
+static void tce_iommu_userspace_view_free(struct iommu_table *tbl)
+{
+	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
+			tbl->it_size, PAGE_SIZE);
+
+	if (!tbl->it_userspace)
+		return;
+
+	vfree(tbl->it_userspace);
+	tbl->it_userspace = NULL;
+	decrement_locked_vm(cb >> PAGE_SHIFT);
+}
+
 static bool tce_page_is_contained(struct page *page, unsigned page_shift)
 {
 	/*
@@ -103,18 +183,18 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
 	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
 }
 
+static inline bool tce_groups_attached(struct tce_container *container)
+{
+	return !list_empty(&container->group_list);
+}
+
 static long tce_iommu_find_table(struct tce_container *container,
 		phys_addr_t ioba, struct iommu_table **ptbl)
 {
 	long i;
-	struct iommu_table_group *table_group;
-
-	table_group = iommu_group_get_iommudata(container->grp);
-	if (!table_group)
-		return -1;
 
 	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-		struct iommu_table *tbl = table_group->tables[i];
+		struct iommu_table *tbl = container->tables[i];
 
 		if (tbl) {
 			unsigned long entry = ioba >> tbl->it_page_shift;
@@ -136,9 +216,7 @@ static int tce_iommu_enable(struct tce_container *container)
 	int ret = 0;
 	unsigned long locked;
 	struct iommu_table_group *table_group;
-
-	if (!container->grp)
-		return -ENXIO;
+	struct tce_iommu_group *tcegrp;
 
 	if (!current->mm)
 		return -ESRCH; /* process exited */
@@ -175,7 +253,12 @@ static int tce_iommu_enable(struct tce_container *container)
 	 * as there is no way to know how much we should increment
 	 * the locked_vm counter.
 	 */
-	table_group = iommu_group_get_iommudata(container->grp);
+	if (!tce_groups_attached(container))
+		return -ENODEV;
+
+	tcegrp = list_first_entry(&container->group_list,
+			struct tce_iommu_group, next);
+	table_group = iommu_group_get_iommudata(tcegrp->grp);
 	if (!table_group)
 		return -ENODEV;
 
@@ -211,7 +294,7 @@ static void *tce_iommu_open(unsigned long arg)
 {
 	struct tce_container *container;
 
-	if (arg != VFIO_SPAPR_TCE_IOMMU) {
+	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
 		pr_err("tce_vfio: Wrong IOMMU type\n");
 		return ERR_PTR(-EINVAL);
 	}
@@ -221,18 +304,45 @@ static void *tce_iommu_open(unsigned long arg)
 		return ERR_PTR(-ENOMEM);
 
 	mutex_init(&container->lock);
+	INIT_LIST_HEAD_RCU(&container->group_list);
+
+	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
 
 	return container;
 }
 
+static int tce_iommu_clear(struct tce_container *container,
+		struct iommu_table *tbl,
+		unsigned long entry, unsigned long pages);
+static void tce_iommu_free_table(struct iommu_table *tbl);
+
 static void tce_iommu_release(void *iommu_data)
 {
 	struct tce_container *container = iommu_data;
+	struct iommu_table_group *table_group;
+	struct tce_iommu_group *tcegrp;
+	long i;
 
-	WARN_ON(container->grp);
+	while (tce_groups_attached(container)) {
+		tcegrp = list_first_entry(&container->group_list,
+				struct tce_iommu_group, next);
+		table_group = iommu_group_get_iommudata(tcegrp->grp);
+		tce_iommu_detach_group(iommu_data, tcegrp->grp);
+	}
 
-	if (container->grp)
-		tce_iommu_detach_group(iommu_data, container->grp);
+	/*
+	 * If VFIO created a table, it was not disposed
+	 * by tce_iommu_detach_group() so do it now.
+	 */
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbl = container->tables[i];
+
+		if (!tbl)
+			continue;
+
+		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+		tce_iommu_free_table(tbl);
+	}
 
 	tce_iommu_disable(container);
 	mutex_destroy(&container->lock);
@@ -249,6 +359,47 @@ static void tce_iommu_unuse_page(struct tce_container *container,
 	put_page(page);
 }
 
+static int tce_iommu_prereg_ua_to_hpa(unsigned long tce, unsigned long size,
+		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
+{
+	long ret = 0;
+	struct mm_iommu_table_group_mem_t *mem;
+
+	mem = mm_iommu_lookup(tce, size);
+	if (!mem)
+		return -EINVAL;
+
+	ret = mm_iommu_ua_to_hpa(mem, tce, phpa);
+	if (ret)
+		return -EINVAL;
+
+	*pmem = mem;
+
+	return 0;
+}
+
+static void tce_iommu_unuse_page_v2(struct iommu_table *tbl,
+		unsigned long entry)
+{
+	struct mm_iommu_table_group_mem_t *mem = NULL;
+	int ret;
+	unsigned long hpa = 0;
+	unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+
+	if (!pua || !current || !current->mm)
+		return;
+
+	ret = tce_iommu_prereg_ua_to_hpa(*pua, IOMMU_PAGE_SIZE(tbl),
+			&hpa, &mem);
+	if (ret)
+		pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n",
+				__func__, *pua, entry, ret);
+	if (mem)
+		mm_iommu_mapped_dec(mem);
+
+	*pua = 0;
+}
+
 static int tce_iommu_clear(struct tce_container *container,
 		struct iommu_table *tbl,
 		unsigned long entry, unsigned long pages)
@@ -267,6 +418,11 @@ static int tce_iommu_clear(struct tce_container *container,
 		if (direction == DMA_NONE)
 			continue;
 
+		if (container->v2) {
+			tce_iommu_unuse_page_v2(tbl, entry);
+			continue;
+		}
+
 		tce_iommu_unuse_page(container, oldhpa);
 	}
 
@@ -333,6 +489,64 @@ static long tce_iommu_build(struct tce_container *container,
 	return ret;
 }
 
+static long tce_iommu_build_v2(struct tce_container *container,
+		struct iommu_table *tbl,
+		unsigned long entry, unsigned long tce, unsigned long pages,
+		enum dma_data_direction direction)
+{
+	long i, ret = 0;
+	struct page *page;
+	unsigned long hpa;
+	enum dma_data_direction dirtmp;
+
+	for (i = 0; i < pages; ++i) {
+		struct mm_iommu_table_group_mem_t *mem = NULL;
+		unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl,
+				entry + i);
+
+		ret = tce_iommu_prereg_ua_to_hpa(tce, IOMMU_PAGE_SIZE(tbl),
+				&hpa, &mem);
+		if (ret)
+			break;
+
+		page = pfn_to_page(hpa >> PAGE_SHIFT);
+		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
+			ret = -EPERM;
+			break;
+		}
+
+		/* Preserve offset within IOMMU page */
+		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
+		dirtmp = direction;
+
+		/* The registered region is being unregistered */
+		if (mm_iommu_mapped_inc(mem))
+			break;
+
+		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
+		if (ret) {
+			/* dirtmp cannot be DMA_NONE here */
+			tce_iommu_unuse_page_v2(tbl, entry + i);
+			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
+					__func__, entry << tbl->it_page_shift,
+					tce, ret);
+			break;
+		}
+
+		if (dirtmp != DMA_NONE)
+			tce_iommu_unuse_page_v2(tbl, entry + i);
+
+		*pua = tce;
+
+		tce += IOMMU_PAGE_SIZE(tbl);
+	}
+
+	if (ret)
+		tce_iommu_clear(container, tbl, entry, i);
+
+	return ret;
+}
+
 static long tce_iommu_create_table(struct tce_container *container,
 			struct iommu_table_group *table_group,
 			int num,
@@ -358,6 +572,12 @@ static long tce_iommu_create_table(struct tce_container *container,
 	WARN_ON(!ret && !(*ptbl)->it_ops->free);
 	WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));
 
+	if (!ret && container->v2) {
+		ret = tce_iommu_userspace_view_alloc(*ptbl);
+		if (ret)
+			(*ptbl)->it_ops->free(*ptbl);
+	}
+
 	if (ret)
 		decrement_locked_vm(table_size >> PAGE_SHIFT);
 
@@ -368,6 +588,7 @@ static void tce_iommu_free_table(struct iommu_table *tbl)
 {
 	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
 
+	tce_iommu_userspace_view_free(tbl);
 	tbl->it_ops->free(tbl);
 	decrement_locked_vm(pages);
 }
@@ -383,6 +604,7 @@ static long tce_iommu_ioctl(void *iommu_data,
 	case VFIO_CHECK_EXTENSION:
 		switch (arg) {
 		case VFIO_SPAPR_TCE_IOMMU:
+		case VFIO_SPAPR_TCE_v2_IOMMU:
 			ret = 1;
 			break;
 		default:
@@ -394,12 +616,15 @@ static long tce_iommu_ioctl(void *iommu_data,
 
 	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
 		struct vfio_iommu_spapr_tce_info info;
+		struct tce_iommu_group *tcegrp;
 		struct iommu_table_group *table_group;
 
-		if (WARN_ON(!container->grp))
+		if (!tce_groups_attached(container))
 			return -ENXIO;
 
-		table_group = iommu_group_get_iommudata(container->grp);
+		tcegrp = list_first_entry(&container->group_list,
+				struct tce_iommu_group, next);
+		table_group = iommu_group_get_iommudata(tcegrp->grp);
 
 		if (!table_group)
 			return -ENXIO;
@@ -468,11 +693,18 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (ret)
 			return ret;
 
-		ret = tce_iommu_build(container, tbl,
-				param.iova >> tbl->it_page_shift,
-				param.vaddr,
-				param.size >> tbl->it_page_shift,
-				direction);
+		if (container->v2)
+			ret = tce_iommu_build_v2(container, tbl,
+					param.iova >> tbl->it_page_shift,
+					param.vaddr,
+					param.size >> tbl->it_page_shift,
+					direction);
+		else
+			ret = tce_iommu_build(container, tbl,
+					param.iova >> tbl->it_page_shift,
+					param.vaddr,
+					param.size >> tbl->it_page_shift,
+					direction);
 
 		iommu_flush_tce(tbl);
 
@@ -518,7 +750,62 @@ static long tce_iommu_ioctl(void *iommu_data,
 
 		return ret;
 	}
+	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
+		struct vfio_iommu_spapr_register_memory param;
+
+		if (!container->v2)
+			break;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
+				size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		/* No flag is supported now */
+		if (param.flags)
+			return -EINVAL;
+
+		mutex_lock(&container->lock);
+		ret = tce_iommu_register_pages(container, param.vaddr,
+				param.size);
+		mutex_unlock(&container->lock);
+
+		return ret;
+	}
+	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
+		struct vfio_iommu_spapr_register_memory param;
+
+		if (!container->v2)
+			break;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
+				size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		/* No flag is supported now */
+		if (param.flags)
+			return -EINVAL;
+
+		mutex_lock(&container->lock);
+		ret = tce_iommu_unregister_pages(container, param.vaddr,
+				param.size);
+		mutex_unlock(&container->lock);
+
+		return ret;
+	}
 	case VFIO_IOMMU_ENABLE:
+		if (container->v2)
+			break;
+
 		mutex_lock(&container->lock);
 		ret = tce_iommu_enable(container);
 		mutex_unlock(&container->lock);
@@ -526,16 +813,27 @@ static long tce_iommu_ioctl(void *iommu_data,
 
 
 	case VFIO_IOMMU_DISABLE:
+		if (container->v2)
+			break;
+
 		mutex_lock(&container->lock);
 		tce_iommu_disable(container);
 		mutex_unlock(&container->lock);
 		return 0;
-	case VFIO_EEH_PE_OP:
-		if (!container->grp)
-			return -ENODEV;
 
-		return vfio_spapr_iommu_eeh_ioctl(container->grp,
-						  cmd, arg);
+	case VFIO_EEH_PE_OP: {
+		struct tce_iommu_group *tcegrp;
+
+		ret = 0;
+		list_for_each_entry(tcegrp, &container->group_list, next) {
+			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
+					cmd, arg);
+			if (ret)
+				return ret;
+		}
+		return ret;
+	}
+
 	}
 
 	return -ENOTTY;
@@ -547,14 +845,17 @@ static void tce_iommu_release_ownership(struct tce_container *container,
 	int i;
 
 	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-		struct iommu_table *tbl = table_group->tables[i];
+		struct iommu_table *tbl = container->tables[i];
 
 		if (!tbl)
 			continue;
 
 		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+		tce_iommu_userspace_view_free(tbl);
 		if (tbl->it_map)
 			iommu_release_ownership(tbl);
+
+		container->tables[i] = NULL;
 	}
 }
 
@@ -569,7 +870,10 @@ static int tce_iommu_take_ownership(struct tce_container *container,
 		if (!tbl || !tbl->it_map)
 			continue;
 
-		rc = iommu_take_ownership(tbl);
+		rc = tce_iommu_userspace_view_alloc(tbl);
+		if (!rc)
+			rc = iommu_take_ownership(tbl);
+
 		if (rc) {
 			for (j = 0; j < i; ++j)
 				iommu_release_ownership(
@@ -579,6 +883,9 @@ static int tce_iommu_take_ownership(struct tce_container *container,
 		}
 	}
 
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
+		container->tables[i] = table_group->tables[i];
+
 	return 0;
 }
 
@@ -592,18 +899,8 @@ static void tce_iommu_release_ownership_ddw(struct tce_container *container,
 		return;
 	}
 
-	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-		/* Store table pointer as unset_window resets it */
-		struct iommu_table *tbl = table_group->tables[i];
-
-		if (!tbl)
-			continue;
-
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
 		table_group->ops->unset_window(table_group, i);
-		tce_iommu_clear(container, tbl,
-				tbl->it_offset, tbl->it_size);
-		tce_iommu_free_table(tbl);
-	}
 
 	table_group->ops->release_ownership(table_group);
 }
@@ -611,7 +908,7 @@ static void tce_iommu_release_ownership_ddw(struct tce_container *container,
 static long tce_iommu_take_ownership_ddw(struct tce_container *container,
 		struct iommu_table_group *table_group)
 {
-	long ret;
+	long i, ret = 0;
 	struct iommu_table *tbl = NULL;
 
 	if (!table_group->ops->create_table || !table_group->ops->set_window ||
@@ -622,23 +919,45 @@ static long tce_iommu_take_ownership_ddw(struct tce_container *container,
 
 	table_group->ops->take_ownership(table_group);
 
-	ret = tce_iommu_create_table(container,
-			table_group,
-			0, /* window number */
-			IOMMU_PAGE_SHIFT_4K,
-			table_group->tce32_size,
-			1, /* default levels */
-			&tbl);
-	if (!ret) {
-		ret = table_group->ops->set_window(table_group, 0, tbl);
+	/*
+	 * If it the first group attached, check if there is
+	 * a default DMA window and create one if none as
+	 * the userspace expects it to exist.
+	 */
+	if (!tce_groups_attached(container) && !container->tables[0]) {
+		ret = tce_iommu_create_table(container,
+				table_group,
+				0, /* window number */
+				IOMMU_PAGE_SHIFT_4K,
+				table_group->tce32_size,
+				1, /* default levels */
+				&tbl);
 		if (ret)
-			tce_iommu_free_table(tbl);
+			goto release_exit;
 		else
-			table_group->tables[0] = tbl;
+			container->tables[0] = tbl;
 	}
 
-	if (ret)
-		table_group->ops->release_ownership(table_group);
+	/* Set all windows to the new group */
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		tbl = container->tables[i];
+
+		if (!tbl)
+			continue;
+
+		/* Set the default window to a new group */
+		ret = table_group->ops->set_window(table_group, i, tbl);
+		if (ret)
+			goto release_exit;
+	}
+
+	return 0;
+
+release_exit:
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
+		table_group->ops->unset_window(table_group, i);
+
+	table_group->ops->release_ownership(table_group);
 
 	return ret;
 }
@@ -649,29 +968,44 @@ static int tce_iommu_attach_group(void *iommu_data,
 	int ret;
 	struct tce_container *container = iommu_data;
 	struct iommu_table_group *table_group;
+	struct tce_iommu_group *tcegrp = NULL;
 
 	mutex_lock(&container->lock);
 
 	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
 			iommu_group_id(iommu_group), iommu_group); */
-	if (container->grp) {
-		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
-				iommu_group_id(container->grp),
-				iommu_group_id(iommu_group));
+	table_group = iommu_group_get_iommudata(iommu_group);
+
+	if (tce_groups_attached(container) && (!table_group->ops ||
+			!table_group->ops->take_ownership ||
+			!table_group->ops->release_ownership)) {
 		ret = -EBUSY;
 		goto unlock_exit;
 	}
 
-	if (container->enabled) {
-		pr_err("tce_vfio: attaching group #%u to enabled container\n",
-				iommu_group_id(iommu_group));
-		ret = -EBUSY;
-		goto unlock_exit;
+	/* Check if new group has the same iommu_ops (i.e. compatible) */
+	list_for_each_entry(tcegrp, &container->group_list, next) {
+		struct iommu_table_group *table_group_tmp;
+
+		if (tcegrp->grp == iommu_group) {
+			pr_warn("tce_vfio: Group %d is already attached\n",
+					iommu_group_id(iommu_group));
+			ret = -EBUSY;
+			goto unlock_exit;
+		}
+		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
+		if (table_group_tmp->ops != table_group->ops) {
+			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
+					iommu_group_id(iommu_group),
+					iommu_group_id(tcegrp->grp));
+			ret = -EPERM;
+			goto unlock_exit;
+		}
 	}
 
-	table_group = iommu_group_get_iommudata(iommu_group);
-	if (!table_group) {
-		ret = -ENXIO;
+	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
+	if (!tcegrp) {
+		ret = -ENOMEM;
 		goto unlock_exit;
 	}
 
@@ -681,10 +1015,15 @@ static int tce_iommu_attach_group(void *iommu_data,
 	else
 		ret = tce_iommu_take_ownership_ddw(container, table_group);
 
-	if (!ret)
-		container->grp = iommu_group;
+	if (!ret) {
+		tcegrp->grp = iommu_group;
+		list_add(&tcegrp->next, &container->group_list);
+	}
 
 unlock_exit:
+	if (ret && tcegrp)
+		kfree(tcegrp);
+
 	mutex_unlock(&container->lock);
 
 	return ret;
@@ -695,24 +1034,26 @@ static void tce_iommu_detach_group(void *iommu_data,
 {
 	struct tce_container *container = iommu_data;
 	struct iommu_table_group *table_group;
+	bool found = false;
+	struct tce_iommu_group *tcegrp;
 
 	mutex_lock(&container->lock);
-	if (iommu_group != container->grp) {
-		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
-				iommu_group_id(iommu_group),
-				iommu_group_id(container->grp));
-		goto unlock_exit;
+
+	list_for_each_entry(tcegrp, &container->group_list, next) {
+		if (tcegrp->grp == iommu_group) {
+			found = true;
+			break;
+		}
 	}
 
-	if (container->enabled) {
-		pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
-				iommu_group_id(container->grp));
-		tce_iommu_disable(container);
+	if (!found) {
+		pr_warn("tce_vfio: detaching unattached group #%u\n",
+				iommu_group_id(iommu_group));
+		goto unlock_exit;
 	}
 
-	/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
-	   iommu_group_id(iommu_group), iommu_group); */
-	container->grp = NULL;
+	list_del(&tcegrp->next);
+	kfree(tcegrp);
 
 	table_group = iommu_group_get_iommudata(iommu_group);
 	BUG_ON(!table_group);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index e4fa199..fa84391 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -36,6 +36,8 @@
 /* Two-stage IOMMU */
 #define VFIO_TYPE1_NESTING_IOMMU	6	/* Implies v2 */
 
+#define VFIO_SPAPR_TCE_v2_IOMMU		7
+
 /*
  * The IOCTL interface is designed for extensibility by embedding the
  * structure length (argsz) and flags into structures passed between
@@ -507,6 +509,31 @@ struct vfio_eeh_pe_op {
 
 #define VFIO_EEH_PE_OP			_IO(VFIO_TYPE, VFIO_BASE + 21)
 
+/**
+ * VFIO_IOMMU_SPAPR_REGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 17, struct vfio_iommu_spapr_register_memory)
+ *
+ * Registers user space memory where DMA is allowed. It pins
+ * user pages and does the locked memory accounting so
+ * subsequent VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA calls
+ * get faster.
+ */
+struct vfio_iommu_spapr_register_memory {
+	__u32	argsz;
+	__u32	flags;
+	__u64	vaddr;				/* Process virtual address */
+	__u64	size;				/* Size of mapping (bytes) */
+};
+#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY	_IO(VFIO_TYPE, VFIO_BASE + 17)
+
+/**
+ * VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 18, struct vfio_iommu_spapr_register_memory)
+ *
+ * Unregisters user space memory registered with
+ * VFIO_IOMMU_SPAPR_REGISTER_MEMORY.
+ * Uses vfio_iommu_spapr_register_memory for parameters.
+ */
+#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY	_IO(VFIO_TYPE, VFIO_BASE + 18)
+
 /* ***************************************************************** */
 
 #endif /* _UAPIVFIO_H */
-- 
cgit v0.10.2


From e633bc86a922468a82300eef5b9802e17be5e23d Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Fri, 5 Jun 2015 16:35:26 +1000
Subject: vfio: powerpc/spapr: Support Dynamic DMA windows

This adds create/remove window ioctls to create and remove DMA windows.
sPAPR defines a Dynamic DMA windows capability which allows
para-virtualized guests to create additional DMA windows on a PCI bus.
The existing linux kernels use this new window to map the entire guest
memory and switch to the direct DMA operations saving time on map/unmap
requests which would normally happen in a big amounts.

This adds 2 ioctl handlers - VFIO_IOMMU_SPAPR_TCE_CREATE and
VFIO_IOMMU_SPAPR_TCE_REMOVE - to create and remove windows.
Up to 2 windows are supported now by the hardware and by this driver.

This changes VFIO_IOMMU_SPAPR_TCE_GET_INFO handler to return additional
information such as a number of supported windows and maximum number
levels of TCE tables.

DDW is added as a capability, not as a SPAPR TCE IOMMU v2 unique feature
as we still want to support v2 on platforms which cannot do DDW for
the sake of TCE acceleration in KVM (coming soon).

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index dcc37e1..1dd3fdd 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -464,6 +464,25 @@ address is from pre-registered range.
 
 This separation helps in optimizing DMA for guests.
 
+6) sPAPR specification allows guests to have an additional DMA window(s) on
+a PCI bus with a variable page size. Two ioctls have been added to support
+this: VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE.
+The platform has to support the functionality or error will be returned to
+the userspace. The existing hardware supports up to 2 DMA windows, one is
+2GB long, uses 4K pages and called "default 32bit window"; the other can
+be as big as entire RAM, use different page size, it is optional - guests
+create those in run-time if the guest driver supports 64bit DMA.
+
+VFIO_IOMMU_SPAPR_TCE_CREATE receives a page shift, a DMA window size and
+a number of TCE table levels (if a TCE table is going to be big enough and
+the kernel may not be able to allocate enough of physically contiguous memory).
+It creates a new window in the available slot and returns the bus address where
+the new window starts. Due to hardware limitation, the user space cannot choose
+the location of DMA windows.
+
+VFIO_IOMMU_SPAPR_TCE_REMOVE receives the bus start address of the window
+and removes it.
+
 -------------------------------------------------------------------------------
 
 [1] VFIO was originally an acronym for "Virtual Function I/O" in its
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index f9957eb..ca18cff 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -149,7 +149,7 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
 					    int nid);
-#define IOMMU_TABLE_GROUP_MAX_TABLES	1
+#define IOMMU_TABLE_GROUP_MAX_TABLES	2
 
 struct iommu_table_group;
 
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 91a3223..0582b72 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -211,6 +211,18 @@ static long tce_iommu_find_table(struct tce_container *container,
 	return -1;
 }
 
+static int tce_iommu_find_free_table(struct tce_container *container)
+{
+	int i;
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		if (!container->tables[i])
+			return i;
+	}
+
+	return -ENOSPC;
+}
+
 static int tce_iommu_enable(struct tce_container *container)
 {
 	int ret = 0;
@@ -593,11 +605,115 @@ static void tce_iommu_free_table(struct iommu_table *tbl)
 	decrement_locked_vm(pages);
 }
 
+static long tce_iommu_create_window(struct tce_container *container,
+		__u32 page_shift, __u64 window_size, __u32 levels,
+		__u64 *start_addr)
+{
+	struct tce_iommu_group *tcegrp;
+	struct iommu_table_group *table_group;
+	struct iommu_table *tbl = NULL;
+	long ret, num;
+
+	num = tce_iommu_find_free_table(container);
+	if (num < 0)
+		return num;
+
+	/* Get the first group for ops::create_table */
+	tcegrp = list_first_entry(&container->group_list,
+			struct tce_iommu_group, next);
+	table_group = iommu_group_get_iommudata(tcegrp->grp);
+	if (!table_group)
+		return -EFAULT;
+
+	if (!(table_group->pgsizes & (1ULL << page_shift)))
+		return -EINVAL;
+
+	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
+			!table_group->ops->get_table_size ||
+			!table_group->ops->create_table)
+		return -EPERM;
+
+	/* Create TCE table */
+	ret = tce_iommu_create_table(container, table_group, num,
+			page_shift, window_size, levels, &tbl);
+	if (ret)
+		return ret;
+
+	BUG_ON(!tbl->it_ops->free);
+
+	/*
+	 * Program the table to every group.
+	 * Groups have been tested for compatibility at the attach time.
+	 */
+	list_for_each_entry(tcegrp, &container->group_list, next) {
+		table_group = iommu_group_get_iommudata(tcegrp->grp);
+
+		ret = table_group->ops->set_window(table_group, num, tbl);
+		if (ret)
+			goto unset_exit;
+	}
+
+	container->tables[num] = tbl;
+
+	/* Return start address assigned by platform in create_table() */
+	*start_addr = tbl->it_offset << tbl->it_page_shift;
+
+	return 0;
+
+unset_exit:
+	list_for_each_entry(tcegrp, &container->group_list, next) {
+		table_group = iommu_group_get_iommudata(tcegrp->grp);
+		table_group->ops->unset_window(table_group, num);
+	}
+	tce_iommu_free_table(tbl);
+
+	return ret;
+}
+
+static long tce_iommu_remove_window(struct tce_container *container,
+		__u64 start_addr)
+{
+	struct iommu_table_group *table_group = NULL;
+	struct iommu_table *tbl;
+	struct tce_iommu_group *tcegrp;
+	int num;
+
+	num = tce_iommu_find_table(container, start_addr, &tbl);
+	if (num < 0)
+		return -EINVAL;
+
+	BUG_ON(!tbl->it_size);
+
+	/* Detach groups from IOMMUs */
+	list_for_each_entry(tcegrp, &container->group_list, next) {
+		table_group = iommu_group_get_iommudata(tcegrp->grp);
+
+		/*
+		 * SPAPR TCE IOMMU exposes the default DMA window to
+		 * the guest via dma32_window_start/size of
+		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
+		 * the userspace to remove this window, some do not so
+		 * here we check for the platform capability.
+		 */
+		if (!table_group->ops || !table_group->ops->unset_window)
+			return -EPERM;
+
+		table_group->ops->unset_window(table_group, num);
+	}
+
+	/* Free table */
+	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+	tce_iommu_free_table(tbl);
+	container->tables[num] = NULL;
+
+	return 0;
+}
+
 static long tce_iommu_ioctl(void *iommu_data,
 				 unsigned int cmd, unsigned long arg)
 {
 	struct tce_container *container = iommu_data;
-	unsigned long minsz;
+	unsigned long minsz, ddwsz;
 	long ret;
 
 	switch (cmd) {
@@ -641,6 +757,21 @@ static long tce_iommu_ioctl(void *iommu_data,
 		info.dma32_window_start = table_group->tce32_start;
 		info.dma32_window_size = table_group->tce32_size;
 		info.flags = 0;
+		memset(&info.ddw, 0, sizeof(info.ddw));
+
+		if (table_group->max_dynamic_windows_supported &&
+				container->v2) {
+			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
+			info.ddw.pgsizes = table_group->pgsizes;
+			info.ddw.max_dynamic_windows_supported =
+				table_group->max_dynamic_windows_supported;
+			info.ddw.levels = table_group->max_levels;
+		}
+
+		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);
+
+		if (info.argsz >= ddwsz)
+			minsz = ddwsz;
 
 		if (copy_to_user((void __user *)arg, &info, minsz))
 			return -EFAULT;
@@ -834,6 +965,69 @@ static long tce_iommu_ioctl(void *iommu_data,
 		return ret;
 	}
 
+	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
+		struct vfio_iommu_spapr_tce_create create;
+
+		if (!container->v2)
+			break;
+
+		if (!tce_groups_attached(container))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
+				start_addr);
+
+		if (copy_from_user(&create, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (create.argsz < minsz)
+			return -EINVAL;
+
+		if (create.flags)
+			return -EINVAL;
+
+		mutex_lock(&container->lock);
+
+		ret = tce_iommu_create_window(container, create.page_shift,
+				create.window_size, create.levels,
+				&create.start_addr);
+
+		mutex_unlock(&container->lock);
+
+		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
+			ret = -EFAULT;
+
+		return ret;
+	}
+	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
+		struct vfio_iommu_spapr_tce_remove remove;
+
+		if (!container->v2)
+			break;
+
+		if (!tce_groups_attached(container))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
+				start_addr);
+
+		if (copy_from_user(&remove, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (remove.argsz < minsz)
+			return -EINVAL;
+
+		if (remove.flags)
+			return -EINVAL;
+
+		mutex_lock(&container->lock);
+
+		ret = tce_iommu_remove_window(container, remove.start_addr);
+
+		mutex_unlock(&container->lock);
+
+		return ret;
+	}
 	}
 
 	return -ENOTTY;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index fa84391..9fd7b5d 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -445,6 +445,23 @@ struct vfio_iommu_type1_dma_unmap {
 /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
 
 /*
+ * The SPAPR TCE DDW info struct provides the information about
+ * the details of Dynamic DMA window capability.
+ *
+ * @pgsizes contains a page size bitmask, 4K/64K/16M are supported.
+ * @max_dynamic_windows_supported tells the maximum number of windows
+ * which the platform can create.
+ * @levels tells the maximum number of levels in multi-level IOMMU tables;
+ * this allows splitting a table into smaller chunks which reduces
+ * the amount of physically contiguous memory required for the table.
+ */
+struct vfio_iommu_spapr_tce_ddw_info {
+	__u64 pgsizes;			/* Bitmap of supported page sizes */
+	__u32 max_dynamic_windows_supported;
+	__u32 levels;
+};
+
+/*
  * The SPAPR TCE info struct provides the information about the PCI bus
  * address ranges available for DMA, these values are programmed into
  * the hardware so the guest has to know that information.
@@ -454,14 +471,17 @@ struct vfio_iommu_type1_dma_unmap {
  * addresses too so the window works as a filter rather than an offset
  * for IOVA addresses.
  *
- * A flag will need to be added if other page sizes are supported,
- * so as defined here, it is always 4k.
+ * Flags supported:
+ * - VFIO_IOMMU_SPAPR_INFO_DDW: informs the userspace that dynamic DMA windows
+ *   (DDW) support is present. @ddw is only supported when DDW is present.
  */
 struct vfio_iommu_spapr_tce_info {
 	__u32 argsz;
-	__u32 flags;			/* reserved for future use */
+	__u32 flags;
+#define VFIO_IOMMU_SPAPR_INFO_DDW	(1 << 0)	/* DDW supported */
 	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
 	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
+	struct vfio_iommu_spapr_tce_ddw_info ddw;
 };
 
 #define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
@@ -534,6 +554,41 @@ struct vfio_iommu_spapr_register_memory {
  */
 #define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY	_IO(VFIO_TYPE, VFIO_BASE + 18)
 
+/**
+ * VFIO_IOMMU_SPAPR_TCE_CREATE - _IOWR(VFIO_TYPE, VFIO_BASE + 19, struct vfio_iommu_spapr_tce_create)
+ *
+ * Creates an additional TCE table and programs it (sets a new DMA window)
+ * to every IOMMU group in the container. It receives page shift, window
+ * size and number of levels in the TCE table being created.
+ *
+ * It allocates and returns an offset on a PCI bus of the new DMA window.
+ */
+struct vfio_iommu_spapr_tce_create {
+	__u32 argsz;
+	__u32 flags;
+	/* in */
+	__u32 page_shift;
+	__u64 window_size;
+	__u32 levels;
+	/* out */
+	__u64 start_addr;
+};
+#define VFIO_IOMMU_SPAPR_TCE_CREATE	_IO(VFIO_TYPE, VFIO_BASE + 19)
+
+/**
+ * VFIO_IOMMU_SPAPR_TCE_REMOVE - _IOW(VFIO_TYPE, VFIO_BASE + 20, struct vfio_iommu_spapr_tce_remove)
+ *
+ * Unprograms a TCE table from all groups in the container and destroys it.
+ * It receives a PCI bus offset as a window id.
+ */
+struct vfio_iommu_spapr_tce_remove {
+	__u32 argsz;
+	__u32 flags;
+	/* in */
+	__u64 start_addr;
+};
+#define VFIO_IOMMU_SPAPR_TCE_REMOVE	_IO(VFIO_TYPE, VFIO_BASE + 20)
+
 /* ***************************************************************** */
 
 #endif /* _UAPIVFIO_H */
-- 
cgit v0.10.2


From b91c1e3e7a6f22a6b898e345b745b6a43273c973 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Tue, 26 May 2015 08:53:25 +1000
Subject: powerpc: Fix duplicate const clang warning in user access code

We see a large number of duplicate const errors in the user access
code when building with llvm/clang:

  include/linux/pagemap.h:576:8: warning: duplicate 'const' declaration specifier
      [-Wduplicate-decl-specifier]
        ret = __get_user(c, uaddr);

The problem is we are doing const __typeof__(*(ptr)), which will hit the
warning if ptr is marked const.

Removing const does not seem to have any effect on GCC code generation.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index a0c071d..2a8ebae 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -265,7 +265,7 @@ do {								\
 ({								\
 	long __gu_err;						\
 	unsigned long __gu_val;					\
-	const __typeof__(*(ptr)) __user *__gu_addr = (ptr);	\
+	__typeof__(*(ptr)) __user *__gu_addr = (ptr);	\
 	__chk_user_ptr(ptr);					\
 	if (!is_kernel_addr((unsigned long)__gu_addr))		\
 		might_fault();					\
@@ -279,7 +279,7 @@ do {								\
 ({								\
 	long __gu_err;						\
 	long long __gu_val;					\
-	const __typeof__(*(ptr)) __user *__gu_addr = (ptr);	\
+	__typeof__(*(ptr)) __user *__gu_addr = (ptr);	\
 	__chk_user_ptr(ptr);					\
 	if (!is_kernel_addr((unsigned long)__gu_addr))		\
 		might_fault();					\
@@ -293,7 +293,7 @@ do {								\
 ({									\
 	long __gu_err = -EFAULT;					\
 	unsigned long  __gu_val = 0;					\
-	const __typeof__(*(ptr)) __user *__gu_addr = (ptr);		\
+	__typeof__(*(ptr)) __user *__gu_addr = (ptr);		\
 	might_fault();							\
 	if (access_ok(VERIFY_READ, __gu_addr, (size)))			\
 		__get_user_size(__gu_val, __gu_addr, (size), __gu_err);	\
@@ -305,7 +305,7 @@ do {								\
 ({								\
 	long __gu_err;						\
 	unsigned long __gu_val;					\
-	const __typeof__(*(ptr)) __user *__gu_addr = (ptr);	\
+	__typeof__(*(ptr)) __user *__gu_addr = (ptr);	\
 	__chk_user_ptr(ptr);					\
 	__get_user_size(__gu_val, __gu_addr, (size), __gu_err);	\
 	(x) = (__force __typeof__(*(ptr)))__gu_val;			\
-- 
cgit v0.10.2


From 1fb3f5a7ca599f322e6bf21272ad215301159aa0 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Tue, 26 May 2015 08:53:26 +1000
Subject: powerpc: Only use -mabi=altivec if toolchain supports it

The -mabi=altivec option is not recognised on LLVM, so use call cc-option
to check for support.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 7902802..a47e142 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -33,6 +33,6 @@ obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o
 obj-$(CONFIG_FTR_FIXUP_SELFTEST) += feature-fixups-test.o
 
 obj-$(CONFIG_ALTIVEC)	+= xor_vmx.o
-CFLAGS_xor_vmx.o += -maltivec -mabi=altivec
+CFLAGS_xor_vmx.o += -maltivec $(call cc-option,-mabi=altivec)
 
 obj-$(CONFIG_PPC64) += $(obj64-y)
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index c7dab06..3b10a48 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -15,7 +15,7 @@ quiet_cmd_unroll = UNROLL  $@
                    < $< > $@ || ( rm -f $@ && exit 1 )
 
 ifeq ($(CONFIG_ALTIVEC),y)
-altivec_flags := -maltivec -mabi=altivec
+altivec_flags := -maltivec $(call cc-option,-mabi=altivec)
 endif
 
 # The GCC option -ffreestanding is required in order to compile code containing
-- 
cgit v0.10.2


From a50a862e74a9d7b447df30c33aa3d534ce3a977a Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Tue, 26 May 2015 08:53:27 +1000
Subject: powerpc: Only use -mtraceback=no, -mno-string and -msoft-float if
 toolchain supports it

These options are not recognised on LLVM, so use call cc-option to check
for support.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index edf39ce..799e67d 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -113,7 +113,7 @@ else
 endif
 endif
 
-CFLAGS-$(CONFIG_PPC64)	:= -mtraceback=no
+CFLAGS-$(CONFIG_PPC64)	:= $(call cc-option,-mtraceback=no)
 ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
 CFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mabi=elfv2,-mcall-aixdesc)
 AFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mabi=elfv2)
@@ -160,7 +160,8 @@ asinstr := $(call as-instr,lis 9$(comma)foo@high,-DHAVE_AS_ATHIGH=1)
 
 KBUILD_CPPFLAGS	+= -Iarch/$(ARCH) $(asinstr)
 KBUILD_AFLAGS	+= -Iarch/$(ARCH) $(AFLAGS-y)
-KBUILD_CFLAGS	+= -msoft-float -pipe -Iarch/$(ARCH) $(CFLAGS-y)
+KBUILD_CFLAGS	+= $(call cc-option,-msoft-float)
+KBUILD_CFLAGS	+= -pipe -Iarch/$(ARCH) $(CFLAGS-y)
 CPP		= $(CC) -E $(KBUILD_CFLAGS)
 
 CHECKFLAGS	+= -m$(CONFIG_WORD_SIZE) -D__powerpc__ -D__powerpc$(CONFIG_WORD_SIZE)__
@@ -192,7 +193,7 @@ KBUILD_CFLAGS	+= $(call cc-option,-fno-dwarf2-cfi-asm)
 
 # Never use string load/store instructions as they are
 # often slow when they are implemented at all
-KBUILD_CFLAGS		+= -mno-string
+KBUILD_CFLAGS		+= $(call cc-option,-mno-string)
 
 ifeq ($(CONFIG_6xx),y)
 KBUILD_CFLAGS		+= -mcpu=powerpc
-- 
cgit v0.10.2


From 92d6cf2dab83cfb00b3d80fd38a38a77c6873277 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Tue, 26 May 2015 08:53:28 +1000
Subject: powerpc: Don't use -mno-strict-align on clang

We added -mno-strict-align in commit f036b3681962 (powerpc: Work around little
endian gcc bug) to fix gcc bug http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57134

Clang doesn't understand it. We need to use a conditional because we can't use the
simpler call cc-option here.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 799e67d..1eec1a0 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -66,7 +66,10 @@ endif
 UTS_MACHINE := $(OLDARCH)
 
 ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
-override CC	+= -mlittle-endian -mno-strict-align
+override CC	+= -mlittle-endian
+ifneq ($(COMPILER),clang)
+override CC	+= -mno-strict-align
+endif
 override AS	+= -mlittle-endian
 override LD	+= -EL
 override CROSS32CC += -mlittle-endian
-- 
cgit v0.10.2


From 238abecde8ad43f914e095fcf23e0bd35dc7a7f2 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Tue, 26 May 2015 08:53:29 +1000
Subject: powerpc: Don't use gcc specific options on clang

We have code to choose between several options, eg. -mabi=elfv2 vs
-mcall-aixdesc, and -mcmodel=medium vs -mminimal-toc. But these are all
GCC specific, so use cc-option on all of them.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 1eec1a0..05f464e 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -118,12 +118,12 @@ endif
 
 CFLAGS-$(CONFIG_PPC64)	:= $(call cc-option,-mtraceback=no)
 ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
-CFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mabi=elfv2,-mcall-aixdesc)
+CFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mabi=elfv2,$(call cc-option,-mcall-aixdesc))
 AFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mabi=elfv2)
 else
-CFLAGS-$(CONFIG_PPC64)	+= -mcall-aixdesc
+CFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mcall-aixdesc)
 endif
-CFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mcmodel=medium,-mminimal-toc)
+CFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mcmodel=medium,$(call cc-option,-mminimal-toc))
 CFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mno-pointers-to-nested-functions)
 CFLAGS-$(CONFIG_PPC32)	:= -ffixed-r2 $(MULTIPLEWORD)
 
-- 
cgit v0.10.2


From 2dc710e93c303f2a33d4fecbcc7e8088f2776604 Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Sun, 31 May 2015 11:01:33 +1000
Subject: macintosh/nvram: Remove as unused

The full history Linux tree says it was moved:

  commit 17c6f4635bea74e110ab3558d408c9cd218c568a
  Author: Benjamin Herrenschmidt <benh@kernel.crashing.org>
  Date:   Fri Feb 6 14:20:56 2004 +1100

      ppc32: Rework nvram management

      move drivers/macintosh/nvram.c to drivers/char/generic_nvram.c,
      update platform hooks,
      fix powermac nvram driver for newer machines

But it's still there, and not buildable, so remove it.

Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/macintosh/nvram.c b/drivers/macintosh/nvram.c
deleted file mode 100644
index f0e03e7..0000000
--- a/drivers/macintosh/nvram.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * /dev/nvram driver for Power Macintosh.
- */
-
-#define NVRAM_VERSION "1.0"
-
-#include <linux/module.h>
-
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/miscdevice.h>
-#include <linux/fcntl.h>
-#include <linux/nvram.h>
-#include <linux/init.h>
-#include <asm/uaccess.h>
-#include <asm/nvram.h>
-
-#define NVRAM_SIZE	8192
-
-static loff_t nvram_llseek(struct file *file, loff_t offset, int origin)
-{
-	switch (origin) {
-	case 0:
-		break;
-	case 1:
-		offset += file->f_pos;
-		break;
-	case 2:
-		offset += NVRAM_SIZE;
-		break;
-	default:
-		offset = -1;
-	}
-	if (offset < 0)
-		return -EINVAL;
-
-	file->f_pos = offset;
-	return file->f_pos;
-}
-
-static ssize_t read_nvram(struct file *file, char __user *buf,
-			  size_t count, loff_t *ppos)
-{
-	unsigned int i;
-	char __user *p = buf;
-
-	if (!access_ok(VERIFY_WRITE, buf, count))
-		return -EFAULT;
-	if (*ppos >= NVRAM_SIZE)
-		return 0;
-	for (i = *ppos; count > 0 && i < NVRAM_SIZE; ++i, ++p, --count)
-		if (__put_user(nvram_read_byte(i), p))
-			return -EFAULT;
-	*ppos = i;
-	return p - buf;
-}
-
-static ssize_t write_nvram(struct file *file, const char __user *buf,
-			   size_t count, loff_t *ppos)
-{
-	unsigned int i;
-	const char __user *p = buf;
-	char c;
-
-	if (!access_ok(VERIFY_READ, buf, count))
-		return -EFAULT;
-	if (*ppos >= NVRAM_SIZE)
-		return 0;
-	for (i = *ppos; count > 0 && i < NVRAM_SIZE; ++i, ++p, --count) {
-		if (__get_user(c, p))
-			return -EFAULT;
-		nvram_write_byte(c, i);
-	}
-	*ppos = i;
-	return p - buf;
-}
-
-static long nvram_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
-	switch(cmd) {
-		case PMAC_NVRAM_GET_OFFSET:
-		{
-			int part, offset;
-			if (copy_from_user(&part, (void __user*)arg, sizeof(part)) != 0)
-				return -EFAULT;
-			if (part < pmac_nvram_OF || part > pmac_nvram_NR)
-				return -EINVAL;
-			offset = pmac_get_partition(part);
-			if (copy_to_user((void __user*)arg, &offset, sizeof(offset)) != 0)
-				return -EFAULT;
-			break;
-		}
-
-		default:
-			return -EINVAL;
-	}
-
-	return 0;
-}
-
-const struct file_operations nvram_fops = {
-	.owner		= THIS_MODULE,
-	.llseek		= nvram_llseek,
-	.read		= read_nvram,
-	.write		= write_nvram,
-	.unlocked_ioctl	= nvram_ioctl,
-};
-
-static struct miscdevice nvram_dev = {
-	NVRAM_MINOR,
-	"nvram",
-	&nvram_fops
-};
-
-int __init nvram_init(void)
-{
-	printk(KERN_INFO "Macintosh non-volatile memory driver v%s\n",
-		NVRAM_VERSION);
-	return misc_register(&nvram_dev);
-}
-
-void __exit nvram_cleanup(void)
-{
-        misc_deregister( &nvram_dev );
-}
-
-module_init(nvram_init);
-module_exit(nvram_cleanup);
-MODULE_LICENSE("GPL");
-- 
cgit v0.10.2


From 4bece972fce6e597cb513bdcae4a04e14fc0dd81 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 15 Jun 2015 15:01:32 +1000
Subject: powerpc/powernv: pnv_init_idle_states() should only run on powernv

Although this init call checks for device tree properties before doing
anything, it should still only run on powernv machines.

Reviewed-by: Shreyas B Prabhu <shreyas@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index bd39a12..59d735d 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -17,6 +17,7 @@
 #include <linux/cpu.h>
 
 #include <asm/firmware.h>
+#include <asm/machdep.h>
 #include <asm/opal.h>
 #include <asm/cputhreads.h>
 #include <asm/cpuidle.h>
@@ -289,5 +290,4 @@ out_free:
 out:
 	return 0;
 }
-
-subsys_initcall(pnv_init_idle_states);
+machine_subsys_initcall(powernv, pnv_init_idle_states);
-- 
cgit v0.10.2


From 3609d819a36c65857816ca1278d80767d6d9b990 Mon Sep 17 00:00:00 2001
From: "Shreyas B. Prabhu" <shreyas@linux.vnet.ibm.com>
Date: Wed, 20 May 2015 00:30:14 +0530
Subject: powerpc: Make doorbell check preemption safe

Doorbell can be used to cause ipi on cpus which are sibling threads on
the same core. So icp_native_cause_ipi checks if the destination cpu
is a sibling thread of the current cpu and uses doorbell in such cases.

But while running with CONFIG_PREEMPT=y, since this section is
preemtible, we can run into issues if after we check if the destination
cpu is a sibling cpu, the task gets migrated from a sibling cpu to a
cpu on another core.

Fix this by using get_cpu()/ put_cpu()

Signed-off-by: Shreyas B. Prabhu <shreyas@linux.vnet.ibm.com>
Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c
index 2fc4cf1..eae3265 100644
--- a/arch/powerpc/sysdev/xics/icp-native.c
+++ b/arch/powerpc/sysdev/xics/icp-native.c
@@ -147,12 +147,16 @@ static void icp_native_cause_ipi(int cpu, unsigned long data)
 {
 	kvmppc_set_host_ipi(cpu, 1);
 #ifdef CONFIG_PPC_DOORBELL
-	if (cpu_has_feature(CPU_FTR_DBELL) &&
-	    (cpumask_test_cpu(cpu, cpu_sibling_mask(smp_processor_id()))))
-		doorbell_cause_ipi(cpu, data);
-	else
+	if (cpu_has_feature(CPU_FTR_DBELL)) {
+		if (cpumask_test_cpu(cpu, cpu_sibling_mask(get_cpu()))) {
+			doorbell_cause_ipi(cpu, data);
+			put_cpu();
+			return;
+		}
+		put_cpu();
+	}
 #endif
-		icp_native_set_qirr(cpu, IPI_PRIORITY);
+	icp_native_set_qirr(cpu, IPI_PRIORITY);
 }
 
 /*
-- 
cgit v0.10.2


From 02b6505c8f84cbb4be459c9bf8ebbb7b0754e764 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Wed, 17 Jun 2015 11:36:57 +1000
Subject: powerpc/powernv: Increase opal-irqchip initcall priority

The eeh subsystem for powernv requires the opal event irqchip to be
initialised prior to initialisation or the following errors are
produced (and eeh doesn't work as expected):

irq: XICS didn't like hwirq-0x9 to VIRQ17 mapping (rc=-22)
pnv_eeh_post_init: Can't request OPAL event interrupt (0)

On powernv eeh is initialised from a subsys_initcall due to a check
for machine_is(powernv) in eeh_init(). This patch increases the
initcall priority of opal_event_init() to an arch_initcall to ensure
the opal event interface is initialised prior to any users of it.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Reported-by: Daniel Axtens <dja@axtens.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c
index 841135f..e2e7d75 100644
--- a/arch/powerpc/platforms/powernv/opal-irqchip.c
+++ b/arch/powerpc/platforms/powernv/opal-irqchip.c
@@ -231,6 +231,7 @@ out:
 	of_node_put(opal_node);
 	return rc;
 }
+machine_arch_initcall(powernv, opal_event_init);
 
 /**
  * opal_event_request(unsigned int opal_event_nr) - Request an event
@@ -244,6 +245,9 @@ out:
  */
 int opal_event_request(unsigned int opal_event_nr)
 {
+	if (WARN_ON_ONCE(!opal_event_irqchip.domain))
+		return NO_IRQ;
+
 	return irq_create_mapping(opal_event_irqchip.domain, opal_event_nr);
 }
 EXPORT_SYMBOL(opal_event_request);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 9e9c483..f084afa 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -657,9 +657,6 @@ static int __init opal_init(void)
 		return -ENODEV;
 	}
 
-	/* Initialise OPAL events */
-	opal_event_init();
-
 	/* Register OPAL consoles if any ports */
 	if (firmware_has_feature(FW_FEATURE_OPALv2))
 		consoles = of_find_node_by_path("/ibm,opal/consoles");
-- 
cgit v0.10.2


From 7185795a62589292015484985635b4a38029a2b9 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@ozlabs.org>
Date: Mon, 15 Jun 2015 17:25:34 +0800
Subject: powerpc/powernv: fix construction of opal PRD messages

We currently have a bug in the PRD code, where the contents of an
incoming message (beyond the header) will be overwritten by the list
item manipulations when adding to to the prd_msg_queue.

This change reorders struct opal_prd_msg_queue_item, so that the
message body doesn't overlap the list_head.

We also clarify the memcpy of the message, as we're copying unnecessary
bytes at the end of the message data.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Acked-by: Stewart Smith <stewart@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c
index d9e72bd..46cb3fe 100644
--- a/arch/powerpc/platforms/powernv/opal-prd.c
+++ b/arch/powerpc/platforms/powernv/opal-prd.c
@@ -32,9 +32,13 @@
 #include <asm/uaccess.h>
 
 
+/**
+ * The msg member must be at the end of the struct, as it's followed by the
+ * message data.
+ */
 struct opal_prd_msg_queue_item {
-	struct opal_prd_msg_header	msg;
 	struct list_head		list;
+	struct opal_prd_msg_header	msg;
 };
 
 static struct device_node *prd_node;
@@ -351,22 +355,23 @@ static int opal_prd_msg_notifier(struct notifier_block *nb,
 	struct opal_prd_msg_queue_item *item;
 	struct opal_prd_msg_header *hdr;
 	struct opal_msg *msg = _msg;
+	int msg_size, item_size;
 	unsigned long flags;
-	int size;
 
 	if (msg_type != OPAL_MSG_PRD)
 		return 0;
 
-	/* Calculate total size of the item we need to store. The 'size' field
-	 * in the header includes the header itself. */
+	/* Calculate total size of the message and item we need to store. The
+	 * 'size' field in the header includes the header itself. */
 	hdr = (void *)msg->params;
-	size = (sizeof(*item) - sizeof(item->msg)) + be16_to_cpu(hdr->size);
+	msg_size = be16_to_cpu(hdr->size);
+	item_size = msg_size + sizeof(*item) - sizeof(item->msg);
 
-	item = kzalloc(size, GFP_ATOMIC);
+	item = kzalloc(item_size, GFP_ATOMIC);
 	if (!item)
 		return -ENOMEM;
 
-	memcpy(&item->msg, msg->params, size);
+	memcpy(&item->msg, msg->params, msg_size);
 
 	spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
 	list_add_tail(&item->list, &opal_prd_msg_queue);
-- 
cgit v0.10.2


From cdf2bc1c283066d6f0834cc5dd4fcf5d01e44af9 Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@ozlabs.org>
Date: Wed, 17 Jun 2015 08:35:39 +0800
Subject: powerpc/include: Add opal-prd to installed uapi headers

We'll want to build the opal-prd daemon with the prd headers, so include
this in the uapi headers list.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild
index 79c4068..f44a0278 100644
--- a/arch/powerpc/include/uapi/asm/Kbuild
+++ b/arch/powerpc/include/uapi/asm/Kbuild
@@ -18,6 +18,7 @@ header-y += kvm_para.h
 header-y += mman.h
 header-y += msgbuf.h
 header-y += nvram.h
+header-y += opal-prd.h
 header-y += param.h
 header-y += perf_event.h
 header-y += poll.h
-- 
cgit v0.10.2


From b5926430dfa07d17e5d768c16b0d81c13a793f7c Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Mon, 15 Jun 2015 17:49:59 +1000
Subject: powerpc/iommu/ioda2: Enable compile with IOV=on and IOMMU_API=off

The pnv_pci_ioda2_unset_window() function is used to do the final
cleanup of a DMA window being released:
- via VFIO ioctl by the guest request;
- via unplugging a virtual PCI function.
However the function was under #ifdef CONFIG_IOMMU_API and was missing.

This moves the helper outside of IOMMU_API block and enables it
for either or both IOMMU_API and PCI_IOV.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index b9f0f43..8424f5c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2116,6 +2116,32 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
 	return 0;
 }
 
+#if defined(CONFIG_IOMMU_API) || defined(CONFIG_PCI_IOV)
+static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
+		int num)
+{
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
+	struct pnv_phb *phb = pe->phb;
+	long ret;
+
+	pe_info(pe, "Removing DMA window #%d\n", num);
+
+	ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
+			(pe->pe_number << 1) + num,
+			0/* levels */, 0/* table address */,
+			0/* table size */, 0/* page size */);
+	if (ret)
+		pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
+	else
+		pnv_pci_ioda2_tce_invalidate_entire(pe);
+
+	pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
+
+	return ret;
+}
+#endif
+
 #ifdef CONFIG_IOMMU_API
 static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
 		__u64 window_size, __u32 levels)
@@ -2149,30 +2175,6 @@ static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
 	return bytes;
 }
 
-static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
-		int num)
-{
-	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
-			table_group);
-	struct pnv_phb *phb = pe->phb;
-	long ret;
-
-	pe_info(pe, "Removing DMA window #%d\n", num);
-
-	ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
-			(pe->pe_number << 1) + num,
-			0/* levels */, 0/* table address */,
-			0/* table size */, 0/* page size */);
-	if (ret)
-		pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
-	else
-		pnv_pci_ioda2_tce_invalidate_entire(pe);
-
-	pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
-
-	return ret;
-}
-
 static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
 {
 	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
-- 
cgit v0.10.2


From b4b56f9ecab40f3b4ef53e130c9f6663be491894 Mon Sep 17 00:00:00 2001
From: Sam bobroff <sam.bobroff@au1.ibm.com>
Date: Fri, 12 Jun 2015 11:06:32 +1000
Subject: powerpc/tm: Abort syscalls in active transactions

This patch changes the syscall handler to doom (tabort) active
transactions when a syscall is made and return very early without
performing the syscall and keeping side effects to a minimum (no CPU
accounting or system call tracing is performed). Also included is a
new HWCAP2 bit, PPC_FEATURE2_HTM_NOSC, to indicate this
behaviour to userspace.

Currently, the system call instruction automatically suspends an
active transaction which causes side effects to persist when an active
transaction fails.

This does change the kernel's behaviour, but in a way that was
documented as unsupported.  It doesn't reduce functionality as
syscalls will still be performed after tsuspend; it just requires that
the transaction be explicitly suspended.  It also provides a
consistent interface and makes the behaviour of user code
substantially the same across powerpc and platforms that do not
support suspended transactions (e.g. x86 and s390).

Performance measurements using
http://ozlabs.org/~anton/junkcode/null_syscall.c indicate the cost of
a normal (non-aborted) system call increases by about 0.25%.

Signed-off-by: Sam Bobroff <sam.bobroff@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/Documentation/powerpc/transactional_memory.txt b/Documentation/powerpc/transactional_memory.txt
index ded6979..ba0a2a4 100644
--- a/Documentation/powerpc/transactional_memory.txt
+++ b/Documentation/powerpc/transactional_memory.txt
@@ -74,22 +74,23 @@ Causes of transaction aborts
 Syscalls
 ========
 
-Performing syscalls from within transaction is not recommended, and can lead
-to unpredictable results.
+Syscalls made from within an active transaction will not be performed and the
+transaction will be doomed by the kernel with the failure code TM_CAUSE_SYSCALL
+| TM_CAUSE_PERSISTENT.
 
-Syscalls do not by design abort transactions, but beware: The kernel code will
-not be running in transactional state.  The effect of syscalls will always
-remain visible, but depending on the call they may abort your transaction as a
-side-effect, read soon-to-be-aborted transactional data that should not remain
-invisible, etc.  If you constantly retry a transaction that constantly aborts
-itself by calling a syscall, you'll have a livelock & make no progress.
+Syscalls made from within a suspended transaction are performed as normal and
+the transaction is not explicitly doomed by the kernel.  However, what the
+kernel does to perform the syscall may result in the transaction being doomed
+by the hardware.  The syscall is performed in suspended mode so any side
+effects will be persistent, independent of transaction success or failure.  No
+guarantees are provided by the kernel about which syscalls will affect
+transaction success.
 
-Simple syscalls (e.g. sigprocmask()) "could" be OK.  Even things like write()
-from, say, printf() should be OK as long as the kernel does not access any
-memory that was accessed transactionally.
-
-Consider any syscalls that happen to work as debug-only -- not recommended for
-production use.  Best to queue them up till after the transaction is over.
+Care must be taken when relying on syscalls to abort during active transactions
+if the calls are made via a library.  Libraries may cache values (which may
+give the appearance of success) or perform operations that cause transaction
+failure before entering the kernel (which may produce different failure codes).
+Examples are glibc's getpid() and lazy symbol resolution.
 
 
 Signals
@@ -176,8 +177,7 @@ kernel aborted a transaction:
  TM_CAUSE_RESCHED       Thread was rescheduled.
  TM_CAUSE_TLBI          Software TLB invalid.
  TM_CAUSE_FAC_UNAV      FP/VEC/VSX unavailable trap.
- TM_CAUSE_SYSCALL       Currently unused; future syscalls that must abort
-                        transactions for consistency will use this.
+ TM_CAUSE_SYSCALL       Syscall from active transaction.
  TM_CAUSE_SIGNAL        Signal delivered.
  TM_CAUSE_MISC          Currently unused.
  TM_CAUSE_ALIGNMENT     Alignment fault.
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 6367b83..4994648 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -242,11 +242,13 @@ enum {
 
 /* We only set the TM feature if the kernel was compiled with TM supprt */
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-#define CPU_FTR_TM_COMP		CPU_FTR_TM
-#define PPC_FEATURE2_HTM_COMP	PPC_FEATURE2_HTM
+#define CPU_FTR_TM_COMP			CPU_FTR_TM
+#define PPC_FEATURE2_HTM_COMP		PPC_FEATURE2_HTM
+#define PPC_FEATURE2_HTM_NOSC_COMP	PPC_FEATURE2_HTM_NOSC
 #else
-#define CPU_FTR_TM_COMP		0
-#define PPC_FEATURE2_HTM_COMP	0
+#define CPU_FTR_TM_COMP			0
+#define PPC_FEATURE2_HTM_COMP		0
+#define PPC_FEATURE2_HTM_NOSC_COMP	0
 #endif
 
 /* We need to mark all pages as being coherent if we're SMP or we have a
diff --git a/arch/powerpc/include/uapi/asm/cputable.h b/arch/powerpc/include/uapi/asm/cputable.h
index de2c0e4..4368604 100644
--- a/arch/powerpc/include/uapi/asm/cputable.h
+++ b/arch/powerpc/include/uapi/asm/cputable.h
@@ -42,5 +42,6 @@
 #define PPC_FEATURE2_ISEL		0x08000000
 #define PPC_FEATURE2_TAR		0x04000000
 #define PPC_FEATURE2_VEC_CRYPTO		0x02000000
+#define PPC_FEATURE2_HTM_NOSC		0x01000000
 
 #endif /* _UAPI__ASM_POWERPC_CPUTABLE_H */
diff --git a/arch/powerpc/include/uapi/asm/tm.h b/arch/powerpc/include/uapi/asm/tm.h
index 5d836b7..5047659 100644
--- a/arch/powerpc/include/uapi/asm/tm.h
+++ b/arch/powerpc/include/uapi/asm/tm.h
@@ -11,7 +11,7 @@
 #define TM_CAUSE_RESCHED	0xde
 #define TM_CAUSE_TLBI		0xdc
 #define TM_CAUSE_FAC_UNAV	0xda
-#define TM_CAUSE_SYSCALL	0xd8  /* future use */
+#define TM_CAUSE_SYSCALL	0xd8
 #define TM_CAUSE_MISC		0xd6  /* future use */
 #define TM_CAUSE_SIGNAL		0xd4
 #define TM_CAUSE_ALIGNMENT	0xd2
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 60262fd..7d80bfd 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -108,7 +108,9 @@ extern void __restore_cpu_e6500(void);
 				 PPC_FEATURE_TRUE_LE | \
 				 PPC_FEATURE_PSERIES_PERFMON_COMPAT)
 #define COMMON_USER2_POWER8	(PPC_FEATURE2_ARCH_2_07 | \
-				 PPC_FEATURE2_HTM_COMP | PPC_FEATURE2_DSCR | \
+				 PPC_FEATURE2_HTM_COMP | \
+				 PPC_FEATURE2_HTM_NOSC_COMP | \
+				 PPC_FEATURE2_DSCR | \
 				 PPC_FEATURE2_ISEL | PPC_FEATURE2_TAR | \
 				 PPC_FEATURE2_VEC_CRYPTO)
 #define COMMON_USER_PA6T	(COMMON_USER_PPC64 | PPC_FEATURE_PA6T |\
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 278888e..579e0f9 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -34,6 +34,7 @@
 #include <asm/ftrace.h>
 #include <asm/hw_irq.h>
 #include <asm/context_tracking.h>
+#include <asm/tm.h>
 
 /*
  * System calls.
@@ -51,6 +52,12 @@ exception_marker:
 
 	.globl system_call_common
 system_call_common:
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+	extrdi.	r10, r12, 1, (63-MSR_TS_T_LG) /* transaction active? */
+	bne	tabort_syscall
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
+#endif
 	andi.	r10,r12,MSR_PR
 	mr	r10,r1
 	addi	r1,r1,-INT_FRAME_SIZE
@@ -311,6 +318,34 @@ syscall_exit_work:
 	bl	do_syscall_trace_leave
 	b	ret_from_except
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+tabort_syscall:
+	/* Firstly we need to enable TM in the kernel */
+	mfmsr	r10
+	li	r13, 1
+	rldimi	r10, r13, MSR_TM_LG, 63-MSR_TM_LG
+	mtmsrd	r10, 0
+
+	/* tabort, this dooms the transaction, nothing else */
+	li	r13, (TM_CAUSE_SYSCALL|TM_CAUSE_PERSISTENT)
+	TABORT(R13)
+
+	/*
+	 * Return directly to userspace. We have corrupted user register state,
+	 * but userspace will never see that register state. Execution will
+	 * resume after the tbegin of the aborted transaction with the
+	 * checkpointed register state.
+	 */
+	li	r13, MSR_RI
+	andc	r10, r10, r13
+	mtmsrd	r10, 1
+	mtspr	SPRN_SRR0, r11
+	mtspr	SPRN_SRR1, r12
+
+	rfid
+	b	.	/* prevent speculative execution */
+#endif
+
 /* Save non-volatile GPRs, if not already saved. */
 _GLOBAL(save_nvgprs)
 	ld	r11,_TRAP(r1)
diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile
index 6bff955..4bea62a 100644
--- a/tools/testing/selftests/powerpc/tm/Makefile
+++ b/tools/testing/selftests/powerpc/tm/Makefile
@@ -1,11 +1,11 @@
-TEST_PROGS := tm-resched-dscr
+TEST_PROGS := tm-resched-dscr tm-syscall
 
 all: $(TEST_PROGS)
 
 $(TEST_PROGS): ../harness.c
 
 tm-syscall: tm-syscall-asm.S
-tm-syscall: CFLAGS += -mhtm
+tm-syscall: CFLAGS += -mhtm -I../../../../../usr/include
 
 include ../../lib.mk
 
diff --git a/tools/testing/selftests/powerpc/tm/tm-syscall.c b/tools/testing/selftests/powerpc/tm/tm-syscall.c
index 3ed8d4b..1276e23 100644
--- a/tools/testing/selftests/powerpc/tm/tm-syscall.c
+++ b/tools/testing/selftests/powerpc/tm/tm-syscall.c
@@ -82,7 +82,8 @@ int tm_syscall(void)
 	unsigned count = 0;
 	struct timeval end, now;
 
-	SKIP_IF(!((long)get_auxv_entry(AT_HWCAP2) & PPC_FEATURE2_HTM));
+	SKIP_IF(!((long)get_auxv_entry(AT_HWCAP2)
+		  & PPC_FEATURE2_HTM_NOSC));
 	setbuf(stdout, NULL);
 
 	printf("Testing transactional syscalls for %d seconds...\n", TEST_DURATION);
-- 
cgit v0.10.2


From 02505cebc1db06707d31635e9c9b342d58f887b7 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 17 Jun 2015 08:13:40 +0530
Subject: powerpc/mm: PTE_RPN_MAX is not used, remove the same

Remove the unused #define

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h
index c5a755e..b7c8d07 100644
--- a/arch/powerpc/include/asm/pte-common.h
+++ b/arch/powerpc/include/asm/pte-common.h
@@ -85,10 +85,8 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void);
  * 64-bit PTEs
  */
 #if defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT)
-#define PTE_RPN_MAX	(1ULL << (64 - PTE_RPN_SHIFT))
 #define PTE_RPN_MASK	(~((1ULL<<PTE_RPN_SHIFT)-1))
 #else
-#define PTE_RPN_MAX	(1UL << (32 - PTE_RPN_SHIFT))
 #define PTE_RPN_MASK	(~((1UL<<PTE_RPN_SHIFT)-1))
 #endif
 
-- 
cgit v0.10.2


From e148315852855f8f06b8bb4601c8cade34af8df3 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 17 Jun 2015 08:13:41 +0530
Subject: powerpc/mm: Change the swap encoding in pte.

Current swap encoding in pte can't support large pfns
above 4TB. Change the swap encoding such that we put
the swap type in the PTE bits. Also add build checks
to make sure we don't overlap with HPTEFLAGS.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index f951d9c..f890f7c 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -347,11 +347,27 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 	pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
 
 /* Encode and de-code a swap entry */
-#define __swp_type(entry)	(((entry).val >> 1) & 0x3f)
-#define __swp_offset(entry)	((entry).val >> 8)
-#define __swp_entry(type, offset) ((swp_entry_t){((type)<< 1)|((offset)<<8)})
-#define __pte_to_swp_entry(pte)	((swp_entry_t){pte_val(pte) >> PTE_RPN_SHIFT})
-#define __swp_entry_to_pte(x)	((pte_t) { (x).val << PTE_RPN_SHIFT })
+#define MAX_SWAPFILES_CHECK() do { \
+	BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \
+	/*							\
+	 * Don't have overlapping bits with _PAGE_HPTEFLAGS	\
+	 * We filter HPTEFLAGS on set_pte.			\
+	 */							\
+	BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \
+	} while (0)
+/*
+ * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT;
+ */
+#define SWP_TYPE_BITS 5
+#define __swp_type(x)		(((x).val >> _PAGE_BIT_SWAP_TYPE) \
+				& ((1UL << SWP_TYPE_BITS) - 1))
+#define __swp_offset(x)		((x).val >> PTE_RPN_SHIFT)
+#define __swp_entry(type, offset)	((swp_entry_t) { \
+					((type) << _PAGE_BIT_SWAP_TYPE) \
+					| ((offset) << PTE_RPN_SHIFT) })
+
+#define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val((pte)) })
+#define __swp_entry_to_pte(x)		__pte((x).val)
 
 void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
 void pgtable_cache_init(void);
diff --git a/arch/powerpc/include/asm/pte-book3e.h b/arch/powerpc/include/asm/pte-book3e.h
index 91a7049..8d84732 100644
--- a/arch/powerpc/include/asm/pte-book3e.h
+++ b/arch/powerpc/include/asm/pte-book3e.h
@@ -11,6 +11,7 @@
 /* Architected bits */
 #define _PAGE_PRESENT	0x000001 /* software: pte contains a translation */
 #define _PAGE_SW1	0x000002
+#define _PAGE_BIT_SWAP_TYPE	2
 #define _PAGE_BAP_SR	0x000004
 #define _PAGE_BAP_UR	0x000008
 #define _PAGE_BAP_SW	0x000010
diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/pte-hash64.h
index fc852f7..ef612c1 100644
--- a/arch/powerpc/include/asm/pte-hash64.h
+++ b/arch/powerpc/include/asm/pte-hash64.h
@@ -16,6 +16,7 @@
  */
 #define _PAGE_PRESENT		0x0001 /* software: pte contains a translation */
 #define _PAGE_USER		0x0002 /* matches one of the PP bits */
+#define _PAGE_BIT_SWAP_TYPE	2
 #define _PAGE_EXEC		0x0004 /* No execute on POWER4 and newer (we invert) */
 #define _PAGE_GUARDED		0x0008
 /* We can derive Memory coherence from _PAGE_NO_CACHE */
-- 
cgit v0.10.2


From 5c89a87d13d168eacb8110810544a98ce0f4319d Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Thu, 18 Jun 2015 11:41:36 +1000
Subject: powerpc/powernv: Fix wrong IOMMU table in pnv_ioda_setup_bus_dma()

When pnv_pci_ioda_fixup() is called during PHB fixup time, each PE in
the sorted list of PEs (phb::pe_dma_list) is iterated to setup the PE's
DMA32 space by pnv_ioda_setup_bus_dma() if the PE's DMA32 weight is bigger
than zero. The function also assigns all the subordinate PCI devices of
the PE's primary bus with the PE's DMA32 IOMMU table. It causes the PCI
devicess in the child PEs, which don't have DMA weight, receives wrong
IOMMU table and then IOMMU group.

The patch fixes above issue by more check on the PE's coverage and don't
assign IOMMU table to those PCI devices, which belong to the child PEs.
The problem was found on Firestone platform initially.

Suggested-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 8424f5c..5738d31 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1661,7 +1661,7 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
 		set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
 		iommu_add_device(&dev->dev);
 
-		if (dev->subordinate)
+		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
 			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
 	}
 }
-- 
cgit v0.10.2


From e0fcdc201037c2bd356424520d199492e92d0289 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Tue, 16 Jun 2015 16:45:44 +1000
Subject: cxl: Add CXL_KERNEL_API config option

Add CXL_KERNEL_API config option so drivers which depend on this new
functionality won't be enabled until this is visible.

This is useful for merging the cxlflash driver which comes in via the SCSI
tree.  The cxlflash driver can depend on CXL_KERNEL_API, hence it won't be
enabled in the SCSI tree until this new config option is merged via the powerpc
tree.  Hence all trees will be bisectable at all times.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/Kconfig b/drivers/misc/cxl/Kconfig
index a990b39..b6db9eb 100644
--- a/drivers/misc/cxl/Kconfig
+++ b/drivers/misc/cxl/Kconfig
@@ -7,10 +7,15 @@ config CXL_BASE
 	default n
 	select PPC_COPRO_BASE
 
+config CXL_KERNEL_API
+	bool
+	default n
+
 config CXL
 	tristate "Support for IBM Coherent Accelerators (CXL)"
 	depends on PPC_POWERNV && PCI_MSI
 	select CXL_BASE
+	select CXL_KERNEL_API
 	default m
 	help
 	  Select this option to enable driver support for IBM Coherent
-- 
cgit v0.10.2


From f293106917f6d1538c9c7779a632c2ab904a756b Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Thu, 18 Jun 2015 15:15:10 +1000
Subject: cxl: Fix typo in debug print

Fix typo in debug print. p1_base() should be p2_base(). No change other
than to the debug output.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 416905e..c68ef58 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -205,7 +205,7 @@ static void dump_cxl_config_space(struct pci_dev *dev)
 	dev_info(&dev->dev, "p1 regs: %#llx, len: %#llx\n",
 		p1_base(dev), p1_size(dev));
 	dev_info(&dev->dev, "p2 regs: %#llx, len: %#llx\n",
-		p1_base(dev), p2_size(dev));
+		p2_base(dev), p2_size(dev));
 	dev_info(&dev->dev, "BAR 4/5: %#llx, len: %#llx\n",
 		pci_resource_start(dev, 4), pci_resource_len(dev, 4));
 
-- 
cgit v0.10.2