From b7182d1a788461c36bad5a13b78d0779a6376f5c Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Tue, 11 Aug 2015 12:08:32 +0300 Subject: MAINTAINERS: add git tree for the arc architecture Signed-off-by: Baruch Siach Signed-off-by: Vineet Gupta diff --git a/MAINTAINERS b/MAINTAINERS index a9ae6c1..d7ab736 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9876,6 +9876,7 @@ S: Supported F: arch/arc/ F: Documentation/devicetree/bindings/arc/ F: drivers/tty/serial/arc_uart.c +T: git git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git SYNOPSYS ARC SDP platform support M: Alexey Brodkin -- cgit v0.10.2 From 2a4401687c11def29fe5a9b23ab98bf7ab1dce61 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Sat, 8 Aug 2015 17:51:58 +0530 Subject: ARC: Enable optimistic spinning for LLSC config Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Vineet Gupta diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index bd4670d..e119d42 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -8,6 +8,7 @@ config ARC def_bool y + select ARCH_SUPPORTS_ATOMIC_RMW if ARC_HAS_LLSC select BUILDTIME_EXTABLE_SORT select COMMON_CLK select CLONE_BACKWARDS -- cgit v0.10.2 From f2b0b25a37a6db12580dcdfdf00f020e5e0e3a43 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Mon, 25 May 2015 19:54:28 +0300 Subject: ARCv2: Support IO Coherency and permutations involving L1 and L2 caches An ARCv2 CPU may come in any of the following configurations, each of which affects cache handling for data exchanged with peripherals via DMA: [1] Only L1 cache exists [2] Both L1 and L2 exist, but no IO coherency unit [3] L1, L2 caches and IO coherency unit exist The current implementation takes care of [1] and [2]; moreover, support for [2] relies on a run-time check for SLC existence, which is not optimal. This patch introduces support for [3] and reworks DMA ops usage: instead of doing a run-time check every time a particular DMA op is executed, we now have 3 different implementations of the DMA ops and select the appropriate one during init.
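In sketch form, the init-time selection boils down to binding a set of per-configuration function pointers once at boot (a minimal sketch mirroring the patch below, not the literal kernel code; only the wback op is shown):

---------->8-----------
/* one implementation per cache configuration, bound once in arc_cache_init() */
void (*__dma_cache_wback)(unsigned long start, unsigned long sz);

static void __dma_cache_wback_l1(unsigned long start, unsigned long sz)
{
	/* [1] flush L1 data cache lines only */
}

static void __dma_cache_wback_slc(unsigned long start, unsigned long sz)
{
	/* [2] flush L1 lines, then the corresponding SLC (L2) lines */
}

static void __dma_cache_wback_ioc(unsigned long start, unsigned long sz)
{
	/* [3] empty: IOC hardware snoops DMA and keeps caches coherent */
}

void arc_cache_init(void)
{
	if (ioc_exists)			/* configuration [3] */
		__dma_cache_wback = __dma_cache_wback_ioc;
	else if (l2_line_sz)		/* configuration [2] */
		__dma_cache_wback = __dma_cache_wback_slc;
	else				/* configuration [1] */
		__dma_cache_wback = __dma_cache_wback_l1;
}
---------->8-----------

The exported dma_cache_wback() then reduces to a single indirect call instead of re-checking the cache topology on every DMA operation.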
For IOC support we need to: [a] Implement empty DMA ops, because IOC takes care of cache coherency with DMAed data [b] Route dma_alloc_coherent() via dma_alloc_noncoherent() This is required to make IOC work in the first place and also serves as an optimization, as LD/ST to coherent buffers can be serviced from caches w/o going all the way to memory Signed-off-by: Alexey Brodkin [vgupta: -Added some comments about IOC gains -Marked dma ops as static, -Massaged changelog a bit] Signed-off-by: Vineet Gupta diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index c8f57b8..d8023bc 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -35,6 +35,7 @@ #define ARC_REG_RTT_BCR 0xF2 #define ARC_REG_IRQ_BCR 0xF3 #define ARC_REG_SMART_BCR 0xFF +#define ARC_REG_CLUSTER_BCR 0xcf /* status32 Bits Positions */ #define STATUS_AE_BIT 5 /* Exception active */ diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h index d67345d..e23ea6e 100644 --- a/arch/arc/include/asm/cache.h +++ b/arch/arc/include/asm/cache.h @@ -53,6 +53,8 @@ extern void arc_cache_init(void); extern char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len); extern void read_decode_cache_bcr(void); +extern int ioc_exists; + #endif /* !__ASSEMBLY__ */ /* Instruction cache related Auxiliary registers */ @@ -94,4 +96,10 @@ extern void read_decode_cache_bcr(void); #define SLC_CTRL_BUSY 0x100 #define SLC_CTRL_RGN_OP_INV 0x200 +/* IO coherency related Auxiliary registers */ +#define ARC_REG_IO_COH_ENABLE 0x500 +#define ARC_REG_IO_COH_PARTIAL 0x501 +#define ARC_REG_IO_COH_AP0_BASE 0x508 +#define ARC_REG_IO_COH_AP0_SIZE 0x509 + #endif /* _ASM_CACHE_H */ diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 1cd6695..25e7077 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -22,10 +22,15 @@ #include static int l2_line_sz; +int ioc_exists; void (*_cache_line_loop_ic_fn)(unsigned long paddr, unsigned long vaddr, unsigned long sz, const int cacheop); +void (*__dma_cache_wback_inv)(unsigned long start, unsigned long sz); +void (*__dma_cache_inv)(unsigned long start, unsigned long sz); +void (*__dma_cache_wback)(unsigned long start, unsigned long sz); + char *arc_cache_mumbojumbo(int c, char *buf, int len) { int n = 0; @@ -50,6 +55,9 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len) n += scnprintf(buf + n, len - n, "SLC\t\t: %uK, %uB Line\n", p->sz_k, p->line_len); + if (ioc_exists) + n += scnprintf(buf + n, len - n, "IOC\t\t: exists\n"); + return buf; } @@ -80,6 +88,14 @@ void read_decode_cache_bcr(void) #endif } slc_cfg; + struct bcr_clust_cfg { +#ifdef CONFIG_CPU_BIG_ENDIAN + unsigned int pad:7, c:1, num_entries:8, num_cores:8, ver:8; +#else + unsigned int ver:8, num_cores:8, num_entries:8, c:1, pad:7; +#endif + } cbcr; + p_ic = &cpuinfo_arc700[cpu].icache; READ_BCR(ARC_REG_IC_BCR, ibcr); @@ -133,6 +149,10 @@ slc_chk: p_slc->sz_k = 128 << slc_cfg.sz; l2_line_sz = p_slc->line_len = (slc_cfg.lsz == 0) ?
128 : 64; } + + READ_BCR(ARC_REG_CLUSTER_BCR, cbcr); + if (cbcr.c) + ioc_exists = 1; } /* @@ -516,11 +536,6 @@ noinline void slc_op(unsigned long paddr, unsigned long sz, const int op) #endif } -static inline int need_slc_flush(void) -{ - return is_isa_arcv2() && l2_line_sz; -} - /*********************************************************** * Exported APIs */ @@ -569,30 +584,74 @@ void flush_dcache_page(struct page *page) } EXPORT_SYMBOL(flush_dcache_page); -void dma_cache_wback_inv(unsigned long start, unsigned long sz) +/* + * DMA ops for systems with L1 cache only + * Make memory coherent with L1 cache by flushing/invalidating L1 lines + */ +static void __dma_cache_wback_inv_l1(unsigned long start, unsigned long sz) { __dc_line_op_k(start, sz, OP_FLUSH_N_INV); +} - if (need_slc_flush()) - slc_op(start, sz, OP_FLUSH_N_INV); +static void __dma_cache_inv_l1(unsigned long start, unsigned long sz) +{ + __dc_line_op_k(start, sz, OP_INV); } -EXPORT_SYMBOL(dma_cache_wback_inv); -void dma_cache_inv(unsigned long start, unsigned long sz) +static void __dma_cache_wback_l1(unsigned long start, unsigned long sz) +{ + __dc_line_op_k(start, sz, OP_FLUSH); +} + +/* + * DMA ops for systems with both L1 and L2 caches, but without IOC + * Both L1 and L2 lines need to be explicitly flushed/invalidated + */ +static void __dma_cache_wback_inv_slc(unsigned long start, unsigned long sz) +{ + __dc_line_op_k(start, sz, OP_FLUSH_N_INV); + slc_op(start, sz, OP_FLUSH_N_INV); +} + +static void __dma_cache_inv_slc(unsigned long start, unsigned long sz) { __dc_line_op_k(start, sz, OP_INV); + slc_op(start, sz, OP_INV); +} - if (need_slc_flush()) - slc_op(start, sz, OP_INV); +static void __dma_cache_wback_slc(unsigned long start, unsigned long sz) +{ + __dc_line_op_k(start, sz, OP_FLUSH); + slc_op(start, sz, OP_FLUSH); +} + +/* + * DMA ops for systems with IOC + * IOC hardware snoops all DMA traffic keeping the caches consistent with + * memory - eliding need for any explicit cache maintenance of DMA buffers + */ +static void __dma_cache_wback_inv_ioc(unsigned long start, unsigned long sz) {} +static void __dma_cache_inv_ioc(unsigned long start, unsigned long sz) {} +static void __dma_cache_wback_ioc(unsigned long start, unsigned long sz) {} + +/* + * Exported DMA API + */ +void dma_cache_wback_inv(unsigned long start, unsigned long sz) +{ + __dma_cache_wback_inv(start, sz); +} +EXPORT_SYMBOL(dma_cache_wback_inv); + +void dma_cache_inv(unsigned long start, unsigned long sz) +{ + __dma_cache_inv(start, sz); } EXPORT_SYMBOL(dma_cache_inv); void dma_cache_wback(unsigned long start, unsigned long sz) { - __dc_line_op_k(start, sz, OP_FLUSH); - - if (need_slc_flush()) - slc_op(start, sz, OP_FLUSH); + __dma_cache_wback(start, sz); } EXPORT_SYMBOL(dma_cache_wback); @@ -848,4 +907,27 @@ void arc_cache_init(void) panic("Disable CONFIG_ARC_CACHE_VIPT_ALIASING\n"); } } + + if (is_isa_arcv2() && ioc_exists) { + /* IO coherency base - 0x8z */ + write_aux_reg(ARC_REG_IO_COH_AP0_BASE, 0x80000); + /* IO coherency aperture size - 512Mb: 0x8z-0xAz */ + write_aux_reg(ARC_REG_IO_COH_AP0_SIZE, 0x11); + /* Enable partial writes */ + write_aux_reg(ARC_REG_IO_COH_PARTIAL, 1); + /* Enable IO coherency */ + write_aux_reg(ARC_REG_IO_COH_ENABLE, 1); + + __dma_cache_wback_inv = __dma_cache_wback_inv_ioc; + __dma_cache_inv = __dma_cache_inv_ioc; + __dma_cache_wback = __dma_cache_wback_ioc; + } else if (is_isa_arcv2() && l2_line_sz) { + __dma_cache_wback_inv = __dma_cache_wback_inv_slc; + __dma_cache_inv = __dma_cache_inv_slc; + __dma_cache_wback =
__dma_cache_wback_slc; + } else { + __dma_cache_wback_inv = __dma_cache_wback_inv_l1; + __dma_cache_inv = __dma_cache_inv_l1; + __dma_cache_wback = __dma_cache_wback_l1; + } } diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c index 57706a9..e039fac 100644 --- a/arch/arc/mm/dma.c +++ b/arch/arc/mm/dma.c @@ -19,6 +19,7 @@ #include #include #include +#include #include /* @@ -53,6 +54,20 @@ void *dma_alloc_coherent(struct device *dev, size_t size, { void *paddr, *kvaddr; + /* + * IOC relies on all data (even coherent DMA data) being in cache + * Thus allocate normal cached memory + * + * The gains with IOC are two pronged: + * -For streaming data, elides needs for cache maintenance, saving + * cycles in flush code, and bus bandwidth as all the lines of a + * buffer need to be flushed out to memory + * -For coherent data, Read/Write to buffers terminate early in cache + * (vs. always going to memory - thus are faster) + */ + if (ioc_exists) + return dma_alloc_noncoherent(dev, size, dma_handle, gfp); + /* This is linear addr (0x8000_0000 based) */ paddr = alloc_pages_exact(size, gfp); if (!paddr) @@ -85,6 +100,9 @@ EXPORT_SYMBOL(dma_alloc_coherent); void dma_free_coherent(struct device *dev, size_t size, void *kvaddr, dma_addr_t dma_handle) { + if (ioc_exists) + return dma_free_noncoherent(dev, size, kvaddr, dma_handle); + iounmap((void __force __iomem *)kvaddr); free_pages_exact((void *)dma_handle, size); -- cgit v0.10.2 From 79335a2ca03fdd883823e068b5e2f89a8ee47839 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 4 Jun 2015 18:30:23 +0530 Subject: ARCv2: SLC: Allow boot time disable Signed-off-by: Vineet Gupta diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 25e7077..7c424e3 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -23,6 +23,7 @@ static int l2_line_sz; int ioc_exists; +volatile int slc_enable = 1; void (*_cache_line_loop_ic_fn)(unsigned long paddr, unsigned long vaddr, unsigned long sz, const int cacheop); @@ -36,6 +37,7 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len) int n = 0; struct cpuinfo_arc_cache *p; +#define IS_USED_RUN(v) ((v) ? 
"" : "(disabled) ") #define PR_CACHE(p, cfg, str) \ if (!(p)->ver) \ n += scnprintf(buf + n, len - n, str"\t\t: N/A\n"); \ @@ -53,7 +55,8 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len) p = &cpuinfo_arc700[c].slc; if (p->ver) n += scnprintf(buf + n, len - n, - "SLC\t\t: %uK, %uB Line\n", p->sz_k, p->line_len); + "SLC\t\t: %uK, %uB Line%s\n", + p->sz_k, p->line_len, IS_USED_RUN(slc_enable)); if (ioc_exists) n += scnprintf(buf + n, len - n, "IOC\t\t: exists\n"); @@ -908,6 +911,20 @@ void arc_cache_init(void) } } + if (is_isa_arcv2() && l2_line_sz && !slc_enable) { + + /* IM set : flush before invalidate */ + write_aux_reg(ARC_REG_SLC_CTRL, + read_aux_reg(ARC_REG_SLC_CTRL) | SLC_CTRL_IM); + + write_aux_reg(ARC_REG_SLC_INVALIDATE, 1); + + /* Important to wait for flush to complete */ + while (read_aux_reg(ARC_REG_SLC_CTRL) & SLC_CTRL_BUSY); + write_aux_reg(ARC_REG_SLC_CTRL, + read_aux_reg(ARC_REG_SLC_CTRL) | SLC_CTRL_DISABLE); + } + if (is_isa_arcv2() && ioc_exists) { /* IO coherency base - 0x8z */ write_aux_reg(ARC_REG_IO_COH_AP0_BASE, 0x80000); @@ -921,7 +938,7 @@ void arc_cache_init(void) __dma_cache_wback_inv = __dma_cache_wback_inv_ioc; __dma_cache_inv = __dma_cache_inv_ioc; __dma_cache_wback = __dma_cache_wback_ioc; - } else if (is_isa_arcv2() && l2_line_sz) { + } else if (is_isa_arcv2() && l2_line_sz && slc_enable) { __dma_cache_wback_inv = __dma_cache_wback_inv_slc; __dma_cache_inv = __dma_cache_inv_slc; __dma_cache_wback = __dma_cache_wback_slc; -- cgit v0.10.2 From 1648c70d301e669ba03aa1c70fff46ec2c400414 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Tue, 9 Jun 2015 11:25:22 +0300 Subject: ARCv2: IOC: Allow boot time disable Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 7c424e3..5c825c8 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -23,7 +23,7 @@ static int l2_line_sz; int ioc_exists; -volatile int slc_enable = 1; +volatile int slc_enable = 1, ioc_enable = 1; void (*_cache_line_loop_ic_fn)(unsigned long paddr, unsigned long vaddr, unsigned long sz, const int cacheop); @@ -59,7 +59,8 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len) p->sz_k, p->line_len, IS_USED_RUN(slc_enable)); if (ioc_exists) - n += scnprintf(buf + n, len - n, "IOC\t\t: exists\n"); + n += scnprintf(buf + n, len - n, "IOC\t\t:%s\n", + IS_USED_RUN(ioc_enable)); return buf; } @@ -154,7 +155,7 @@ slc_chk: } READ_BCR(ARC_REG_CLUSTER_BCR, cbcr); - if (cbcr.c) + if (cbcr.c && ioc_enable) ioc_exists = 1; } -- cgit v0.10.2 From 31d30c8208a38a0442cc01a9c7f6542489c76353 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 5 Aug 2015 19:10:02 +0530 Subject: ARC: add barriers to futex code The atomic ops on futex need to provide the full barrier just like regular atomics in kernel. 
Also remove pagefault_enable/disable in futex_atomic_cmpxchg_inatomic() as core code already does that Cc: David Hildenbrand Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Cc: Michel Lespinasse Signed-off-by: Vineet Gupta diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h index 70cfe16..9de18a5 100644 --- a/arch/arc/include/asm/futex.h +++ b/arch/arc/include/asm/futex.h @@ -20,6 +20,7 @@ #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg)\ \ + smp_mb(); \ __asm__ __volatile__( \ "1: llock %1, [%2] \n" \ insn "\n" \ @@ -40,12 +41,14 @@ \ : "=&r" (ret), "=&r" (oldval) \ : "r" (uaddr), "r" (oparg), "ir" (-EFAULT) \ - : "cc", "memory") + : "cc", "memory"); \ + smp_mb() \ #else /* !CONFIG_ARC_HAS_LLSC */ #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg)\ \ + smp_mb(); \ __asm__ __volatile__( \ "1: ld %1, [%2] \n" \ insn "\n" \ @@ -65,7 +68,8 @@ \ : "=&r" (ret), "=&r" (oldval) \ : "r" (uaddr), "r" (oparg), "ir" (-EFAULT) \ - : "cc", "memory") + : "cc", "memory"); \ + smp_mb() \ #endif @@ -134,13 +138,8 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) return ret; } -/* Compare-xchg with pagefaults disabled. - * Notes: - * -Best-Effort: Exchg happens only if compare succeeds. - * If compare fails, returns; leaving retry/looping to upper layers - * -successful cmp-xchg: return orig value in @addr (same as cmp val) - * -Compare fails: return orig value in @addr - * -user access r/w fails: return -EFAULT +/* + * cmpxchg of futex (pagefaults disabled by caller) */ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, @@ -151,7 +150,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) return -EFAULT; - pagefault_disable(); + smp_mb(); __asm__ __volatile__( #ifdef CONFIG_ARC_HAS_LLSC @@ -178,7 +177,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, : "r"(oldval), "r"(newval), "r"(uaddr), "ir"(-EFAULT) : "cc", "memory"); - pagefault_enable(); + smp_mb(); *uval = val; return val; -- cgit v0.10.2 From ed574e2bbd81ec20134059fb5e17acbc76387270 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 5 Aug 2015 19:23:34 +0530 Subject: ARC: futex cosmetics Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Cc: Michel Lespinasse Signed-off-by: Vineet Gupta diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h index 9de18a5..14b1c9a 100644 --- a/arch/arc/include/asm/futex.h +++ b/arch/arc/include/asm/futex.h @@ -94,6 +94,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) __futex_atomic_op("mov %0, %3", ret, oldval, uaddr, oparg); break; case FUTEX_OP_ADD: + /* oldval = *uaddr; *uaddr += oparg ; ret = *uaddr */ __futex_atomic_op("add %0, %1, %3", ret, oldval, uaddr, oparg); break; case FUTEX_OP_OR: @@ -142,12 +143,12 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) * cmpxchg of futex (pagefaults disabled by caller) */ static inline int -futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, - u32 newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 expval, + u32 newval) { - u32 val; + u32 existval; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; smp_mb(); @@ -173,14 +174,14 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, " .word 1b, 4b \n" " .word 2b, 4b \n" " .previous\n" - : "=&r"(val) - : "r"(oldval), 
"r"(newval), "r"(uaddr), "ir"(-EFAULT) + : "=&r"(existval) + : "r"(expval), "r"(newval), "r"(uaddr), "ir"(-EFAULT) : "cc", "memory"); smp_mb(); - *uval = val; - return val; + *uval = existval; + return existval; } #endif -- cgit v0.10.2 From 882a95ae0a4f8fc303257acf5c6ff305df34d04b Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 6 Aug 2015 17:03:17 +0530 Subject: ARC: make futex_atomic_cmpxchg_inatomic() return bimodal Callers of cmpxchg_futex_value_locked() in futex code expect bimodal return value: !0 (essentially -EFAULT as failure) 0 (success) Before this patch, the success return value was old value of futex, which could very well be non zero, causing caller to possibly take the failure path erroneously. Fix that by returning 0 for success (This fix was done back in 2011 for all upstream arches, which ARC obviously missed) Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Cc: Michel Lespinasse Signed-off-by: Vineet Gupta diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h index 14b1c9a..0ea8bcc 100644 --- a/arch/arc/include/asm/futex.h +++ b/arch/arc/include/asm/futex.h @@ -141,11 +141,13 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) /* * cmpxchg of futex (pagefaults disabled by caller) + * Return 0 for success, -EFAULT otherwise */ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 expval, u32 newval) { + int ret = 0; u32 existval; if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) @@ -155,18 +157,18 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 expval, __asm__ __volatile__( #ifdef CONFIG_ARC_HAS_LLSC - "1: llock %0, [%3] \n" - " brne %0, %1, 3f \n" - "2: scond %2, [%3] \n" + "1: llock %1, [%4] \n" + " brne %1, %2, 3f \n" + "2: scond %3, [%4] \n" " bnz 1b \n" #else - "1: ld %0, [%3] \n" - " brne %0, %1, 3f \n" - "2: st %2, [%3] \n" + "1: ld %1, [%4] \n" + " brne %1, %2, 3f \n" + "2: st %3, [%4] \n" #endif "3: \n" " .section .fixup,\"ax\" \n" - "4: mov %0, %4 \n" + "4: mov %0, %5 \n" " b 3b \n" " .previous \n" " .section __ex_table,\"a\" \n" @@ -174,14 +176,14 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 expval, " .word 1b, 4b \n" " .word 2b, 4b \n" " .previous\n" - : "=&r"(existval) + : "+&r"(ret), "=&r"(existval) : "r"(expval), "r"(newval), "r"(uaddr), "ir"(-EFAULT) : "cc", "memory"); smp_mb(); *uval = existval; - return existval; + return ret; } #endif -- cgit v0.10.2 From 5e0574292ad48dcdf48ef90a47da862c21d649a6 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 6 Aug 2015 17:55:34 +0530 Subject: ARC: Enable HAVE_FUTEX_CMPXCHG ARC doesn't need the runtime detection of futex cmpxchg op Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Signed-off-by: Vineet Gupta diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index e119d42..78c0621 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -23,6 +23,7 @@ config ARC select GENERIC_SMP_IDLE_THREAD select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK + select HAVE_FUTEX_CMPXCHG select HAVE_IOREMAP_PROT select HAVE_KPROBES select HAVE_KRETPROBES -- cgit v0.10.2 From eb2cd8b72b08fe56998600aee8a5dff93f7be5a2 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 6 Aug 2015 19:11:06 +0530 Subject: ARC: ensure futex ops are atomic in !LLSC config W/o hardware assisted atomic r-m-w the best we can do is to disable preemption. 
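Combined with the barriers added earlier in the series, the !LLSC futex paths take the following shape (a minimal sketch of the pattern; in the real code the smp_mb() pairs live inside the __futex_atomic_op macro and the asm carries the -EFAULT fixup tables):

---------->8-----------
#ifndef CONFIG_ARC_HAS_LLSC
	preempt_disable();	/* no LLOCK/SCOND: keep the r-m-w on this CPU */
#endif
	pagefault_disable();

	smp_mb();		/* full barrier before the op, as for atomics */
	/* plain ld ... <op> ... st sequence on *uaddr */
	smp_mb();		/* full barrier after the op */

	pagefault_enable();
#ifndef CONFIG_ARC_HAS_LLSC
	preempt_enable();
#endif
---------->8-----------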
Cc: David Hildenbrand Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Cc: Michel Lespinasse Signed-off-by: Vineet Gupta diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h index 0ea8bcc..8f44998 100644 --- a/arch/arc/include/asm/futex.h +++ b/arch/arc/include/asm/futex.h @@ -87,6 +87,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) return -EFAULT; +#ifndef CONFIG_ARC_HAS_LLSC + preempt_disable(); /* to guarantee atomic r-m-w of futex op */ +#endif pagefault_disable(); switch (op) { @@ -111,6 +114,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) } pagefault_enable(); +#ifndef CONFIG_ARC_HAS_LLSC + preempt_enable(); +#endif if (!ret) { switch (cmp) { @@ -153,6 +159,9 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 expval, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; +#ifndef CONFIG_ARC_HAS_LLSC + preempt_disable(); /* to guarantee atomic r-m-w of futex op */ +#endif smp_mb(); __asm__ __volatile__( @@ -182,6 +191,9 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 expval, smp_mb(); +#ifndef CONFIG_ARC_HAS_LLSC + preempt_enable(); +#endif *uval = existval; return ret; } -- cgit v0.10.2 From 6de6066c0d24a66df465cf87a4041ef7ef35ba6f Mon Sep 17 00:00:00 2001 From: Yuriy Kolerov Date: Wed, 12 Aug 2015 17:23:32 +0300 Subject: ARC: change some branches to jumps to resolve linkage errors When the kernel binary becomes large enough (32M and more), errors may occur during the final linkage stage. It happens because the build system uses short relocations for ARC by default. This problem may be easily resolved by passing the -mlong-calls option to GCC to use long absolute jumps (j) instead of short relative branches (b). But fragments of pure assembler code exist which use branches in inappropriate places and cause a linkage error due to relocation overflow. The first of these fragments is the .fixup insertion in futex.h and unaligned.c: it places code in a separate section (.fixup) with a branch instruction, which leads to a linkage error when the kernel becomes large. The second is the calling of scheduler functions (common kernel code) from ARC's entry.S. When the kernel binary becomes large, this may lead to a linkage error because the scheduler may end up far enough away from ARC's code in the final binary.
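The fix itself is mechanical: the PC-relative forms are swapped for their absolute counterparts, which carry a full-width target and thus cannot overflow a relocation (a sketch of the pattern the diff below applies):

---------->8-----------
	; before: PC-relative displacement, may not reach in a 32M+ kernel
	b	3b		; branch back from a .fixup fragment
	bl	@schedule_tail	; relative branch-and-link into common code

	; after: absolute target via jump / jump-and-link
	j	3b
	jl	@schedule_tail
---------->8-----------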
Signed-off-by: Yuriy Kolerov Signed-off-by: Vineet Gupta diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h index 8f44998..11e1b1f 100644 --- a/arch/arc/include/asm/futex.h +++ b/arch/arc/include/asm/futex.h @@ -31,7 +31,7 @@ " .section .fixup,\"ax\" \n" \ " .align 4 \n" \ "4: mov %0, %4 \n" \ - " b 3b \n" \ + " j 3b \n" \ " .previous \n" \ " .section __ex_table,\"a\" \n" \ " .align 4 \n" \ @@ -58,7 +58,7 @@ " .section .fixup,\"ax\" \n" \ " .align 4 \n" \ "4: mov %0, %4 \n" \ - " b 3b \n" \ + " j 3b \n" \ " .previous \n" \ " .section __ex_table,\"a\" \n" \ " .align 4 \n" \ @@ -178,7 +178,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 expval, "3: \n" " .section .fixup,\"ax\" \n" "4: mov %0, %5 \n" - " b 3b \n" + " j 3b \n" " .previous \n" " .section __ex_table,\"a\" \n" " .align 4 \n" diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S index f7a82fd..589abf5 100644 --- a/arch/arc/kernel/entry.S +++ b/arch/arc/kernel/entry.S @@ -42,7 +42,7 @@ ENTRY(ret_from_fork) ; when the forked child comes here from the __switch_to function ; r0 has the last task pointer. ; put last task in scheduler queue - bl @schedule_tail + jl @schedule_tail ld r9, [sp, PT_status32] brne r9, 0, 1f @@ -320,7 +320,7 @@ resume_user_mode_begin: ; --- (Slow Path #1) task preemption --- bbit0 r9, TIF_NEED_RESCHED, .Lchk_pend_signals mov blink, resume_user_mode_begin ; tail-call to U mode ret chks - b @schedule ; BTST+Bnz causes relo error in link + j @schedule ; BTST+Bnz causes relo error in link .Lchk_pend_signals: IRQ_ENABLE r10 @@ -381,7 +381,7 @@ resume_kernel_mode: bbit0 r9, TIF_NEED_RESCHED, .Lrestore_regs ; Invoke PREEMPTION - bl preempt_schedule_irq + jl preempt_schedule_irq ; preempt_schedule_irq() always returns with IRQ disabled #endif diff --git a/arch/arc/kernel/unaligned.c b/arch/arc/kernel/unaligned.c index 74db59b..abd961f 100644 --- a/arch/arc/kernel/unaligned.c +++ b/arch/arc/kernel/unaligned.c @@ -34,7 +34,7 @@ " .section .fixup,\"ax\"\n" \ " .align 4\n" \ "3: mov %0, 1\n" \ - " b 2b\n" \ + " j 2b\n" \ " .previous\n" \ " .section __ex_table,\"a\"\n" \ " .align 4\n" \ @@ -82,7 +82,7 @@ " .section .fixup,\"ax\"\n" \ " .align 4\n" \ "4: mov %0, 1\n" \ - " b 3b\n" \ + " j 3b\n" \ " .previous\n" \ " .section __ex_table,\"a\"\n" \ " .align 4\n" \ @@ -113,7 +113,7 @@ " .section .fixup,\"ax\"\n" \ " .align 4\n" \ "6: mov %0, 1\n" \ - " b 5b\n" \ + " j 5b\n" \ " .previous\n" \ " .section __ex_table,\"a\"\n" \ " .align 4\n" \ -- cgit v0.10.2 From 090749502ff20d7d9ec244036fe636b6bf0433b6 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 19 Aug 2015 17:23:58 +0530 Subject: ARC: add/fix some comments in code - no functional change Signed-off-by: Vineet Gupta diff --git a/arch/arc/boot/dts/axc003.dtsi b/arch/arc/boot/dts/axc003.dtsi index 1cd5e82..846481f 100644 --- a/arch/arc/boot/dts/axc003.dtsi +++ b/arch/arc/boot/dts/axc003.dtsi @@ -72,12 +72,13 @@ }; /* - * This INTC is actually connected to DW APB GPIO - * which acts as a wire between MB INTC and CPU INTC. - * GPIO INTC is configured in platform init code - * and here we mimic direct connection from MB INTC to - * CPU INTC, thus we set "interrupts = <7>" instead of - * "interrupts = <12>" + * The DW APB ICTL intc on MB is connected to CPU intc via a + * DT "invisible" DW APB GPIO block, configured to simply pass thru + * interrupts - set up accordingly in platform init (plat-axs10x/ax10x.c) + * + * So here we mimic a direct connection between them, ignoring the + * APB GPIO.
Thus set "interrupts = <24>" (DW APB GPIO to core) + * instead of "interrupts = <12>" (DW APB ICTL to DW APB GPIO) * * This intc actually resides on MB, but we move it here to * avoid duplicating the MB dtsi file given that IRQ from diff --git a/arch/arc/include/asm/cmpxchg.h b/arch/arc/include/asm/cmpxchg.h index 44fd531..af7a2db 100644 --- a/arch/arc/include/asm/cmpxchg.h +++ b/arch/arc/include/asm/cmpxchg.h @@ -110,18 +110,18 @@ static inline unsigned long __xchg(unsigned long val, volatile void *ptr, sizeof(*(ptr)))) /* - * On ARC700, EX insn is inherently atomic, so by default "vanilla" xchg() need - * not require any locking. However there's a quirk. - * ARC lacks native CMPXCHG, thus emulated (see above), using external locking - - * incidently it "reuses" the same atomic_ops_lock used by atomic APIs. - * Now, llist code uses cmpxchg() and xchg() on same data, so xchg() needs to - * abide by same serializing rules, thus ends up using atomic_ops_lock as well. + * xchg() maps directly to ARC EX instruction which guarantees atomicity. + * However in !LLSC config, it also needs to use the @atomic_ops_lock spinlock + * due to a subtle reason: + * - For !LLSC, cmpxchg() needs to use that lock (see above) and there is a lot + * of kernel code which calls xchg()/cmpxchg() on same data (see llist.h) + * Hence xchg() needs to follow same locking rules. * - * This however is only relevant if SMP and/or ARC lacks LLSC - * if (UP or LLSC) - * xchg doesn't need serialization - * else <==> !(UP or LLSC) <==> (!UP and !LLSC) <==> (SMP and !LLSC) - * xchg needs serialization + * Technically the lock is also needed for UP (boils down to irq save/restore) + * but we can cheat a bit since cmpxchg() atomic_ops_lock() would cause irqs to + * be disabled thus can't possibly be interrupted/preempted/clobbered by xchg() + * Other way around, xchg is one instruction anyway, so can't be interrupted + * as such */ #if !defined(CONFIG_ARC_HAS_LLSC) && defined(CONFIG_SMP) diff --git a/arch/arc/include/asm/perf_event.h b/arch/arc/include/asm/perf_event.h index 2b8880e..e2eaf6f 100644 --- a/arch/arc/include/asm/perf_event.h +++ b/arch/arc/include/asm/perf_event.h @@ -95,7 +95,7 @@ static const char * const arc_pmu_ev_hw_map[] = { /* counts condition */ [PERF_COUNT_HW_INSTRUCTIONS] = "iall", - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmp", + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmp", /* Excludes ZOL jumps */ [PERF_COUNT_ARC_BPOK] = "bpok", /* NP-NT, PT-T, PNT-NT */ [PERF_COUNT_HW_BRANCH_MISSES] = "bpfail", /* NP-T, PT-NT, PNT-T */ diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index 1287388..79ab199 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -199,8 +199,8 @@ static void arc_pmu_start(struct perf_event *event, int flags) event->hw.state = 0; /* enable ARC pmu here */ - write_aux_reg(ARC_REG_PCT_INDEX, idx); - write_aux_reg(ARC_REG_PCT_CONFIG, hwc->config); + write_aux_reg(ARC_REG_PCT_INDEX, idx); /* counter # */ + write_aux_reg(ARC_REG_PCT_CONFIG, hwc->config); /* condition */ } static void arc_pmu_stop(struct perf_event *event, int flags) diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index 4409245..91d5a0f 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -65,7 +65,7 @@ asmlinkage void ret_from_fork(void); * ------------------ * | r25 | <==== top of Stack (thread.ksp) * ~ ~ - * | --to-- | (CALLEE Regs of user mode) + * | --to-- | (CALLEE Regs of kernel mode) * | r13 | ------------------ * | fp | diff --git
a/arch/arc/plat-axs10x/axs10x.c b/arch/arc/plat-axs10x/axs10x.c index e7769c3..ad9825d 100644 --- a/arch/arc/plat-axs10x/axs10x.c +++ b/arch/arc/plat-axs10x/axs10x.c @@ -46,7 +46,7 @@ static void __init axs10x_enable_gpio_intc_wire(void) * ------------------- ------------------- * | snps,dw-apb-gpio | | snps,dw-apb-gpio | * ------------------- ------------------- - * | | + * | #12 | * | [ Debug UART on cpu card ] * | * ------------------------ -- cgit v0.10.2 From fd0881a24ac9ab2be6c052d30ca779597c0bd3bc Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Fri, 21 Aug 2015 15:06:43 +0530 Subject: ARC: Eliminate some ARCv2 specific code for ARCompact build Signed-off-by: Vineet Gupta diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 5c825c8..0d1a6e9 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -52,6 +52,9 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len) PR_CACHE(&cpuinfo_arc700[c].icache, CONFIG_ARC_HAS_ICACHE, "I-Cache"); PR_CACHE(&cpuinfo_arc700[c].dcache, CONFIG_ARC_HAS_DCACHE, "D-Cache"); + if (!is_isa_arcv2()) + return buf; + p = &cpuinfo_arc700[c].slc; if (p->ver) n += scnprintf(buf + n, len - n, @@ -70,18 +73,9 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len) * the cpuinfo structure for later use. * No Validation done here, simply read/convert the BCRs */ -void read_decode_cache_bcr(void) +static void read_decode_cache_bcr_arcv2(int cpu) { - struct cpuinfo_arc_cache *p_ic, *p_dc, *p_slc; - unsigned int cpu = smp_processor_id(); - struct bcr_cache { -#ifdef CONFIG_CPU_BIG_ENDIAN - unsigned int pad:12, line_len:4, sz:4, config:4, ver:8; -#else - unsigned int ver:8, config:4, sz:4, line_len:4, pad:12; -#endif - } ibcr, dbcr; - + struct cpuinfo_arc_cache *p_slc = &cpuinfo_arc700[cpu].slc; struct bcr_generic sbcr; struct bcr_slc_cfg { @@ -100,6 +94,31 @@ void read_decode_cache_bcr(void) #endif } cbcr; + READ_BCR(ARC_REG_SLC_BCR, sbcr); + if (sbcr.ver) { + READ_BCR(ARC_REG_SLC_CFG, slc_cfg); + p_slc->ver = sbcr.ver; + p_slc->sz_k = 128 << slc_cfg.sz; + l2_line_sz = p_slc->line_len = (slc_cfg.lsz == 0) ? 128 : 64; + } + + READ_BCR(ARC_REG_CLUSTER_BCR, cbcr); + if (cbcr.c && ioc_enable) + ioc_exists = 1; +} + +void read_decode_cache_bcr(void) +{ + struct cpuinfo_arc_cache *p_ic, *p_dc; + unsigned int cpu = smp_processor_id(); + struct bcr_cache { +#ifdef CONFIG_CPU_BIG_ENDIAN + unsigned int pad:12, line_len:4, sz:4, config:4, ver:8; +#else + unsigned int ver:8, config:4, sz:4, line_len:4, pad:12; +#endif + } ibcr, dbcr; + p_ic = &cpuinfo_arc700[cpu].icache; READ_BCR(ARC_REG_IC_BCR, ibcr); @@ -142,21 +161,8 @@ dc_chk: p_dc->ver = dbcr.ver; slc_chk: - if (!is_isa_arcv2()) - return; - - p_slc = &cpuinfo_arc700[cpu].slc; - READ_BCR(ARC_REG_SLC_BCR, sbcr); - if (sbcr.ver) { - READ_BCR(ARC_REG_SLC_CFG, slc_cfg); - p_slc->ver = sbcr.ver; - p_slc->sz_k = 128 << slc_cfg.sz; - l2_line_sz = p_slc->line_len = (slc_cfg.lsz == 0) ? 128 : 64; - } - - READ_BCR(ARC_REG_CLUSTER_BCR, cbcr); - if (cbcr.c && ioc_enable) - ioc_exists = 1; + if (is_isa_arcv2()) + read_decode_cache_bcr_arcv2(cpu); } /* diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c index e039fac..29a46bb 100644 --- a/arch/arc/mm/dma.c +++ b/arch/arc/mm/dma.c @@ -65,7 +65,7 @@ void *dma_alloc_coherent(struct device *dev, size_t size, * -For coherent data, Read/Write to buffers terminate early in cache * (vs. 
always going to memory - thus are faster) */ - if (ioc_exists) + if (is_isa_arcv2() && ioc_exists) return dma_alloc_noncoherent(dev, size, dma_handle, gfp); /* This is linear addr (0x8000_0000 based) */ @@ -100,7 +100,7 @@ EXPORT_SYMBOL(dma_alloc_coherent); void dma_free_coherent(struct device *dev, size_t size, void *kvaddr, dma_addr_t dma_handle) { - if (ioc_exists) + if (is_isa_arcv2() && ioc_exists) return dma_free_noncoherent(dev, size, kvaddr, dma_handle); iounmap((void __force __iomem *)kvaddr); -- cgit v0.10.2 From fb7c57255168d34ae34300bcf78f50aebdeae4dc Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Mon, 24 Aug 2015 13:37:01 +0300 Subject: ARC: perf: cap the number of counters to hardware max of 32 The number of counters in PCT can never be more than 32 (while countable conditions could be 100+) for both ARCompact and ARCv2. And while at it, update copyright dates. Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Signed-off-by: Vineet Gupta diff --git a/arch/arc/include/asm/perf_event.h b/arch/arc/include/asm/perf_event.h index e2eaf6f..3c9bf28 100644 --- a/arch/arc/include/asm/perf_event.h +++ b/arch/arc/include/asm/perf_event.h @@ -1,6 +1,7 @@ /* * Linux performance counter support for ARC * + * Copyright (C) 2014-2015 Synopsys, Inc. (www.synopsys.com) * Copyright (C) 2011-2013 Synopsys, Inc. (www.synopsys.com) * * This program is free software; you can redistribute it and/or modify @@ -12,8 +13,8 @@ #ifndef __ASM_PERF_EVENT_H #define __ASM_PERF_EVENT_H -/* real maximum varies per CPU, this is the maximum supported by the driver */ -#define ARC_PMU_MAX_HWEVENTS 64 +/* Max number of counters that PCT block may ever have */ +#define ARC_PERF_MAX_COUNTERS 32 #define ARC_REG_CC_BUILD 0xF6 #define ARC_REG_CC_INDEX 0x240 diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index 79ab199..c5554373 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -1,7 +1,7 @@ /* * Linux performance counter support for ARC700 series * - * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013-2015 Synopsys, Inc. (www.synopsys.com) * * This code is inspired by the perf support of various other architectures. * @@ -22,7 +22,7 @@ struct arc_pmu { struct pmu pmu; int counter_size; /* in bits */ int n_counters; - unsigned long used_mask[BITS_TO_LONGS(ARC_PMU_MAX_HWEVENTS)]; + unsigned long used_mask[BITS_TO_LONGS(ARC_PERF_MAX_COUNTERS)]; int ev_hw_idx[PERF_COUNT_ARC_HW_MAX]; }; @@ -284,7 +284,7 @@ static int arc_pmu_device_probe(struct platform_device *pdev) pr_err("This core does not have performance counters!\n"); return -ENODEV; } - BUG_ON(pct_bcr.c > ARC_PMU_MAX_HWEVENTS); + BUG_ON(pct_bcr.c > ARC_PERF_MAX_COUNTERS); READ_BCR(ARC_REG_CC_BUILD, cc_bcr); BUG_ON(!cc_bcr.v); /* Counters exist but No countable conditions ? */ -- cgit v0.10.2 From 1fe8bfa5ff3b2e97f26add89b20768fb7c4188c0 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Mon, 24 Aug 2015 13:42:27 +0300 Subject: ARCv2: perf: implement "event_set_period" This generalization prepares for support of overflow interrupts. Hardware event counters on ARC work as follows: each counter counts from a programmed start value (set in ARC_REG_PCT_COUNT) to a limit value (set in ARC_REG_PCT_INT_CNT), and once the limit value is reached the counter generates an interrupt.
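In register terms the scheme is (a condensed sketch of what the following paragraphs elaborate; the ARC_REG_PCT_INT_CNT* names are only introduced by the overflow-interrupt patch later in this series, and each value is programmed as a 32-bit register pair):

---------->8-----------
/* select counter 'idx' and program its start value */
write_aux_reg(ARC_REG_PCT_INDEX, idx);
write_aux_reg(ARC_REG_PCT_COUNTL, (u32)start);
write_aux_reg(ARC_REG_PCT_COUNTH, start >> 32);

/* program the limit at which the counter raises its IRQ */
write_aux_reg(ARC_REG_PCT_INT_CNTL, (u32)limit);
write_aux_reg(ARC_REG_PCT_INT_CNTH, limit >> 32);

/* the counter does not stop at 'limit': it keeps counting until
 * the kernel explicitly stops it (see the discussion below) */
---------->8-----------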
Even though this hardware implementation allows for more flexibility, in the Linux kernel we decided to mimic the behavior of other architectures this way: [1] Set the limit value as half of the counter's max value (to allow the counter to keep running after reaching its limit, see below for more explanation): ---------->8----------- arc_pmu->max_period = (1ULL << counter_size) / 2 - 1ULL; ---------->8----------- [2] Set the start value as "arc_pmu->max_period - sample_period" and then count up to the limit. Our event counters don't stop on reaching the max value (the one we set in ARC_REG_PCT_INT_CNT) but continue to count until the kernel explicitly stops each of them. Setting the limit as half of the counter's capacity allows capturing the additional events that occur between the moment the interrupt is triggered and the moment we actually process the PMU interrupt. That way we're trying to be more precise. For example, if we count CPU cycles we keep track of cycles spent running through generic IRQ handling code: [1] We set the counter period as, say, 100_000 events of type "crun" [2] The counter reaches that limit and raises its interrupt [3] Once we get into the PMU IRQ handler we read the current counter value from ARC_REG_PCT_SNAP and see something like 105_000. If counters stopped on reaching their limit value we would miss the additional 5000 cycles. Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index c5554373..3626c56 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -20,9 +20,9 @@ struct arc_pmu { struct pmu pmu; - int counter_size; /* in bits */ int n_counters; unsigned long used_mask[BITS_TO_LONGS(ARC_PERF_MAX_COUNTERS)]; + u64 max_period; int ev_hw_idx[PERF_COUNT_ARC_HW_MAX]; }; @@ -88,18 +88,15 @@ static uint64_t arc_pmu_read_counter(int idx) static void arc_perf_event_update(struct perf_event *event, struct hw_perf_event *hwc, int idx) { - uint64_t prev_raw_count, new_raw_count; - int64_t delta; - - do { - prev_raw_count = local64_read(&hwc->prev_count); - new_raw_count = arc_pmu_read_counter(idx); - } while (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count); - - delta = (new_raw_count - prev_raw_count) & - ((1ULL << arc_pmu->counter_size) - 1ULL); + uint64_t prev_raw_count = local64_read(&hwc->prev_count); + uint64_t new_raw_count = arc_pmu_read_counter(idx); + int64_t delta = new_raw_count - prev_raw_count; + /* + * We need not worry about hwc->prev_count changing beneath our feet + * because there's no way for us to re-enter this function anytime.
+ */ + local64_set(&hwc->prev_count, new_raw_count); local64_add(delta, &event->count); local64_sub(delta, &hwc->period_left); } @@ -142,6 +139,10 @@ static int arc_pmu_event_init(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int ret; + hwc->sample_period = arc_pmu->max_period; + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + switch (event->attr.type) { case PERF_TYPE_HARDWARE: if (event->attr.config >= PERF_COUNT_HW_MAX) @@ -153,6 +154,7 @@ (int) event->attr.config, (int) hwc->config, arc_pmu_ev_hw_map[event->attr.config]); return 0; + case PERF_TYPE_HW_CACHE: ret = arc_pmu_cache_event(event->attr.config); if (ret < 0) @@ -180,6 +182,47 @@ static void arc_pmu_disable(struct pmu *pmu) write_aux_reg(ARC_REG_PCT_CONTROL, (tmp & 0xffff0000) | 0x0); } +static int arc_pmu_event_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + s64 left = local64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int idx = hwc->idx; + int overflow = 0; + u64 value; + + if (unlikely(left <= -period)) { + /* left underflowed by more than period. */ + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } else if (unlikely(left <= 0)) { + /* left underflowed by less than period. */ + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + if (left > arc_pmu->max_period) + left = arc_pmu->max_period; + + value = arc_pmu->max_period - left; + local64_set(&hwc->prev_count, value); + + /* Select counter */ + write_aux_reg(ARC_REG_PCT_INDEX, idx); + + /* Write value */ + write_aux_reg(ARC_REG_PCT_COUNTL, (u32)value); + write_aux_reg(ARC_REG_PCT_COUNTH, (value >> 32)); + + perf_event_update_userpage(event); + + return overflow; +} + /* * Assigns hardware counter to hardware condition. * Note that there is no separate start/stop mechanism; @@ -194,9 +237,11 @@ static void arc_pmu_start(struct perf_event *event, int flags) return; if (flags & PERF_EF_RELOAD) - WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + + hwc->state = 0; - event->hw.state = 0; + arc_pmu_event_set_period(event); /* enable ARC pmu here */ write_aux_reg(ARC_REG_PCT_INDEX, idx); /* counter # */ write_aux_reg(ARC_REG_PCT_CONFIG, hwc->config); /* condition */ @@ -269,6 +314,7 @@ static int arc_pmu_device_probe(struct platform_device *pdev) struct arc_reg_pct_build pct_bcr; struct arc_reg_cc_build cc_bcr; int i, j; + int counter_size; /* in bits */ union cc_name { struct { @@ -294,10 +340,11 @@ return -ENOMEM; arc_pmu->n_counters = pct_bcr.c; - arc_pmu->counter_size = 32 + (pct_bcr.s << 4); + counter_size = 32 + (pct_bcr.s << 4); + arc_pmu->max_period = (1ULL << counter_size) / 2 - 1ULL; pr_info("ARC perf\t: %d counters (%d bits), %d countable conditions\n", - arc_pmu->n_counters, arc_pmu->counter_size, cc_bcr.c); + arc_pmu->n_counters, counter_size, cc_bcr.c); cc_name.str[8] = 0; for (i = 0; i < PERF_COUNT_ARC_HW_MAX; i++) -- cgit v0.10.2 From 36481cf7fbcc666699d54cb267088d2b415ff164 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Mon, 24 Aug 2015 13:48:06 +0300 Subject: ARCv2: perf: Support sampling events using overflow interrupts In the days of ARC 700, performance counters didn't have interrupt support, and so for ARC we only had support for non-sampling events. Put simply, only "perf stat" was functional.
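(Until now that hardware limitation was advertised unconditionally to the perf core, which is what confined ARC to counting mode - a sketch; the probe code in this patch makes it conditional on the new capability bit:)

---------->8-----------
if (!has_interrupts)	/* ARC 700, or HS built without the PCT IRQ */
	arc_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
---------->8-----------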
Now with ARC HS we have interrupt support in the performance counters, which this change adds support for. ARC performance counters act in the following way with regard to interrupt generation: [1] A counter counts starting from the value set in the PCT_COUNT register pair [2] Once the counter reaches the value set in PCT_INT_CNT an interrupt is raised The basic setup looks like this: [1] PCT_COUNT = 0; [2] PCT_INT_CNT = __limit_value__; [3] Enable interrupts for that counter and let it run [4] Let the counter reach its limit [5] Handle the interrupt when it happens Note that the PCT HW block is built into the CPU core and so its interrupt line (which is basically the OR of all counters' IRQs) is wired directly to the top-level IRQC. That means that to de-assert the PCT interrupt it's required to reset the IRQs from all counters that have reached their limit values. Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta diff --git a/arch/arc/include/asm/perf_event.h b/arch/arc/include/asm/perf_event.h index 3c9bf28..3522d09 100644 --- a/arch/arc/include/asm/perf_event.h +++ b/arch/arc/include/asm/perf_event.h @@ -29,15 +29,19 @@ #define ARC_REG_PCT_CONFIG 0x254 #define ARC_REG_PCT_CONTROL 0x255 #define ARC_REG_PCT_INDEX 0x256 +#define ARC_REG_PCT_INT_CNTL 0x25C +#define ARC_REG_PCT_INT_CNTH 0x25D +#define ARC_REG_PCT_INT_CTRL 0x25E +#define ARC_REG_PCT_INT_ACT 0x25F #define ARC_REG_PCT_CONTROL_CC (1 << 16) /* clear counts */ #define ARC_REG_PCT_CONTROL_SN (1 << 17) /* snapshot */ struct arc_reg_pct_build { #ifdef CONFIG_CPU_BIG_ENDIAN - unsigned int m:8, c:8, r:6, s:2, v:8; + unsigned int m:8, c:8, r:5, i:1, s:2, v:8; #else - unsigned int v:8, s:2, r:6, c:8, m:8; + unsigned int v:8, s:2, i:1, r:5, c:8, m:8; #endif }; diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index 3626c56..208c954 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -11,6 +11,7 @@ * */ #include +#include #include #include #include @@ -24,6 +25,7 @@ struct arc_pmu { unsigned long used_mask[BITS_TO_LONGS(ARC_PERF_MAX_COUNTERS)]; u64 max_period; int ev_hw_idx[PERF_COUNT_ARC_HW_MAX]; + struct perf_event *act_counter[ARC_PERF_MAX_COUNTERS]; }; struct arc_callchain_trace { @@ -139,9 +141,11 @@ static int arc_pmu_event_init(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int ret; - hwc->sample_period = arc_pmu->max_period; - hwc->last_period = hwc->sample_period; - local64_set(&hwc->period_left, hwc->sample_period); + if (!is_sampling_event(event)) { + hwc->sample_period = arc_pmu->max_period; + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + } switch (event->attr.type) { case PERF_TYPE_HARDWARE: @@ -243,6 +247,11 @@ static void arc_pmu_start(struct perf_event *event, int flags) arc_pmu_event_set_period(event); + /* Enable interrupt for this counter */ + if (is_sampling_event(event)) + write_aux_reg(ARC_REG_PCT_INT_CTRL, + read_aux_reg(ARC_REG_PCT_INT_CTRL) | (1 << idx)); + /* enable ARC pmu here */ write_aux_reg(ARC_REG_PCT_INDEX, idx); /* counter # */ write_aux_reg(ARC_REG_PCT_CONFIG, hwc->config); /* condition */ @@ -253,6 +262,17 @@ static void arc_pmu_stop(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; + /* Disable interrupt for this counter */ + if (is_sampling_event(event)) { + /* + * Reset the interrupt flag by writing 1. This is required + * to make sure a pending interrupt was not left.
+ */ + write_aux_reg(ARC_REG_PCT_INT_ACT, 1 << idx); + write_aux_reg(ARC_REG_PCT_INT_CTRL, + read_aux_reg(ARC_REG_PCT_INT_CTRL) & ~(1 << idx)); + } + if (!(event->hw.state & PERF_HES_STOPPED)) { /* stop ARC pmu here */ write_aux_reg(ARC_REG_PCT_INDEX, idx); @@ -275,6 +295,8 @@ static void arc_pmu_del(struct perf_event *event, int flags) arc_pmu_stop(event, PERF_EF_UPDATE); __clear_bit(event->hw.idx, arc_pmu->used_mask); + arc_pmu->act_counter[event->hw.idx] = 0; + perf_event_update_userpage(event); } @@ -295,6 +317,16 @@ static int arc_pmu_add(struct perf_event *event, int flags) } write_aux_reg(ARC_REG_PCT_INDEX, idx); + + arc_pmu->act_counter[idx] = event; + + if (is_sampling_event(event)) { + /* Mimic full counter overflow as other arches do */ + write_aux_reg(ARC_REG_PCT_INT_CNTL, (u32)arc_pmu->max_period); + write_aux_reg(ARC_REG_PCT_INT_CNTH, + (arc_pmu->max_period >> 32)); + } + write_aux_reg(ARC_REG_PCT_CONFIG, 0); write_aux_reg(ARC_REG_PCT_COUNTL, 0); write_aux_reg(ARC_REG_PCT_COUNTH, 0); @@ -309,11 +341,70 @@ static int arc_pmu_add(struct perf_event *event, int flags) return 0; } +#ifdef CONFIG_ISA_ARCV2 +static irqreturn_t arc_pmu_intr(int irq, void *dev) +{ + struct perf_sample_data data; + struct arc_pmu *arc_pmu = (struct arc_pmu *)dev; + struct pt_regs *regs; + int active_ints; + int idx; + + arc_pmu_disable(&arc_pmu->pmu); + + active_ints = read_aux_reg(ARC_REG_PCT_INT_ACT); + + regs = get_irq_regs(); + + for (idx = 0; idx < arc_pmu->n_counters; idx++) { + struct perf_event *event = arc_pmu->act_counter[idx]; + struct hw_perf_event *hwc; + + if (!(active_ints & (1 << idx))) + continue; + + /* Reset interrupt flag by writing of 1 */ + write_aux_reg(ARC_REG_PCT_INT_ACT, 1 << idx); + + /* + * On reset of "interrupt active" bit corresponding + * "interrupt enable" bit gets automatically reset as well. + * Now we need to re-enable interrupt for the counter. + */ + write_aux_reg(ARC_REG_PCT_INT_CTRL, + read_aux_reg(ARC_REG_PCT_INT_CTRL) | (1 << idx)); + + hwc = &event->hw; + + WARN_ON_ONCE(hwc->idx != idx); + + arc_perf_event_update(event, &event->hw, event->hw.idx); + perf_sample_data_init(&data, 0, hwc->last_period); + if (!arc_pmu_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + arc_pmu_stop(event, 0); + } + + arc_pmu_enable(&arc_pmu->pmu); + + return IRQ_HANDLED; +} +#else + +static irqreturn_t arc_pmu_intr(int irq, void *dev) +{ + return IRQ_NONE; +} + +#endif /* CONFIG_ISA_ARCV2 */ + static int arc_pmu_device_probe(struct platform_device *pdev) { struct arc_reg_pct_build pct_bcr; struct arc_reg_cc_build cc_bcr; - int i, j; + int i, j, has_interrupts; int counter_size; /* in bits */ union cc_name { @@ -339,12 +430,16 @@ static int arc_pmu_device_probe(struct platform_device *pdev) if (!arc_pmu) return -ENOMEM; + has_interrupts = is_isa_arcv2() ? pct_bcr.i : 0; + arc_pmu->n_counters = pct_bcr.c; counter_size = 32 + (pct_bcr.s << 4); + arc_pmu->max_period = (1ULL << counter_size) / 2 - 1ULL; - pr_info("ARC perf\t: %d counters (%d bits), %d countable conditions\n", - arc_pmu->n_counters, counter_size, cc_bcr.c); + pr_info("ARC perf\t: %d counters (%d bits), %d conditions%s\n", + arc_pmu->n_counters, counter_size, cc_bcr.c, + has_interrupts ? 
", [overflow IRQ support]":""); cc_name.str[8] = 0; for (i = 0; i < PERF_COUNT_ARC_HW_MAX; i++) @@ -379,8 +474,25 @@ static int arc_pmu_device_probe(struct platform_device *pdev) .read = arc_pmu_read, }; - /* ARC 700 PMU does not support sampling events */ - arc_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; + if (has_interrupts) { + int irq = platform_get_irq(pdev, 0); + + if (irq < 0) { + pr_err("Cannot get IRQ number for the platform\n"); + return -ENODEV; + } + + ret = devm_request_irq(&pdev->dev, irq, arc_pmu_intr, 0, + "arc-pmu", arc_pmu); + if (ret) { + pr_err("could not allocate PMU IRQ\n"); + return ret; + } + + /* Clean all pending interrupt flags */ + write_aux_reg(ARC_REG_PCT_INT_ACT, 0xffffffff); + } else + arc_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; return perf_pmu_register(&arc_pmu->pmu, pdev->name, PERF_TYPE_RAW); } -- cgit v0.10.2 From e6b1d126bb748103824087189e30febc88c4db73 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Mon, 24 Aug 2015 13:53:36 +0300 Subject: ARCv2: perf: implement exclusion of event counting in user or kernel mode Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta diff --git a/arch/arc/include/asm/perf_event.h b/arch/arc/include/asm/perf_event.h index 3522d09..5824ab4 100644 --- a/arch/arc/include/asm/perf_event.h +++ b/arch/arc/include/asm/perf_event.h @@ -34,6 +34,9 @@ #define ARC_REG_PCT_INT_CTRL 0x25E #define ARC_REG_PCT_INT_ACT 0x25F +#define ARC_REG_PCT_CONFIG_USER (1 << 18) /* count in user mode */ +#define ARC_REG_PCT_CONFIG_KERN (1 << 19) /* count in kernel mode */ + #define ARC_REG_PCT_CONTROL_CC (1 << 16) /* clear counts */ #define ARC_REG_PCT_CONTROL_SN (1 << 17) /* snapshot */ diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index 208c954..5b94ceb 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -147,13 +147,25 @@ static int arc_pmu_event_init(struct perf_event *event) local64_set(&hwc->period_left, hwc->sample_period); } + hwc->config = 0; + + if (is_isa_arcv2()) { + /* "exclude user" means "count only kernel" */ + if (event->attr.exclude_user) + hwc->config |= ARC_REG_PCT_CONFIG_KERN; + + /* "exclude kernel" means "count only user" */ + if (event->attr.exclude_kernel) + hwc->config |= ARC_REG_PCT_CONFIG_USER; + } + switch (event->attr.type) { case PERF_TYPE_HARDWARE: if (event->attr.config >= PERF_COUNT_HW_MAX) return -ENOENT; if (arc_pmu->ev_hw_idx[event->attr.config] < 0) return -ENOENT; - hwc->config = arc_pmu->ev_hw_idx[event->attr.config]; + hwc->config |= arc_pmu->ev_hw_idx[event->attr.config]; pr_debug("init event %d with h/w %d \'%s\'\n", (int) event->attr.config, (int) hwc->config, arc_pmu_ev_hw_map[event->attr.config]); @@ -163,7 +175,7 @@ static int arc_pmu_event_init(struct perf_event *event) ret = arc_pmu_cache_event(event->attr.config); if (ret < 0) return ret; - hwc->config = arc_pmu->ev_hw_idx[ret]; + hwc->config |= arc_pmu->ev_hw_idx[ret]; return 0; default: return -ENOENT; -- cgit v0.10.2 From e525c37f8413b19130d0499c7467fed45a94579b Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Mon, 24 Aug 2015 14:03:30 +0300 Subject: ARCv2: perf: SMP support * split off pmu info into singleton and per-cpu bits * setup PMU on all cores Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index 5b94ceb..7430652 100644 --- a/arch/arc/kernel/perf_event.c +++ 
b/arch/arc/kernel/perf_event.c @@ -21,10 +21,22 @@ struct arc_pmu { struct pmu pmu; + unsigned int irq; int n_counters; - unsigned long used_mask[BITS_TO_LONGS(ARC_PERF_MAX_COUNTERS)]; u64 max_period; int ev_hw_idx[PERF_COUNT_ARC_HW_MAX]; +}; + +struct arc_pmu_cpu { + /* + * A 1 bit for an index indicates that the counter is being used for + * an event. A 0 means that the counter can be used. + */ + unsigned long used_mask[BITS_TO_LONGS(ARC_PERF_MAX_COUNTERS)]; + + /* + * The events that are active on the PMU for the given index. + */ struct perf_event *act_counter[ARC_PERF_MAX_COUNTERS]; }; @@ -67,6 +79,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) } static struct arc_pmu *arc_pmu; +static DEFINE_PER_CPU(struct arc_pmu_cpu, arc_pmu_cpu); /* read counter #idx; note that counter# != event# on ARC! */ static uint64_t arc_pmu_read_counter(int idx) @@ -304,10 +317,12 @@ static void arc_pmu_stop(struct perf_event *event, int flags) static void arc_pmu_del(struct perf_event *event, int flags) { + struct arc_pmu_cpu *pmu_cpu = this_cpu_ptr(&arc_pmu_cpu); + arc_pmu_stop(event, PERF_EF_UPDATE); - __clear_bit(event->hw.idx, arc_pmu->used_mask); + __clear_bit(event->hw.idx, pmu_cpu->used_mask); - arc_pmu->act_counter[event->hw.idx] = 0; + pmu_cpu->act_counter[event->hw.idx] = 0; perf_event_update_userpage(event); } @@ -315,22 +330,23 @@ static void arc_pmu_del(struct perf_event *event, int flags) /* allocate hardware counter and optionally start counting */ static int arc_pmu_add(struct perf_event *event, int flags) { + struct arc_pmu_cpu *pmu_cpu = this_cpu_ptr(&arc_pmu_cpu); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - if (__test_and_set_bit(idx, arc_pmu->used_mask)) { - idx = find_first_zero_bit(arc_pmu->used_mask, + if (__test_and_set_bit(idx, pmu_cpu->used_mask)) { + idx = find_first_zero_bit(pmu_cpu->used_mask, arc_pmu->n_counters); if (idx == arc_pmu->n_counters) return -EAGAIN; - __set_bit(idx, arc_pmu->used_mask); + __set_bit(idx, pmu_cpu->used_mask); hwc->idx = idx; } write_aux_reg(ARC_REG_PCT_INDEX, idx); - arc_pmu->act_counter[idx] = event; + pmu_cpu->act_counter[idx] = event; if (is_sampling_event(event)) { /* Mimic full counter overflow as other arches do */ @@ -357,7 +373,7 @@ static int arc_pmu_add(struct perf_event *event, int flags) static irqreturn_t arc_pmu_intr(int irq, void *dev) { struct perf_sample_data data; - struct arc_pmu *arc_pmu = (struct arc_pmu *)dev; + struct arc_pmu_cpu *pmu_cpu = this_cpu_ptr(&arc_pmu_cpu); struct pt_regs *regs; int active_ints; int idx; @@ -369,7 +385,7 @@ static irqreturn_t arc_pmu_intr(int irq, void *dev) regs = get_irq_regs(); for (idx = 0; idx < arc_pmu->n_counters; idx++) { - struct perf_event *event = arc_pmu->act_counter[idx]; + struct perf_event *event = pmu_cpu->act_counter[idx]; struct hw_perf_event *hwc; if (!(active_ints & (1 << idx))) @@ -412,6 +428,17 @@ static irqreturn_t arc_pmu_intr(int irq, void *dev) #endif /* CONFIG_ISA_ARCV2 */ +void arc_cpu_pmu_irq_init(void) +{ + struct arc_pmu_cpu *pmu_cpu = this_cpu_ptr(&arc_pmu_cpu); + + arc_request_percpu_irq(arc_pmu->irq, smp_processor_id(), arc_pmu_intr, + "ARC perf counters", pmu_cpu); + + /* Clear all pending interrupt flags */ + write_aux_reg(ARC_REG_PCT_INT_ACT, 0xffffffff); +} + static int arc_pmu_device_probe(struct platform_device *pdev) { struct arc_reg_pct_build pct_bcr; @@ -488,18 +515,30 @@ static int arc_pmu_device_probe(struct platform_device *pdev) if (has_interrupts) { int irq = platform_get_irq(pdev, 0); + unsigned 
long flags; if (irq < 0) { pr_err("Cannot get IRQ number for the platform\n"); return -ENODEV; } - ret = devm_request_irq(&pdev->dev, irq, arc_pmu_intr, 0, - "arc-pmu", arc_pmu); - if (ret) { - pr_err("could not allocate PMU IRQ\n"); - return ret; - } + arc_pmu->irq = irq; + + /* + * arc_cpu_pmu_irq_init() needs to be called on all cores for + * their respective local PMU. + * However we use opencoded on_each_cpu() to ensure it is called + * on core0 first, so that arc_request_percpu_irq() sets up + * AUTOEN etc. Otherwise enable_percpu_irq() fails to enable + * perf IRQ on non master cores. + * see arc_request_percpu_irq() + */ + preempt_disable(); + local_irq_save(flags); + arc_cpu_pmu_irq_init(); + local_irq_restore(flags); + smp_call_function((smp_call_func_t)arc_cpu_pmu_irq_init, 0, 1); + preempt_enable(); /* Clean all pending interrupt flags */ write_aux_reg(ARC_REG_PCT_INT_ACT, 0xffffffff); -- cgit v0.10.2 From 9b28829d6da391f67a76dbba07a167e2b554bd10 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Tue, 18 Nov 2014 17:36:11 +0530 Subject: ARCv2: perf: Finally introduce HS perf unit With all features in place, the ARC HS pct block can now be probed and used. Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta diff --git a/Documentation/devicetree/bindings/arc/archs-pct.txt b/Documentation/devicetree/bindings/arc/archs-pct.txt new file mode 100644 index 0000000..1ae98b87 --- /dev/null +++ b/Documentation/devicetree/bindings/arc/archs-pct.txt @@ -0,0 +1,17 @@ +* ARC HS Performance Counters + +The ARC HS can be configured with a pipeline performance monitor for counting +CPU and cache events like cache misses and hits. Like the conventional PCT there +are 100+ hardware conditions dynamically mapped to up to 32 counters. +It also supports overflow interrupts.
+ +Required properties: + +- compatible : should contain + "snps,archs-pct" + +Example: + +pmu { + compatible = "snps,archs-pct"; +}; diff --git a/MAINTAINERS b/MAINTAINERS index d7ab736..d4cda42 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9874,7 +9874,7 @@ SYNOPSYS ARC ARCHITECTURE M: Vineet Gupta S: Supported F: arch/arc/ -F: Documentation/devicetree/bindings/arc/ +F: Documentation/devicetree/bindings/arc/* F: drivers/tty/serial/arc_uart.c T: git git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git diff --git a/arch/arc/include/asm/perf_event.h b/arch/arc/include/asm/perf_event.h index 5824ab4..5f07176 100644 --- a/arch/arc/include/asm/perf_event.h +++ b/arch/arc/include/asm/perf_event.h @@ -105,8 +105,11 @@ static const char * const arc_pmu_ev_hw_map[] = { [PERF_COUNT_HW_INSTRUCTIONS] = "iall", [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmp", /* Excludes ZOL jumps */ [PERF_COUNT_ARC_BPOK] = "bpok", /* NP-NT, PT-T, PNT-NT */ +#ifdef CONFIG_ISA_ARCV2 + [PERF_COUNT_HW_BRANCH_MISSES] = "bpmp", +#else [PERF_COUNT_HW_BRANCH_MISSES] = "bpfail", /* NP-T, PT-NT, PNT-T */ - +#endif [PERF_COUNT_ARC_LDC] = "imemrdc", /* Instr: mem read cached */ [PERF_COUNT_ARC_STC] = "imemwrc", /* Instr: mem write cached */ diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index 7430652..0c08bb1 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -551,6 +551,7 @@ static int arc_pmu_device_probe(struct platform_device *pdev) #ifdef CONFIG_OF static const struct of_device_id arc_pmu_match[] = { { .compatible = "snps,arc700-pct" }, + { .compatible = "snps,archs-pct" }, {}, }; MODULE_DEVICE_TABLE(of, arc_pmu_match); @@ -558,7 +559,7 @@ MODULE_DEVICE_TABLE(of, arc_pmu_match); static struct platform_driver arc_pmu_driver = { .driver = { - .name = "arc700-pct", + .name = "arc-pct", .of_match_table = of_match_ptr(arc_pmu_match), }, .probe = arc_pmu_device_probe, -- cgit v0.10.2 From 3d5926599a6bc551efc0c8b244469a711f0d0166 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 27 Aug 2015 16:25:07 +0530 Subject: ARCv2: entry: Fix reserved handler Signed-off-by: Vineet Gupta diff --git a/arch/arc/kernel/entry-arcv2.S b/arch/arc/kernel/entry-arcv2.S index bd7105d..8fa7656 100644 --- a/arch/arc/kernel/entry-arcv2.S +++ b/arch/arc/kernel/entry-arcv2.S @@ -57,13 +57,8 @@ VECTOR handle_interrupt ; (23) End of fixed IRQs .section .text, "ax",@progbits -res_service: ; processor restart - flag 0x1 ; not implemented - nop - nop - -reserved: ; processor restart - rtie ; jump to processor initializations +reserved: + flag 1 ; Unexpected event, halt ;##################### Interrupt Handling ############################## -- cgit v0.10.2
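For reference, the net effect of that last fix (a sketch of the resulting stub): every unexpected vector now funnels into a handler that halts the core rather than blindly executing rtie into an uninitialized context:

---------->8-----------
reserved:			; all unconfigured/unexpected vectors land here
	flag	1		; set STATUS32.H - halt the core, making the
				; bogus event immediately visible to a debugger
---------->8-----------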