From f872f5400cc01373d8e29d9c7a5296ccfaf4ccf3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:19 -0800 Subject: mm: Add a vm_special_mapping.fault() method Requiring special mappings to give a list of struct pages is inflexible: it prevents sane use of IO memory in a special mapping, it's inefficient (it requires arch code to initialize a list of struct pages, and it requires the mm core to walk the entire list just to figure out how long it is), and it prevents arch code from doing anything fancy when a special mapping fault occurs. Add a .fault method as an alternative to filling in a .pages array. Looks-OK-to: Andrew Morton Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a26d1677c0bc7e774c33f469451a78ca31e9e6af.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f8d1492..c88e48a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -568,10 +568,26 @@ static inline void clear_tlb_flush_pending(struct mm_struct *mm) } #endif -struct vm_special_mapping -{ - const char *name; +struct vm_fault; + +struct vm_special_mapping { + const char *name; /* The name, e.g. "[vdso]". */ + + /* + * If .fault is not provided, this points to a + * NULL-terminated array of pages that back the special mapping. + * + * This must not be NULL unless .fault is provided. + */ struct page **pages; + + /* + * If non-NULL, then this is called to resolve page faults + * on the special mapping. If used, .pages is not checked. + */ + int (*fault)(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, + struct vm_fault *vmf); }; enum tlb_flush_reason { diff --git a/mm/mmap.c b/mm/mmap.c index 2ce04a6..f717453 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3030,11 +3030,16 @@ static int special_mapping_fault(struct vm_area_struct *vma, pgoff_t pgoff; struct page **pages; - if (vma->vm_ops == &legacy_special_mapping_vmops) + if (vma->vm_ops == &legacy_special_mapping_vmops) { pages = vma->vm_private_data; - else - pages = ((struct vm_special_mapping *)vma->vm_private_data)-> - pages; + } else { + struct vm_special_mapping *sm = vma->vm_private_data; + + if (sm->fault) + return sm->fault(sm, vma, vmf); + + pages = sm->pages; + } for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) pgoff--; -- cgit v0.10.2 From 1745cbc5d0dee0749a6bc0ea8e872c5db0074061 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:20 -0800 Subject: mm: Add vm_insert_pfn_prot() The x86 vvar vma contains pages with differing cacheability flags. x86 currently implements this by manually inserting all the ptes using (io_)remap_pfn_range when the vma is set up. x86 wants to move to using .fault with VM_FAULT_NOPAGE to set up the mappings as needed. The correct API to use to insert a pfn in .fault is vm_insert_pfn(), but vm_insert_pfn() can't override the vma's cache mode, and the HPET page in particular needs to be uncached despite the fact that the rest of the VMA is cached. Add vm_insert_pfn_prot() to support varying cacheability within the same non-COW VMA in a more sane manner. x86 could alternatively use multiple VMAs, but that's messy, would break CRIU, and would create unnecessary VMAs that would waste memory. 
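A minimal sketch of how the two new interfaces combine: a special-mapping .fault handler that inserts a single uncached IO page with vm_insert_pfn_prot(). This is illustrative only and not part of the series; example_fault, example_mapping and example_phys_addr are invented names, and the physical address is assumed to be supplied by platform code.

#include <linux/mm.h>
#include <linux/mm_types.h>

static phys_addr_t example_phys_addr;	/* assumed: filled in by platform code */

static int example_fault(const struct vm_special_mapping *sm,
			 struct vm_area_struct *vma,
			 struct vm_fault *vmf)
{
	int ret;

	/* The mapping is a single page; anything else is a bad access. */
	if (vmf->pgoff != 0)
		return VM_FAULT_SIGBUS;

	/* Insert the (assumed) device page uncached and read-only. */
	ret = vm_insert_pfn_prot(vma, (unsigned long)vmf->virtual_address,
				 example_phys_addr >> PAGE_SHIFT,
				 pgprot_noncached(PAGE_READONLY));

	/* Same return convention the vvar fault handler uses later in this series. */
	if (ret == 0 || ret == -EBUSY)
		return VM_FAULT_NOPAGE;

	return VM_FAULT_SIGBUS;
}

static const struct vm_special_mapping example_mapping = {
	.name	= "[example]",
	.fault	= example_fault,
};

The corresponding VMA would be created with _install_special_mapping() and VM_PFNMAP|VM_IO|VM_DONTDUMP, which is exactly what the vvar mapping patch later in this series does.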
Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Acked-by: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d2938d1eb37be7a5e4f86182db646551f11e45aa.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/include/linux/mm.h b/include/linux/mm.h index 00bad77..87ef1d7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2080,6 +2080,8 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long addr, int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); +int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); diff --git a/mm/memory.c b/mm/memory.c index c387430..a29f0b9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1564,8 +1564,29 @@ out: int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { + return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_pfn); + +/** + * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * @pgprot: pgprot flags for the inserted page + * + * This is exactly like vm_insert_pfn, except that it allows drivers to + * to override pgprot on a per-page basis. + * + * This only makes sense for IO mappings, and it makes no sense for + * cow mappings. In general, using multiple vmas is preferable; + * vm_insert_pfn_prot should only be used if using multiple VMAs is + * impractical. + */ +int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot) +{ int ret; - pgprot_t pgprot = vma->vm_page_prot; /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like @@ -1587,7 +1608,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, return ret; } -EXPORT_SYMBOL(vm_insert_pfn); +EXPORT_SYMBOL(vm_insert_pfn_prot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) -- cgit v0.10.2 From 352b78c62f27b356b182008acd3117f3ee03ffd2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:21 -0800 Subject: x86/vdso: Track each mm's loaded vDSO image as well as its base As we start to do more intelligent things with the vDSO at runtime (as opposed to just at mm initialization time), we'll need to know which vDSO is in use. In principle, we could guess based on the mm type, but that's over-complicated and error-prone. Instead, just track it in the mmu context. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c99ac48681bad709ca7ad5ee899d9042a3af6b00.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index b8f69e2..80b0210 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -121,6 +121,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) text_start = addr - image->sym_vvar_start; current->mm->context.vdso = (void __user *)text_start; + current->mm->context.vdso_image = image; /* * MAYWRITE to allow gdb to COW and set breakpoints diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 55234d5..1ea0bae 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -19,7 +19,8 @@ typedef struct { #endif struct mutex lock; - void __user *vdso; + void __user *vdso; /* vdso base address */ + const struct vdso_image *vdso_image; /* vdso image in use */ atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */ } mm_context_t; -- cgit v0.10.2 From 05ef76b20fc4297b0d3f8a956f1c809a8a1b3f1d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:22 -0800 Subject: x86/vdso: Use .fault for the vDSO text mapping The old scheme for mapping the vDSO text is rather complicated. vdso2c generates a struct vm_special_mapping and a blank .pages array of the correct size for each vdso image. Init code in vdso/vma.c populates the .pages array for each vDSO image, and the mapping code selects the appropriate struct vm_special_mapping. With .fault, we can use a less roundabout approach: vdso_fault() just returns the appropriate page for the selected vDSO image. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f886954c186bafd74e1b967c8931d852ae199aa2.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 0224987..abe961c 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -150,16 +150,9 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, } fprintf(outfile, "\n};\n\n"); - fprintf(outfile, "static struct page *pages[%lu];\n\n", - mapping_size / 4096); - fprintf(outfile, "const struct vdso_image %s = {\n", name); fprintf(outfile, "\t.data = raw_data,\n"); fprintf(outfile, "\t.size = %lu,\n", mapping_size); - fprintf(outfile, "\t.text_mapping = {\n"); - fprintf(outfile, "\t\t.name = \"[vdso]\",\n"); - fprintf(outfile, "\t\t.pages = pages,\n"); - fprintf(outfile, "\t},\n"); if (alt_sec) { fprintf(outfile, "\t.alt = %lu,\n", (unsigned long)GET_LE(&alt_sec->sh_offset)); diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 80b0210..eb50d7c 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -27,13 +27,7 @@ unsigned int __read_mostly vdso64_enabled = 1; void __init init_vdso_image(const struct vdso_image *image) { - int i; - int npages = (image->size) / PAGE_SIZE; - BUG_ON(image->size % PAGE_SIZE != 0); - for (i = 0; i < npages; i++) - image->text_mapping.pages[i] = - virt_to_page(image->data + i*PAGE_SIZE); apply_alternatives((struct alt_instr *)(image->data + image->alt), (struct alt_instr *)(image->data + image->alt + @@ -90,6 +84,24 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) #endif } +static int vdso_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + const struct vdso_image *image = vma->vm_mm->context.vdso_image; + + if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size) + return VM_FAULT_SIGBUS; + + vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT)); + get_page(vmf->page); + return 0; +} + +static const struct vm_special_mapping text_mapping = { + .name = "[vdso]", + .fault = vdso_fault, +}; + static int map_vdso(const struct vdso_image *image, bool calculate_addr) { struct mm_struct *mm = current->mm; @@ -131,7 +143,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) image->size, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - &image->text_mapping); + &text_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index deabaf9..43dc55b 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -13,9 +13,6 @@ struct vdso_image { void *data; unsigned long size; /* Always a multiple of PAGE_SIZE */ - /* text_mapping.pages is big enough for data/size page pointers */ - struct vm_special_mapping text_mapping; - unsigned long alt, alt_len; long sym_vvar_start; /* Negative offset to the vvar area */ -- cgit v0.10.2 From a48a7042613eb1524d18b7b1ed7d3a6b611fd21f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:23 -0800 Subject: x86/vdso: Use ->fault() instead of remap_pfn_range() for the vvar mapping This is IMO much less ugly, and it also opens the door to disallowing unprivileged userspace HPET access on systems with usable TSCs. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c19c2909e5ee3c3d8742f916586676bb7c40345f.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index eb50d7c..4b5461b 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -102,18 +102,69 @@ static const struct vm_special_mapping text_mapping = { .fault = vdso_fault, }; +static int vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + const struct vdso_image *image = vma->vm_mm->context.vdso_image; + long sym_offset; + int ret = -EFAULT; + + if (!image) + return VM_FAULT_SIGBUS; + + sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) + + image->sym_vvar_start; + + /* + * Sanity check: a symbol offset of zero means that the page + * does not exist for this vdso image, not that the page is at + * offset zero relative to the text mapping. This should be + * impossible here, because sym_offset should only be zero for + * the page past the end of the vvar mapping. + */ + if (sym_offset == 0) + return VM_FAULT_SIGBUS; + + if (sym_offset == image->sym_vvar_page) { + ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, + __pa_symbol(&__vvar_page) >> PAGE_SHIFT); + } else if (sym_offset == image->sym_hpet_page) { +#ifdef CONFIG_HPET_TIMER + if (hpet_address) { + ret = vm_insert_pfn_prot( + vma, + (unsigned long)vmf->virtual_address, + hpet_address >> PAGE_SHIFT, + pgprot_noncached(PAGE_READONLY)); + } +#endif + } else if (sym_offset == image->sym_pvclock_page) { + struct pvclock_vsyscall_time_info *pvti = + pvclock_pvti_cpu0_va(); + if (pvti) { + ret = vm_insert_pfn( + vma, + (unsigned long)vmf->virtual_address, + __pa(pvti) >> PAGE_SHIFT); + } + } + + if (ret == 0 || ret == -EBUSY) + return VM_FAULT_NOPAGE; + + return VM_FAULT_SIGBUS; +} + static int map_vdso(const struct vdso_image *image, bool calculate_addr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long addr, text_start; int ret = 0; - static struct page *no_pages[] = {NULL}; - static struct vm_special_mapping vvar_mapping = { + static const struct vm_special_mapping vvar_mapping = { .name = "[vvar]", - .pages = no_pages, + .fault = vvar_fault, }; - struct pvclock_vsyscall_time_info *pvti; if (calculate_addr) { addr = vdso_addr(current->mm->start_stack, @@ -153,7 +204,8 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) vma = _install_special_mapping(mm, addr, -image->sym_vvar_start, - VM_READ|VM_MAYREAD, + VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| + VM_PFNMAP, &vvar_mapping); if (IS_ERR(vma)) { @@ -161,41 +213,6 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) goto up_fail; } - if (image->sym_vvar_page) - ret = remap_pfn_range(vma, - text_start + image->sym_vvar_page, - __pa_symbol(&__vvar_page) >> PAGE_SHIFT, - PAGE_SIZE, - PAGE_READONLY); - - if (ret) - goto up_fail; - -#ifdef CONFIG_HPET_TIMER - if (hpet_address && image->sym_hpet_page) { - ret = io_remap_pfn_range(vma, - text_start + image->sym_hpet_page, - hpet_address >> PAGE_SHIFT, - PAGE_SIZE, - pgprot_noncached(PAGE_READONLY)); - - if (ret) - goto up_fail; - } -#endif - - pvti = pvclock_pvti_cpu0_va(); - if (pvti && image->sym_pvclock_page) { - ret = remap_pfn_range(vma, - text_start + image->sym_pvclock_page, - __pa(pvti) >> PAGE_SHIFT, - PAGE_SIZE, - PAGE_READONLY); - - if (ret) - goto up_fail; - } - up_fail: if (ret) 
current->mm->context.vdso = NULL; -- cgit v0.10.2 From bd902c536298830e4d126dcf6491b46d3f1bf96e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:24 -0800 Subject: x86/vdso: Disallow vvar access to vclock IO for never-used vclocks It makes me uncomfortable that even modern systems grant every process direct read access to the HPET. While fixing this for real without regressing anything is a mess (unmapping the HPET is tricky because we don't adequately track all the mappings), we can do almost as well by tracking which vclocks have ever been used and only allowing pages associated with used vclocks to be faulted in. This will cause rogue programs that try to peek at the HPET to get SIGBUS instead on most systems. We can't restrict faults to vclock pages that are associated with the currently selected vclock due to a race: a process could start to access the HPET for the first time and race against a switch away from the HPET as the current clocksource. We can't segfault the process trying to peek at the HPET in this case, even though the process isn't going to do anything useful with the data. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e79d06295625c02512277737ab55085a498ac5d8.1451446564.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 4b5461b..7c912fe 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -130,7 +130,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, __pa_symbol(&__vvar_page) >> PAGE_SHIFT); } else if (sym_offset == image->sym_hpet_page) { #ifdef CONFIG_HPET_TIMER - if (hpet_address) { + if (hpet_address && vclock_was_used(VCLOCK_HPET)) { ret = vm_insert_pfn_prot( vma, (unsigned long)vmf->virtual_address, @@ -141,7 +141,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = pvclock_pvti_cpu0_va(); - if (pvti) { + if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { ret = vm_insert_pfn( vma, (unsigned long)vmf->virtual_address, diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c index 51e3304..0fb3a10 100644 --- a/arch/x86/entry/vsyscall/vsyscall_gtod.c +++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c @@ -16,6 +16,8 @@ #include #include +int vclocks_used __read_mostly; + DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); void update_vsyscall_tz(void) @@ -26,12 +28,17 @@ void update_vsyscall_tz(void) void update_vsyscall(struct timekeeper *tk) { + int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; + /* Mark the new vclock used. 
*/ + BUILD_BUG_ON(VCLOCK_MAX >= 32); + WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode)); + gtod_write_begin(vdata); /* copy vsyscall data */ - vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->vclock_mode = vclock_mode; vdata->cycle_last = tk->tkr_mono.cycle_last; vdata->mask = tk->tkr_mono.mask; vdata->mult = tk->tkr_mono.mult; diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index eda81dc..d194266 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -3,10 +3,11 @@ #ifndef _ASM_X86_CLOCKSOURCE_H #define _ASM_X86_CLOCKSOURCE_H -#define VCLOCK_NONE 0 /* No vDSO clock available. */ -#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ -#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ -#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */ +#define VCLOCK_NONE 0 /* No vDSO clock available. */ +#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ +#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ +#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */ +#define VCLOCK_MAX 3 struct arch_clocksource_data { int vclock_mode; diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index f556c48..e728699 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -37,6 +37,12 @@ struct vsyscall_gtod_data { }; extern struct vsyscall_gtod_data vsyscall_gtod_data; +extern int vclocks_used; +static inline bool vclock_was_used(int vclock) +{ + return READ_ONCE(vclocks_used) & (1 << vclock); +} + static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s) { unsigned ret; -- cgit v0.10.2 From 2024315124b43bb8b25a119ec6c614f0647dcc6d Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Mon, 18 Jan 2016 20:13:14 +0600 Subject: x86/asm/entry: Remove unused SAVE_ALL/RESTORE_ALL macros for !CONFIG_x86_64 SAVE_ALL and RESTORE_ALL macros for !CONFIG_X86_64 were introduced in commit: 1a338ac32 commit ('sched, x86: Optimize the preempt_schedule() call') ... and were used in the ___preempt_schedule() and ___preempt_schedule_context() functions from the arch/x86/kernel/preempt.S. But the arch/x86/kernel/preempt.S file was removed in the following commit: 0ad6e3c5 commit ('x86: Speed up ___preempt_schedule*() by using THUNK helpers') The ___preempt_schedule()/___preempt_schedule_context() functions were reimplemeted and do not use SAVE_ALL/RESTORE_ALL anymore. These macros have no users anymore, so we can remove them. Signed-off-by: Alexander Kuleshov Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453126394-13717-1-git-send-email-kuleshovmail@gmail.com [ Improved the changelog. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index e32206e..9a9e588 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -201,37 +201,6 @@ For 32-bit we have the following conventions - kernel is built with .byte 0xf1 .endm -#else /* CONFIG_X86_64 */ - -/* - * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These - * are different from the entry_32.S versions in not changing the segment - * registers. So only suitable for in kernel use, not when transitioning - * from or to user space. The resulting stack frame is not a standard - * pt_regs frame. 
The main use case is calling C code from assembler - * when all the registers need to be preserved. - */ - - .macro SAVE_ALL - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - .endm - - .macro RESTORE_ALL - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax - .endm - #endif /* CONFIG_X86_64 */ /* -- cgit v0.10.2 From a1ff5726081858a9ad98934eff7af6616c576875 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 16 Jan 2016 10:58:12 +0100 Subject: x86/cpufeature: Add AMD AVIC bit CPUID Fn8000_000A_EDX[13] denotes support for AMD's Virtual Interrupt controller, i.e., APIC virtualization. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: David Kaplan Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Joerg Roedel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tom Lendacky Link: http://lkml.kernel.org/r/1452938292-12327-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 7ad8c94..bbf166e 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -268,6 +268,7 @@ #define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ #define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ +#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ /* * BUG word(s) -- cgit v0.10.2 From 43c75f933be26422f166d6d869a19997312f4732 Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Mon, 30 Nov 2015 10:47:43 -0600 Subject: x86/mm: Streamline and restore probe_memory_block_size() The cumulative effect of the following two commits: bdee237c0343 ("x86: mm: Use 2GB memory block size on large-memory x86-64 systems") 982792c782ef ("x86, mm: probe memory block size for generic x86 64bit") ... is some pretty convoluted code. The first commit also removed code for the UV case without stated reason, which might lead to unexpected change in behavior. This commit has no other (intended) functional change; just seeks to simplify and make the code more understandable, beyond restoring the UV behavior. The whole section with the "tail size" doesn't seem to be reachable, since both the >= 64GB and < 64GB case return, so it was removed. Signed-off-by: Seth Jennings Cc: Daniel J Blueman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yinghai Lu Link: http://lkml.kernel.org/r/1448902063-18885-1-git-send-email-sjennings@variantweb.net [ Rewrote the title and changelog. 
] Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 8829482..8f18fec 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include "mm_internal.h" @@ -1193,26 +1194,13 @@ int kern_addr_valid(unsigned long addr) static unsigned long probe_memory_block_size(void) { - /* start from 2g */ - unsigned long bz = 1UL<<31; + unsigned long bz = MIN_MEMORY_BLOCK_SIZE; - if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) { - pr_info("Using 2GB memory block size for large-memory system\n"); - return 2UL * 1024 * 1024 * 1024; - } - - /* less than 64g installed */ - if ((max_pfn << PAGE_SHIFT) < (16UL << 32)) - return MIN_MEMORY_BLOCK_SIZE; - - /* get the tail size */ - while (bz > MIN_MEMORY_BLOCK_SIZE) { - if (!((max_pfn << PAGE_SHIFT) & (bz - 1))) - break; - bz >>= 1; - } + /* if system is UV or has 64GB of RAM or more, use large blocks */ + if (is_uv_system() || ((max_pfn << PAGE_SHIFT) >= (64UL << 30))) + bz = 2UL << 30; /* 2GB */ - printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20); + pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20); return bz; } -- cgit v0.10.2 From 95d97adb2bb85d964bae4538e0574e742e522dda Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Thu, 17 Dec 2015 23:56:52 +0000 Subject: x86/signal: Cleanup get_nr_restart_syscall() Check for TS_COMPAT instead of TIF_IA32 to distinguish ia32 tasks from 64-bit tasks. Check for __X32_SYSCALL_BIT iff CONFIG_X86_X32_ABI is defined. Suggested-by: Andy Lutomirski Signed-off-by: Dmitry V. Levin Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Elvira Khabirova Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160111145515.GB29007@altlinux.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index cb6282c..c07ff5d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -692,12 +692,15 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { -#if defined(CONFIG_X86_32) || !defined(CONFIG_X86_64) +#ifdef CONFIG_X86_64 + if (is_ia32_task()) + return __NR_ia32_restart_syscall; +#endif +#ifdef CONFIG_X86_X32_ABI + return __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); +#else return __NR_restart_syscall; -#else /* !CONFIG_X86_32 && CONFIG_X86_64 */ - return test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : - __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); -#endif /* CONFIG_X86_32 || !CONFIG_X86_64 */ +#endif } /* -- cgit v0.10.2 From 997963edd912a6d77d68b2bbc19f40ce8facabd7 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 18 Dec 2015 06:39:18 -0600 Subject: x86/asm: Clean up frame pointer macros The asm macros for setting up and restoring the frame pointer aren't currently being used. However, they will be needed soon to help asm functions to comply with stacktool. Rename FRAME/ENDFRAME to FRAME_BEGIN/FRAME_END for more symmetry. Also make the code more readable and improve the comments. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3f488a8e3bfc8ac7d4d3d350953e664e7182b044.1450442274.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index 793179c..cec213b 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -1,23 +1,34 @@ +#ifndef _ASM_X86_FRAME_H +#define _ASM_X86_FRAME_H + #ifdef __ASSEMBLY__ #include -/* The annotation hides the frame from the unwinder and makes it look - like a ordinary ebp save/restore. This avoids some special cases for - frame pointer later */ +/* + * These are stack frame creation macros. They should be used by every + * callable non-leaf asm function to make kernel stack traces more reliable. + */ #ifdef CONFIG_FRAME_POINTER - .macro FRAME - __ASM_SIZE(push,) %__ASM_REG(bp) - __ASM_SIZE(mov) %__ASM_REG(sp), %__ASM_REG(bp) - .endm - .macro ENDFRAME - __ASM_SIZE(pop,) %__ASM_REG(bp) - .endm -#else - .macro FRAME - .endm - .macro ENDFRAME - .endm -#endif + +.macro FRAME_BEGIN + push %_ASM_BP + _ASM_MOV %_ASM_SP, %_ASM_BP +.endm + +.macro FRAME_END + pop %_ASM_BP +.endm + +#define FRAME_OFFSET __ASM_SEL(4, 8) + +#else /* !CONFIG_FRAME_POINTER */ + +#define FRAME_BEGIN +#define FRAME_END +#define FRAME_OFFSET 0 + +#endif /* CONFIG_FRAME_POINTER */ #endif /* __ASSEMBLY__ */ +#endif /* _ASM_X86_FRAME_H */ -- cgit v0.10.2 From ec5186557abbe711dfd34e1863735dfecb0602cc Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 18 Dec 2015 06:39:19 -0600 Subject: x86/asm: Add C versions of frame pointer macros Add C versions of the frame pointer macros which can be used to create a stack frame in inline assembly. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Bernd Petrovitsch Cc: Borislav Petkov Cc: Brian Gerst Cc: Chris J Arges Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Slaby Cc: Linus Torvalds Cc: Michal Marek Cc: Namhyung Kim Cc: Pedro Alves Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f6786a282bf232ede3e2866414eae3cf02c7d662.1450442274.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index cec213b..6e4d170 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -1,16 +1,17 @@ #ifndef _ASM_X86_FRAME_H #define _ASM_X86_FRAME_H -#ifdef __ASSEMBLY__ - #include /* * These are stack frame creation macros. They should be used by every * callable non-leaf asm function to make kernel stack traces more reliable. 
*/ + #ifdef CONFIG_FRAME_POINTER +#ifdef __ASSEMBLY__ + .macro FRAME_BEGIN push %_ASM_BP _ASM_MOV %_ASM_SP, %_ASM_BP @@ -20,6 +21,16 @@ pop %_ASM_BP .endm +#else /* !__ASSEMBLY__ */ + +#define FRAME_BEGIN \ + "push %" _ASM_BP "\n" \ + _ASM_MOV "%" _ASM_SP ", %" _ASM_BP "\n" + +#define FRAME_END "pop %" _ASM_BP "\n" + +#endif /* __ASSEMBLY__ */ + #define FRAME_OFFSET __ASM_SEL(4, 8) #else /* !CONFIG_FRAME_POINTER */ @@ -30,5 +41,4 @@ #endif /* CONFIG_FRAME_POINTER */ -#endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_FRAME_H */ -- cgit v0.10.2 From 320d25b6a05f8b73c23fc21025d2906ecdd2d4fc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 19 Jan 2016 13:38:58 -0800 Subject: x86/mm/32: Set NX in __supported_pte_mask before enabling paging There's a short window in which very early mappings can end up with NX clear because they are created before we've noticed that we have NX. It turns out that we detect NX very early, so there's no need to defer __supported_pte_mask setup. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Pavel Machek Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2b544627345f7110160545a3f47031eb45c3ad4f.1453239349.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 6bc9ae2..57fc3f8 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -389,6 +389,12 @@ default_entry: /* Make changes effective */ wrmsr + /* + * And make sure that all the mappings we set up have NX set from + * the beginning. + */ + orl $(1 << (_PAGE_BIT_NX - 32)), pa(__supported_pte_mask + 4) + enable_paging: /* diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 92e2eac..78f5d59 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -31,9 +31,8 @@ early_param("noexec", noexec_setup); void x86_configure_nx(void) { - if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx) - __supported_pte_mask |= _PAGE_NX; - else + /* If disable_nx is set, clear NX on all new mappings going forward. */ + if (disable_nx) __supported_pte_mask &= ~_PAGE_NX; } -- cgit v0.10.2 From 7c360572b430a0e9757bafc0c20f26c920f2a07f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 19 Jan 2016 13:38:59 -0800 Subject: x86/mm: Make kmap_prot into a #define The value (once we initialize it) is a foregone conclusion. Make it a #define to save a tiny amount of text and data size and to make it more comprehensible. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Pavel Machek Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/0850eb0213de9da88544ff7fae72dc6d06d2b441.1453239349.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 6d7d0e5..8554f96 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -138,7 +138,7 @@ extern void reserve_top_address(unsigned long reserve); extern int fixmaps_set; extern pte_t *kmap_pte; -extern pgprot_t kmap_prot; +#define kmap_prot PAGE_KERNEL extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cb4ef3d..a4bb1c7 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -388,7 +388,6 @@ repeat: } pte_t *kmap_pte; -pgprot_t kmap_prot; static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) { @@ -405,8 +404,6 @@ static void __init kmap_init(void) */ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); kmap_pte = kmap_get_fixmap_pte(kmap_vstart); - - kmap_prot = PAGE_KERNEL; } #ifdef CONFIG_HIGHMEM -- cgit v0.10.2 From c10910c323bb9e7fc53ba25c83d1adeb9fb20878 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 21 Jan 2016 15:51:17 -0800 Subject: sparc: Hook up copy_file_range syscall. Signed-off-by: David S. Miller diff --git a/arch/sparc/include/uapi/asm/unistd.h b/arch/sparc/include/uapi/asm/unistd.h index 1c26d44..b6de8b1 100644 --- a/arch/sparc/include/uapi/asm/unistd.h +++ b/arch/sparc/include/uapi/asm/unistd.h @@ -422,8 +422,9 @@ #define __NR_listen 354 #define __NR_setsockopt 355 #define __NR_mlock2 356 +#define __NR_copy_file_range 357 -#define NR_syscalls 357 +#define NR_syscalls 358 /* Bitmask values returned from kern_features system call. 
*/ #define KERN_FEATURE_MIXED_MODE_STACK 0x00000001 diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S index e663b6c..6c3dd6c 100644 --- a/arch/sparc/kernel/systbls_32.S +++ b/arch/sparc/kernel/systbls_32.S @@ -88,4 +88,4 @@ sys_call_table: /*340*/ .long sys_ni_syscall, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr /*345*/ .long sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf /*350*/ .long sys_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen -/*355*/ .long sys_setsockopt, sys_mlock2 +/*355*/ .long sys_setsockopt, sys_mlock2, sys_copy_file_range diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S index 1557121..12b524c 100644 --- a/arch/sparc/kernel/systbls_64.S +++ b/arch/sparc/kernel/systbls_64.S @@ -89,7 +89,7 @@ sys_call_table32: /*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr .word sys32_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf /*350*/ .word sys32_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen - .word compat_sys_setsockopt, sys_mlock2 + .word compat_sys_setsockopt, sys_mlock2, sys_copy_file_range #endif /* CONFIG_COMPAT */ @@ -170,4 +170,4 @@ sys_call_table: /*340*/ .word sys_kern_features, sys_kcmp, sys_finit_module, sys_sched_setattr, sys_sched_getattr .word sys_renameat2, sys_seccomp, sys_getrandom, sys_memfd_create, sys_bpf /*350*/ .word sys64_execveat, sys_membarrier, sys_userfaultfd, sys_bind, sys_listen - .word sys_setsockopt, sys_mlock2 + .word sys_setsockopt, sys_mlock2, sys_copy_file_range -- cgit v0.10.2 From 1a40b95374f680625318ab61d81958e949e0afe3 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Mon, 18 Jan 2016 06:32:30 -0500 Subject: sparc: Fix system call tracing register handling. A system call trace trigger on entry allows the tracing process to inspect and potentially change the traced process's registers. Account for that by reloading the %g1 (syscall number) and %i0-%i5 (syscall argument) values. We need to be careful to revalidate the range of %g1, and reload the system call table entry it corresponds to into %l7. Reported-by: Mike Frysinger Signed-off-by: David S. Miller Tested-by: Mike Frysinger diff --git a/arch/sparc/kernel/entry.S b/arch/sparc/kernel/entry.S index 33c02b1..a83707c 100644 --- a/arch/sparc/kernel/entry.S +++ b/arch/sparc/kernel/entry.S @@ -948,7 +948,24 @@ linux_syscall_trace: cmp %o0, 0 bne 3f mov -ENOSYS, %o0 + + /* Syscall tracing can modify the registers. */ + ld [%sp + STACKFRAME_SZ + PT_G1], %g1 + sethi %hi(sys_call_table), %l7 + ld [%sp + STACKFRAME_SZ + PT_I0], %i0 + or %l7, %lo(sys_call_table), %l7 + ld [%sp + STACKFRAME_SZ + PT_I1], %i1 + ld [%sp + STACKFRAME_SZ + PT_I2], %i2 + ld [%sp + STACKFRAME_SZ + PT_I3], %i3 + ld [%sp + STACKFRAME_SZ + PT_I4], %i4 + ld [%sp + STACKFRAME_SZ + PT_I5], %i5 + cmp %g1, NR_syscalls + bgeu 3f + mov -ENOSYS, %o0 + + sll %g1, 2, %l4 mov %i0, %o0 + ld [%l7 + %l4], %l7 mov %i1, %o1 mov %i2, %o2 mov %i3, %o3 diff --git a/arch/sparc/kernel/syscalls.S b/arch/sparc/kernel/syscalls.S index bb00089..c4a1b5c 100644 --- a/arch/sparc/kernel/syscalls.S +++ b/arch/sparc/kernel/syscalls.S @@ -158,7 +158,25 @@ linux_syscall_trace32: add %sp, PTREGS_OFF, %o0 brnz,pn %o0, 3f mov -ENOSYS, %o0 + + /* Syscall tracing can modify the registers. 
*/ + ldx [%sp + PTREGS_OFF + PT_V9_G1], %g1 + sethi %hi(sys_call_table32), %l7 + ldx [%sp + PTREGS_OFF + PT_V9_I0], %i0 + or %l7, %lo(sys_call_table32), %l7 + ldx [%sp + PTREGS_OFF + PT_V9_I1], %i1 + ldx [%sp + PTREGS_OFF + PT_V9_I2], %i2 + ldx [%sp + PTREGS_OFF + PT_V9_I3], %i3 + ldx [%sp + PTREGS_OFF + PT_V9_I4], %i4 + ldx [%sp + PTREGS_OFF + PT_V9_I5], %i5 + + cmp %g1, NR_syscalls + bgeu,pn %xcc, 3f + mov -ENOSYS, %o0 + + sll %g1, 2, %l4 srl %i0, 0, %o0 + lduw [%l7 + %l4], %l7 srl %i4, 0, %o4 srl %i1, 0, %o1 srl %i2, 0, %o2 @@ -170,7 +188,25 @@ linux_syscall_trace: add %sp, PTREGS_OFF, %o0 brnz,pn %o0, 3f mov -ENOSYS, %o0 + + /* Syscall tracing can modify the registers. */ + ldx [%sp + PTREGS_OFF + PT_V9_G1], %g1 + sethi %hi(sys_call_table64), %l7 + ldx [%sp + PTREGS_OFF + PT_V9_I0], %i0 + or %l7, %lo(sys_call_table64), %l7 + ldx [%sp + PTREGS_OFF + PT_V9_I1], %i1 + ldx [%sp + PTREGS_OFF + PT_V9_I2], %i2 + ldx [%sp + PTREGS_OFF + PT_V9_I3], %i3 + ldx [%sp + PTREGS_OFF + PT_V9_I4], %i4 + ldx [%sp + PTREGS_OFF + PT_V9_I5], %i5 + + cmp %g1, NR_syscalls + bgeu,pn %xcc, 3f + mov -ENOSYS, %o0 + + sll %g1, 2, %l4 mov %i0, %o0 + lduw [%l7 + %l4], %l7 mov %i1, %o1 mov %i2, %o2 mov %i3, %o3 -- cgit v0.10.2 From f5a5386bcaee222d2565d87ab33f0896f91a11ef Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 6 Dec 2015 20:18:37 -0800 Subject: rcutorture: Add checks for rcutorture writer starvation This commit adds checks for rcutorture writer starvation, so that instances will be added to the test summary. Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index 844787a..0f1520f 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -33,7 +33,7 @@ if grep -Pq '\x00' < $file then print_warning Console output contains nul bytes, old qemu still running? fi -egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags +egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|Stall ended before state dump start|\?\?\? Writer stall state' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags if test -s $1.diags then print_warning Assertion failure in $file $title @@ -64,7 +64,7 @@ then then summary="$summary lockdep: $n_badness" fi - n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|Stall ended before state dump start' $1` + n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|Stall ended before state dump start|\?\?\? Writer stall state' $1` if test "$n_stalls" -ne 0 then summary="$summary Stalls: $n_stalls" -- cgit v0.10.2 From fc91cbab11d43a7d241297886665b9a5f94a1e47 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 6 Dec 2015 20:20:14 -0800 Subject: rcutorture: Don't keep empty console.log.diags files Currently, an error-free run produces an empty console.log.diags file. This can be annoying when using "vi */console.log.diags" to see a full summary of the errors. This commit therefore removes any empty files during the analysis process. Signed-off-by: Paul E. 
McKenney diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index 0f1520f..c6b0e46 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -70,4 +70,6 @@ then summary="$summary Stalls: $n_stalls" fi print_warning Summary: $summary +else + rm $1.diags fi -- cgit v0.10.2 From 5d43edb40779b3d88e9c8006b281a4c828431193 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 Dec 2015 16:10:23 -0800 Subject: rcutorture: Check for self-detected stalls The current scripts parse console output only for cases where one CPU detect a stall on some other CPU or task. This commit therefore adds checks for self-detected stalls. Signed-off-by: Paul E. McKenney diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index c6b0e46..5eb49b7 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -33,7 +33,7 @@ if grep -Pq '\x00' < $file then print_warning Console output contains nul bytes, old qemu still running? fi -egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|Stall ended before state dump start|\?\?\? Writer stall state' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags +egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags if test -s $1.diags then print_warning Assertion failure in $file $title @@ -64,7 +64,7 @@ then then summary="$summary lockdep: $n_badness" fi - n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|Stall ended before state dump start|\?\?\? Writer stall state' $1` + n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state' $1` if test "$n_stalls" -ne 0 then summary="$summary Stalls: $n_stalls" -- cgit v0.10.2 From d85ce830eef6c10d1e9617172dea4681f02b8424 Mon Sep 17 00:00:00 2001 From: Markus Trippelsdorf Date: Mon, 14 Dec 2015 16:44:40 +0100 Subject: perf pmu: Fix misleadingly indented assignment (whitespace) One line in perf_pmu__parse_unit() is indented wrongly, leading to a warning (=> error) from gcc 6: util/pmu.c:156:3: error: statement is indented as if it were guarded by... 
[-Werror=misleading-indentation] sret = read(fd, alias->unit, UNIT_MAX_LEN); ^~~~ util/pmu.c:153:2: note: ...this 'if' clause, but it is not if (fd == -1) ^~ Signed-off-by: Markus Trippelsdorf Acked-by: Ingo Molnar Cc: Ben Hutchings Cc: Matt Fleming Cc: Peter Zijlstra Fixes: 410136f5dd96 ("tools/perf/stat: Add event unit and scale support") Link: http://lkml.kernel.org/r/20151214154440.GC1409@x4 Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index b597bcc..41a9c87 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -153,7 +153,7 @@ static int perf_pmu__parse_unit(struct perf_pmu_alias *alias, char *dir, char *n if (fd == -1) return -1; - sret = read(fd, alias->unit, UNIT_MAX_LEN); + sret = read(fd, alias->unit, UNIT_MAX_LEN); if (sret < 0) goto error; -- cgit v0.10.2 From 403567217d3fa5d4801f820317ada52e5c5f0e53 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 20 Jan 2016 12:56:32 +0100 Subject: perf symbols: Do not read symbols/data from device files With mem sampling we could get a data source within a mapped device file. Processing such a sample would block during the report phase while trying to read the device file. Check for device files and skip the processing if one is detected. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453290995-18485-2-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index e8e9a9d..8e639543 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -52,6 +52,11 @@ int dso__read_binary_type_filename(const struct dso *dso, debuglink--; if (*debuglink == '/') debuglink++; + + ret = -1; + if (!is_regular_file(filename)) + break; + ret = filename__read_debuglink(filename, debuglink, size - (debuglink - filename)); } diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index ab02209..90cedfa 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1466,7 +1466,8 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter) * Read the build id if possible. 
This is required for * DSO_BINARY_TYPE__BUILDID_DEBUGINFO to work */ - if (filename__read_build_id(dso->long_name, build_id, BUILD_ID_SIZE) > 0) + if (is_regular_file(name) && + filename__read_build_id(dso->long_name, build_id, BUILD_ID_SIZE) > 0) dso__set_build_id(dso, build_id); /* @@ -1487,6 +1488,9 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter) root_dir, name, PATH_MAX)) continue; + if (!is_regular_file(name)) + continue; + /* Name is now the name of the next image to try */ if (symsrc__init(ss, dso, name, symtab_type) < 0) continue; diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index ead9509..7a2da7e 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -691,3 +691,13 @@ out: return tip; } + +bool is_regular_file(const char *file) +{ + struct stat st; + + if (stat(file, &st)) + return false; + + return S_ISREG(st.st_mode); +} diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index fe915e6..61650f0 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -343,5 +343,6 @@ int fetch_kernel_version(unsigned int *puint, #define KVER_PARAM(x) KVER_VERSION(x), KVER_PATCHLEVEL(x), KVER_SUBLEVEL(x) const char *perf_tip(const char *dirpath); +bool is_regular_file(const char *file); #endif /* GIT_COMPAT_UTIL_H */ -- cgit v0.10.2 From 86a2cf3123bfec118bfb98728d88be0668779b2b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 20 Jan 2016 12:56:35 +0100 Subject: perf stat: Making several helper functions static There's no need for the following functions to be global: perf_evsel__reset_stat_priv perf_evsel__alloc_stat_priv perf_evsel__free_stat_priv perf_evsel__alloc_prev_raw_counts perf_evsel__free_prev_raw_counts perf_evsel__alloc_stats They all ended up in util/stat.c, and they no longer need to be called from outside this object. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453290995-18485-5-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 2b58edc..beeed0b 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -97,7 +97,7 @@ void perf_stat_evsel_id_init(struct perf_evsel *evsel) } } -void perf_evsel__reset_stat_priv(struct perf_evsel *evsel) +static void perf_evsel__reset_stat_priv(struct perf_evsel *evsel) { int i; struct perf_stat_evsel *ps = evsel->priv; @@ -108,7 +108,7 @@ void perf_evsel__reset_stat_priv(struct perf_evsel *evsel) perf_stat_evsel_id_init(evsel); } -int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel) +static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel) { evsel->priv = zalloc(sizeof(struct perf_stat_evsel)); if (evsel->priv == NULL) @@ -117,13 +117,13 @@ int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel) return 0; } -void perf_evsel__free_stat_priv(struct perf_evsel *evsel) +static void perf_evsel__free_stat_priv(struct perf_evsel *evsel) { zfree(&evsel->priv); } -int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel, - int ncpus, int nthreads) +static int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel, + int ncpus, int nthreads) { struct perf_counts *counts; @@ -134,13 +134,13 @@ int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel, return counts ? 
0 : -ENOMEM; } -void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel) +static void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel) { perf_counts__delete(evsel->prev_raw_counts); evsel->prev_raw_counts = NULL; } -int perf_evsel__alloc_stats(struct perf_evsel *evsel, bool alloc_raw) +static int perf_evsel__alloc_stats(struct perf_evsel *evsel, bool alloc_raw) { int ncpus = perf_evsel__nr_cpus(evsel); int nthreads = thread_map__nr(evsel->threads); diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 086f4e1..2af63c9 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -74,16 +74,6 @@ void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count, void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel, double avg, int cpu, enum aggr_mode aggr); -void perf_evsel__reset_stat_priv(struct perf_evsel *evsel); -int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel); -void perf_evsel__free_stat_priv(struct perf_evsel *evsel); - -int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel, - int ncpus, int nthreads); -void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel); - -int perf_evsel__alloc_stats(struct perf_evsel *evsel, bool alloc_raw); - int perf_evlist__alloc_stats(struct perf_evlist *evlist, bool alloc_raw); void perf_evlist__free_stats(struct perf_evlist *evlist); void perf_evlist__reset_stats(struct perf_evlist *evlist); -- cgit v0.10.2 From c84a5d16711619621f368e84a179790df3377c87 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 20 Jan 2016 10:15:20 +0900 Subject: perf hists: Remove parent filter check in DSO filter function The --exclude-other option sets HIST_FILTER__PARENT bit and it's only set when a hist entry was created. DSO filters don't change this so no need to have the check in hists__filter_by_dso() IMHO. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Acked-by: Pekka Enberg Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1453252521-24398-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 68a7612..1d8c8ea 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1266,9 +1266,6 @@ void hists__filter_by_dso(struct hists *hists) for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) { struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); - if (symbol_conf.exclude_other && !h->parent) - continue; - if (hists__filter_entry_by_dso(hists, h)) continue; -- cgit v0.10.2 From 1f7c254132f098d19ff3fd452ba9f826cd85c4c0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 20 Jan 2016 10:15:21 +0900 Subject: perf hists: Cleanup filtering functions The hists__filter_by_xxx functions share same logic with different filters. Factor out the common code into the hists__filter_by_type. 
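The same factoring pattern in miniature, as an illustrative C sketch rather than perf code: struct item and its fields are invented stand-ins for a histogram entry, one generic walker takes a predicate callback, and the former per-filter functions shrink to thin wrappers.

#include <stdbool.h>
#include <stddef.h>

/* Invented stand-in for a histogram entry; only for this sketch. */
struct item {
	bool dso_ok;
	bool thread_ok;
	bool filtered;
};

typedef bool (*keep_fn_t)(const struct item *it);

/* One generic walker replaces several near-identical loops. */
static void filter_items(struct item *items, size_t n, keep_fn_t keep)
{
	for (size_t i = 0; i < n; i++)
		items[i].filtered = !keep(&items[i]);
}

/* The per-filter entry points become one-line wrappers. */
static bool keep_by_dso(const struct item *it)    { return it->dso_ok; }
static bool keep_by_thread(const struct item *it) { return it->thread_ok; }

static void filter_by_dso(struct item *items, size_t n)
{
	filter_items(items, n, keep_by_dso);
}

static void filter_by_thread(struct item *items, size_t n)
{
	filter_items(items, n, keep_by_thread);
}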
Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Acked-by: Pekka Enberg Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1453252521-24398-2-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 1d8c8ea..81ce0af 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1254,25 +1254,6 @@ static bool hists__filter_entry_by_dso(struct hists *hists, return false; } -void hists__filter_by_dso(struct hists *hists) -{ - struct rb_node *nd; - - hists->stats.nr_non_filtered_samples = 0; - - hists__reset_filter_stats(hists); - hists__reset_col_len(hists); - - for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) { - struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); - - if (hists__filter_entry_by_dso(hists, h)) - continue; - - hists__remove_entry_filter(hists, h, HIST_FILTER__DSO); - } -} - static bool hists__filter_entry_by_thread(struct hists *hists, struct hist_entry *he) { @@ -1285,25 +1266,6 @@ static bool hists__filter_entry_by_thread(struct hists *hists, return false; } -void hists__filter_by_thread(struct hists *hists) -{ - struct rb_node *nd; - - hists->stats.nr_non_filtered_samples = 0; - - hists__reset_filter_stats(hists); - hists__reset_col_len(hists); - - for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) { - struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); - - if (hists__filter_entry_by_thread(hists, h)) - continue; - - hists__remove_entry_filter(hists, h, HIST_FILTER__THREAD); - } -} - static bool hists__filter_entry_by_symbol(struct hists *hists, struct hist_entry *he) { @@ -1317,25 +1279,6 @@ static bool hists__filter_entry_by_symbol(struct hists *hists, return false; } -void hists__filter_by_symbol(struct hists *hists) -{ - struct rb_node *nd; - - hists->stats.nr_non_filtered_samples = 0; - - hists__reset_filter_stats(hists); - hists__reset_col_len(hists); - - for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) { - struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); - - if (hists__filter_entry_by_symbol(hists, h)) - continue; - - hists__remove_entry_filter(hists, h, HIST_FILTER__SYMBOL); - } -} - static bool hists__filter_entry_by_socket(struct hists *hists, struct hist_entry *he) { @@ -1348,7 +1291,9 @@ static bool hists__filter_entry_by_socket(struct hists *hists, return false; } -void hists__filter_by_socket(struct hists *hists) +typedef bool (*filter_fn_t)(struct hists *hists, struct hist_entry *he); + +static void hists__filter_by_type(struct hists *hists, int type, filter_fn_t filter) { struct rb_node *nd; @@ -1360,13 +1305,37 @@ void hists__filter_by_socket(struct hists *hists) for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) { struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); - if (hists__filter_entry_by_socket(hists, h)) + if (filter(hists, h)) continue; - hists__remove_entry_filter(hists, h, HIST_FILTER__SOCKET); + hists__remove_entry_filter(hists, h, type); } } +void hists__filter_by_thread(struct hists *hists) +{ + hists__filter_by_type(hists, HIST_FILTER__THREAD, + hists__filter_entry_by_thread); +} + +void hists__filter_by_dso(struct hists *hists) +{ + hists__filter_by_type(hists, HIST_FILTER__DSO, + hists__filter_entry_by_dso); +} + +void hists__filter_by_symbol(struct hists *hists) +{ + hists__filter_by_type(hists, HIST_FILTER__SYMBOL, + hists__filter_entry_by_symbol); +} + +void hists__filter_by_socket(struct hists 
*hists) +{ + hists__filter_by_type(hists, HIST_FILTER__SOCKET, + hists__filter_entry_by_socket); +} + void events_stats__inc(struct events_stats *stats, u32 type) { ++stats->nr_events[0]; -- cgit v0.10.2 From 3379e0c3effa87d7734fc06277a7023292aadb0c Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 19 Jan 2016 21:35:15 +0000 Subject: perf tools: Document the perf sysctls perf_event_paranoid was only documented in source code and a perf error message. Copy the documentation from the error message to Documentation/sysctl/kernel.txt. perf_cpu_time_max_percent was already documented but missing from the list at the top, so add it there. Signed-off-by: Ben Hutchings Cc: Peter Zijlstra Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/20160119213515.GG2637@decadent.org.uk [ Remove reference to external Documentation file, provide info inline, as before ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 73c6b1e..c803e73 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -58,6 +58,8 @@ show up in /proc/sys/kernel: - panic_on_stackoverflow - panic_on_unrecovered_nmi - panic_on_warn +- perf_cpu_time_max_percent +- perf_event_paranoid - pid_max - powersave-nap [ PPC only ] - printk @@ -639,6 +641,17 @@ allowed to execute. ============================================================== +perf_event_paranoid: + +Controls use of the performance events system by unprivileged +users (without CAP_SYS_ADMIN). The default value is 1. + + -1: Allow use of (almost) all events by all users +>=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK +>=1: Disallow CPU event access by users without CAP_SYS_ADMIN +>=2: Disallow kernel profiling by users without CAP_SYS_ADMIN + +============================================================== pid_max: diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index cdbaf9b..4678086 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2362,12 +2362,15 @@ int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target, case EPERM: case EACCES: return scnprintf(msg, size, - "You may not have permission to collect %sstats.\n" - "Consider tweaking /proc/sys/kernel/perf_event_paranoid:\n" - " -1 - Not paranoid at all\n" - " 0 - Disallow raw tracepoint access for unpriv\n" - " 1 - Disallow cpu events for unpriv\n" - " 2 - Disallow kernel profiling for unpriv", + "You may not have permission to collect %sstats.\n\n" + "Consider tweaking /proc/sys/kernel/perf_event_paranoid,\n" + "which controls use of the performance events system by\n" + "unprivileged users (without CAP_SYS_ADMIN).\n\n" + "The default value is 1:\n\n" + " -1: Allow use of (almost) all events by all users\n" + ">= 0: Disallow raw tracepoint access by users without CAP_IOC_LOCK\n" + ">= 1: Disallow CPU event access by users without CAP_SYS_ADMIN\n" + ">= 2: Disallow kernel profiling by users without CAP_SYS_ADMIN", target->system_wide ? "system-wide " : ""); case ENOENT: return scnprintf(msg, size, "The %s event is not supported.", -- cgit v0.10.2 From 78ce08dfbd180fb85312ee76e607a6c5fe34a06c Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Fri, 8 Jan 2016 17:16:11 +0900 Subject: perf annotate: Rename 'colors.code' to 'colors.jump_arrows' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit USe 'jump_arrows' config name instead of 'code' on 'colors' section. 
'colors.code' config is only for jump arrows on assembly code listings i.e. │ ┌──jmp 1333 │ │ xchg %ax,%ax │ │ mov %r15,%r10 │ └─→cmp %r15,%r14 But this config name seems unfit. 'jump_arrows' is more descriptive than 'code'. Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1452240971-25418-1-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index b9ca1e3..1ee488b 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -62,7 +62,7 @@ Given a $HOME/.perfconfig like this: medium = green, default normal = lightgray, default selected = white, lightgray - code = blue, default + jump_arrows = blue, default addr = magenta, default root = white, blue diff --git a/tools/perf/Documentation/perfconfig.example b/tools/perf/Documentation/perfconfig.example index 767ea24..1d8d5bc 100644 --- a/tools/perf/Documentation/perfconfig.example +++ b/tools/perf/Documentation/perfconfig.example @@ -5,7 +5,7 @@ medium = green, lightgray normal = black, lightgray selected = lightgray, magenta - code = blue, lightgray + jump_arrows = blue, lightgray addr = magenta, lightgray [tui] diff --git a/tools/perf/ui/browser.c b/tools/perf/ui/browser.c index d372021..af68a9d 100644 --- a/tools/perf/ui/browser.c +++ b/tools/perf/ui/browser.c @@ -531,8 +531,8 @@ static struct ui_browser_colorset { .bg = "yellow", }, { - .colorset = HE_COLORSET_CODE, - .name = "code", + .colorset = HE_COLORSET_JUMP_ARROWS, + .name = "jump_arrows", .fg = "blue", .bg = "default", }, diff --git a/tools/perf/ui/browser.h b/tools/perf/ui/browser.h index 01781de..be3b70e 100644 --- a/tools/perf/ui/browser.h +++ b/tools/perf/ui/browser.h @@ -7,7 +7,7 @@ #define HE_COLORSET_MEDIUM 51 #define HE_COLORSET_NORMAL 52 #define HE_COLORSET_SELECTED 53 -#define HE_COLORSET_CODE 54 +#define HE_COLORSET_JUMP_ARROWS 54 #define HE_COLORSET_ADDR 55 #define HE_COLORSET_ROOT 56 diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 718bd46..4fc208e 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -284,7 +284,7 @@ static void annotate_browser__draw_current_jump(struct ui_browser *browser) to = (u64)btarget->idx; } - ui_browser__set_color(browser, HE_COLORSET_CODE); + ui_browser__set_color(browser, HE_COLORSET_JUMP_ARROWS); __ui_browser__line_arrow(browser, pcnt_width + 2 + ab->addr_width, from, to); } -- cgit v0.10.2 From 89debf178708458ac62f5b53dfc97437009d02d3 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Fri, 8 Jan 2016 20:39:31 +0900 Subject: perf config: Document variables for 'colors' section in man page Explain 'colors' section and its variables, used for The variables for customizing the colors used in the output for the 'report', 'top' and 'annotate' in the TUI, those are: 'top', 'medium', 'normal', 'selected', 'jump_arrows', 'addr' and 'root'. 
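For reference, a $HOME/.perfconfig [colors] section that sets every variable documented here could look like the following; the values simply mirror the example configuration shown earlier in perf-config.txt and are illustrative, not a statement of the built-in defaults:

	[colors]
		top = red, default
		medium = green, default
		normal = lightgray, default
		selected = white, lightgray
		jump_arrows = blue, default
		addr = magenta, default
		root = white, blue

Any variable left out keeps the default documented in the man page text below.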
Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1452253193-30502-2-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index 1ee488b..8051782 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -98,6 +98,52 @@ Given a $HOME/.perfconfig like this: order = caller sort-key = function +Variables +~~~~~~~~~ + +colors.*:: + The variables for customizing the colors used in the output for the + 'report', 'top' and 'annotate' in the TUI. They should specify the + foreground and background colors, separated by a comma, for example: + + medium = green, lightgray + + If you want to use the color configured for you terminal, just leave it + as 'default', for example: + + medium = default, lightgray + + Available colors: + red, yellow, green, cyan, gray, black, blue, + white, default, magenta, lightgray + + colors.top:: + 'top' means a overhead percentage which is more than 5%. + And values of this variable specify percentage colors. + Basic key values are foreground-color 'red' and + background-color 'default'. + colors.medium:: + 'medium' means a overhead percentage which has more than 0.5%. + Default values are 'green' and 'default'. + colors.normal:: + 'normal' means the rest of overhead percentages + except 'top', 'medium', 'selected'. + Default values are 'lightgray' and 'default'. + colors.selected:: + This selects the colors for the current entry in a list of entries + from sub-commands (top, report, annotate). + Default values are 'black' and 'lightgray'. + colors.jump_arrows:: + Colors for jump arrows on assembly code listings + such as 'jns', 'jmp', 'jane', etc. + Default values are 'blue', 'default'. + colors.addr:: + This selects colors for addresses from 'annotate'. + Default values are 'magenta', 'default'. + colors.root:: + Colors for headers in the output of a sub-commands (top, report). + Default values are 'white', 'blue'. + SEE ALSO -------- linkperf:perf[1] -- cgit v0.10.2 From 3fa9f40718a33d27eb2f4bd36c13318a2d58839d Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Fri, 8 Jan 2016 20:39:32 +0900 Subject: perf config: Document variables for 'tui' and 'gtk' sections in man page Explain 'tui' and 'gtk' sections and these variables. 'top', 'report' and 'annotate' Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1452253193-30502-3-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index 8051782..ccbdb64 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -144,6 +144,16 @@ colors.*:: Colors for headers in the output of a sub-commands (top, report). Default values are 'white', 'blue'. +tui.*, gtk.*:: + Subcommands that can be configured here are 'top', 'report' and 'annotate'. + These values are booleans, for example: + + [tui] + top = true + + will make the TUI be the default for the 'top' subcommand. Those will be + available if the required libs were detected at tool build time. + SEE ALSO -------- linkperf:perf[1] -- cgit v0.10.2 From 2733525b8c1a5f9b6e55338d836b835c9c698913 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Fri, 8 Jan 2016 20:39:33 +0900 Subject: perf config: Document 'buildid.dir' variable in man page Explain 'buildid.dir' variable. 
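As a quick usage sketch for the variable documented below (the alternate path is hypothetical):

	[buildid]
		dir = /var/cache/perf-buildid

relocates the per-user cache directory, while

	[buildid]
		dir = /dev/null

disables the cache altogether, as the added documentation text explains.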
Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1452253193-30502-4-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index ccbdb64..a095f0c 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -154,6 +154,21 @@ tui.*, gtk.*:: will make the TUI be the default for the 'top' subcommand. Those will be available if the required libs were detected at tool build time. +buildid.*:: + buildid.dir:: + Each executable and shared library in modern distributions comes with a + content based identifier that, if available, will be inserted in a + 'perf.data' file header to, at analysis time find what is needed to do + symbol resolution, code annotation, etc. + + The recording tools also stores a hard link or copy in a per-user + directory, $HOME/.debug/, of binaries, shared libraries, /proc/kallsyms + and /proc/kcore files to be used at analysis time. + + The buildid.dir variable can be used to either change this directory + cache location, or to disable it altogether. If you want to disable it, + set buildid.dir to /dev/null. The default is $HOME/.debug + SEE ALSO -------- linkperf:perf[1] -- cgit v0.10.2 From 3b97629d139b19cbcd15b0b3c128a4d6587d2091 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Fri, 8 Jan 2016 20:39:34 +0900 Subject: perf config: Document variables for 'annotate' section in man page Explain 'annotate' section and its variables. 'hide_src_code', 'use_offset', 'jump_arrows', 'show_linenr', 'show_nr_jump' and 'show_total_period'. Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1452253193-30502-5-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index a095f0c..cb7ca50 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -169,6 +169,116 @@ buildid.*:: cache location, or to disable it altogether. If you want to disable it, set buildid.dir to /dev/null. The default is $HOME/.debug +annotate.*:: + These options work only for TUI. + These are in control of addresses, jump function, source code + in lines of assembly code from a specific program. + + annotate.hide_src_code:: + If a program which is analyzed has source code, + this option lets 'annotate' print a list of assembly code with the source code. + For example, let's see a part of a program. There're four lines. + If this option is 'true', they can be printed + without source code from a program as below. + + │ push %rbp + │ mov %rsp,%rbp + │ sub $0x10,%rsp + │ mov (%rdi),%rdx + + But if this option is 'false', source code of the part + can be also printed as below. Default is 'false'. + + │ struct rb_node *rb_next(const struct rb_node *node) + │ { + │ push %rbp + │ mov %rsp,%rbp + │ sub $0x10,%rsp + │ struct rb_node *parent; + │ + │ if (RB_EMPTY_NODE(node)) + │ mov (%rdi),%rdx + │ return n; + + annotate.use_offset:: + Basing on a first address of a loaded function, offset can be used. + Instead of using original addresses of assembly code, + addresses subtracted from a base address can be printed. + Let's illustrate an example. 
+ If a base address is 0XFFFFFFFF81624d50 as below, + + ffffffff81624d50 + + an address on assembly code has a specific absolute address as below + + ffffffff816250b8:│ mov 0x8(%r14),%rdi + + but if use_offset is 'true', an address subtracted from a base address is printed. + Default is true. This option is only applied to TUI. + + 368:│ mov 0x8(%r14),%rdi + + annotate.jump_arrows:: + There can be jump instruction among assembly code. + Depending on a boolean value of jump_arrows, + arrows can be printed or not which represent + where do the instruction jump into as below. + + │ ┌──jmp 1333 + │ │ xchg %ax,%ax + │1330:│ mov %r15,%r10 + │1333:└─→cmp %r15,%r14 + + If jump_arrow is 'false', the arrows isn't printed as below. + Default is 'false'. + + │ ↓ jmp 1333 + │ xchg %ax,%ax + │1330: mov %r15,%r10 + │1333: cmp %r15,%r14 + + annotate.show_linenr:: + When showing source code if this option is 'true', + line numbers are printed as below. + + │1628 if (type & PERF_SAMPLE_IDENTIFIER) { + │ ↓ jne 508 + │1628 data->id = *array; + │1629 array++; + │1630 } + + However if this option is 'false', they aren't printed as below. + Default is 'false'. + + │ if (type & PERF_SAMPLE_IDENTIFIER) { + │ ↓ jne 508 + │ data->id = *array; + │ array++; + │ } + + annotate.show_nr_jumps:: + Let's see a part of assembly code. + + │1382: movb $0x1,-0x270(%rbp) + + If use this, the number of branches jumping to that address can be printed as below. + Default is 'false'. + + │1 1382: movb $0x1,-0x270(%rbp) + + annotate.show_total_period:: + To compare two records on an instruction base, with this option + provided, display total number of samples that belong to a line + in assembly code. If this option is 'true', total periods are printed + instead of percent values as below. + + 302 │ mov %eax,%eax + + But if this option is 'false', percent values for overhead are printed i.e. + Default is 'false'. + + 99.93 │ mov %eax,%eax + SEE ALSO -------- linkperf:perf[1] -- cgit v0.10.2 From 485311d97863f2810646e17c8075be2992225f98 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Fri, 8 Jan 2016 20:39:36 +0900 Subject: perf config: Document 'hist.percentage' variable in man page Explain 'hist.percentage' variable. Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1452253193-30502-7-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index cb7ca50..74589c6 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -279,6 +279,23 @@ annotate.*:: 99.93 │ mov %eax,%eax +hist.*:: + hist.percentage:: + This option control the way to calculate overhead of filtered entries - + that means the value of this option is effective only if there's a + filter (by comm, dso or symbol name). Suppose a following example: + + Overhead Symbols + ........ ....... + 33.33% foo + 33.33% bar + 33.33% baz + + This is an original overhead and we'll filter out the first 'foo' + entry. The value of 'relative' would increase the overhead of 'bar' + and 'baz' to 50.00% for each, while 'absolute' would show their + current overhead (33.33%). 
+ SEE ALSO -------- linkperf:perf[1] -- cgit v0.10.2 From cfd92dadc5e830268036efb25ff41618f29c3306 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 21 Jan 2016 19:13:24 -0300 Subject: perf sort: Provide a way to find out if per-thread bucketing is in place Now the UI browsers will be able to offer thread related operations only if the thread is part of the sort order in use, i.e. if hist_entry stats are all for a single thread. Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa , Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1452960197-5323-9-git-send-email-namhyung@kernel.org [ Carved out from a larger patch ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index ec72234..898e4b0 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -25,6 +25,7 @@ int sort__has_parent = 0; int sort__has_sym = 0; int sort__has_dso = 0; int sort__has_socket = 0; +int sort__has_thread = 0; enum sort_mode sort__mode = SORT_MODE__NORMAL; @@ -2136,6 +2137,8 @@ static int sort_dimension__add(const char *tok, sort__has_dso = 1; } else if (sd->entry == &sort_socket) { sort__has_socket = 1; + } else if (sd->entry == &sort_thread) { + sort__has_thread = 1; } return __sort_dimension__add(sd); diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 687bbb12..09616f0 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -35,6 +35,7 @@ extern int sort__need_collapse; extern int sort__has_parent; extern int sort__has_sym; extern int sort__has_socket; +extern int sort__has_thread; extern enum sort_mode sort__mode; extern struct sort_entry sort_comm; extern struct sort_entry sort_dso; -- cgit v0.10.2 From 2eafd410e669c744208f8110940e42caa7d79447 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 21 Jan 2016 19:13:24 -0300 Subject: perf hists browser: Only 'Zoom into thread' only when sort order has 'pid' We can't offer a zoom into thread when a bucket (struct hist_entry) may have samples for more than one thread, i.e. when 'pid' is not part of the sort order, fix it. 
Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa , Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1452960197-5323-9-git-send-email-namhyung@kernel.org [ Carved out from a larger patch ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 08c09ad..e66b3a3 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1782,7 +1782,7 @@ static int add_thread_opt(struct hist_browser *browser, struct popup_action *act, char **optstr, struct thread *thread) { - if (thread == NULL) + if (!sort__has_thread || thread == NULL) return 0; if (asprintf(optstr, "Zoom %s %s(%d) thread", @@ -2307,10 +2307,12 @@ skip_annotation: socked_id); /* perf script support */ if (browser->he_selection) { - nr_options += add_script_opt(browser, - &actions[nr_options], - &options[nr_options], - thread, NULL); + if (sort__has_thread && thread) { + nr_options += add_script_opt(browser, + &actions[nr_options], + &options[nr_options], + thread, NULL); + } /* * Note that browser->selection != NULL * when browser->he_selection is not NULL, -- cgit v0.10.2 From c221acb0f970d3b80d72c812cda19c121acf5d52 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 21 Jan 2016 19:50:09 -0300 Subject: perf hists browser: Only offer symbol scripting when a symbol is under the cursor When this feature was introduced a check was made if there was a resolved symbol under the cursor, it got lost in commit ea7cd5923309 ("perf hists browser: Split popup menu actions - part 2"), reinstate it. Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa , Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Fixes: ea7cd5923309 ("perf hists browser: Split popup menu actions - part 2") Link: http://lkml.kernel.org/r/1452960197-5323-9-git-send-email-namhyung@kernel.org [ Carved out from a larger patch ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index e66b3a3..2801d80 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -2322,10 +2322,12 @@ skip_annotation: * * See hist_browser__show_entry. */ - nr_options += add_script_opt(browser, - &actions[nr_options], - &options[nr_options], - NULL, browser->selection->sym); + if (sort__has_sym && browser->selection->sym) { + nr_options += add_script_opt(browser, + &actions[nr_options], + &options[nr_options], + NULL, browser->selection->sym); + } } nr_options += add_script_opt(browser, &actions[nr_options], &options[nr_options], NULL, NULL); -- cgit v0.10.2 From b1447a54f5b41eaf1cc469d9bd3834caa2ff9afb Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 22 Jan 2016 11:22:41 -0300 Subject: perf hists browser: Offer 'Zoom into DSO'/'Map details' only when sort order has 'dso' We can't offer a zoom into DSO when a bucket (struct hist_entry) may have samples for more than one DSO, i.e. when 'dso' is not part of the sort order, ditto for 'Map details', fix it. 
Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa , Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1452960197-5323-9-git-send-email-namhyung@kernel.org [ Carved out from a larger patch, moved check to add_{dso,map}_opt() ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 2801d80..e892106 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1825,7 +1825,7 @@ static int add_dso_opt(struct hist_browser *browser, struct popup_action *act, char **optstr, struct map *map) { - if (map == NULL) + if (!sort__has_dso || map == NULL) return 0; if (asprintf(optstr, "Zoom %s %s DSO", @@ -1850,7 +1850,7 @@ static int add_map_opt(struct hist_browser *browser __maybe_unused, struct popup_action *act, char **optstr, struct map *map) { - if (map == NULL) + if (!sort__has_dso || map == NULL) return 0; if (asprintf(optstr, "Browse map details") < 0) diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 09616f0..89a1273 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -32,6 +32,7 @@ extern const char default_sort_order[]; extern regex_t ignore_callees_regex; extern int have_ignore_callees; extern int sort__need_collapse; +extern int sort__has_dso; extern int sort__has_parent; extern int sort__has_sym; extern int sort__has_socket; -- cgit v0.10.2 From d9695d9f93649ecc00877ec2c847739c54a4cbb3 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 22 Jan 2016 12:20:18 -0300 Subject: perf hists browser: Be a bit more strict about presenting CPU socket zoom For consistency with the other sort order checks. Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa , Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1452960197-5323-9-git-send-email-namhyung@kernel.org [ Carved out from a larger patch, moved check to add_socket_opt() ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index e892106..b919582 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1971,7 +1971,7 @@ static int add_socket_opt(struct hist_browser *browser, struct popup_action *act, char **optstr, int socket_id) { - if (socket_id < 0) + if (!sort__has_socket || socket_id < 0) return 0; if (asprintf(optstr, "Zoom %s Processor Socket %d", -- cgit v0.10.2 From 4056132e1072f02bbad77f2071770271cc5b58fc Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 22 Jan 2016 12:26:06 -0300 Subject: perf hists browser: Offer non-symbol specific menu options for --sort without 'sym' Now that we check more strictly what each of the menu entries need, we can stop bailing out when 'sym' is not in the --sort order, instead we let each option be added if what it needs is present. This way, for instance, we can run scripts on all samples, see DSO map details when 'dso' is in the --sort provided, etc. 
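The net effect of this series of browser fixes is a single pattern: every popup entry guards on the sort key it actually needs instead of one global 'sym' check. Below is a rough, standalone sketch of that idea — it is not perf's real code, the struct and flag names are invented for illustration, and the real helpers (add_thread_opt() etc.) take browser/action/option arguments omitted here:

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for perf's sort__has_* globals. */
static bool sort_has_thread = true;
static bool sort_has_dso    = false;
static bool sort_has_sym    = false;

struct menu {
	const char *entries[8];
	int nr;
};

/*
 * Each helper offers its entry only when the sort key it depends on
 * is present, mirroring the checks added to add_thread_opt(),
 * add_dso_opt(), add_map_opt() and friends.
 */
static int add_opt(struct menu *m, bool available, const char *label)
{
	if (!available)
		return 0;
	m->entries[m->nr++] = label;
	return 1;
}

int main(void)
{
	struct menu m = { .nr = 0 };
	int i;

	add_opt(&m, sort_has_thread, "Zoom into thread");
	add_opt(&m, sort_has_dso,    "Zoom into DSO");
	add_opt(&m, sort_has_dso,    "Browse map details");
	add_opt(&m, sort_has_sym,    "Run scripts for this symbol");
	add_opt(&m, true,            "Exit");	/* always offered */

	for (i = 0; i < m.nr; i++)
		printf("%d: %s\n", i, m.entries[i]);
	return 0;
}

Flipping the three flags models different --sort orders: with 'pid' present the thread zoom appears, with 'dso' the map entries do, and unconditional entries such as 'Exit' or running a script on all samples are always offered.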
Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa , Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1452960197-5323-9-git-send-email-namhyung@kernel.org [ Carved out from a larger patch ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index b919582..d07e6be 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -2263,10 +2263,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, continue; } - if (!sort__has_sym) - goto add_exit_option; - - if (browser->selection == NULL) + if (!sort__has_sym || browser->selection == NULL) goto skip_annotation; if (sort__mode == SORT_MODE__BRANCH) { @@ -2333,7 +2330,6 @@ skip_annotation: &options[nr_options], NULL, NULL); nr_options += add_switch_opt(browser, &actions[nr_options], &options[nr_options]); -add_exit_option: nr_options += add_exit_opt(browser, &actions[nr_options], &options[nr_options]); -- cgit v0.10.2 From 8acd3da03c3f6e4e31472c5c73402b95a5d0f6cb Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 25 Jan 2016 18:01:57 -0300 Subject: perf machine: Introduce machine__find_kernel_symbol_by_name() To be used in the 'vmlinux matches kallsyms' 'perf test' entry. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-m56g1853lz2c6nhnqxibq4jd@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h index 2c2b443..1a3e45b 100644 --- a/tools/perf/util/machine.h +++ b/tools/perf/util/machine.h @@ -180,6 +180,16 @@ struct symbol *machine__find_kernel_symbol(struct machine *machine, } static inline +struct symbol *machine__find_kernel_symbol_by_name(struct machine *machine, + enum map_type type, const char *name, + struct map **mapp, + symbol_filter_t filter) +{ + return map_groups__find_symbol_by_name(&machine->kmaps, type, name, + mapp, filter); +} + +static inline struct symbol *machine__find_kernel_function(struct machine *machine, u64 addr, struct map **mapp, symbol_filter_t filter) -- cgit v0.10.2 From ab414dcda8fa307388c40a540b35e3c98a9da5ae Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 25 Jan 2016 18:04:47 -0300 Subject: perf test: Fixup aliases checking in the 'vmlinux matches kallsyms' test There are cases where looking at just the next and prev entries is not enough, like with: $ readelf -sW /usr/lib/debug/lib/modules/4.3.3-301.fc23.x86_64/vmlinux | grep ffffffff81065ec0 4979: ffffffff81065ec0 53 FUNC LOCAL DEFAULT 1 try_to_free_pud_page 4980: ffffffff81065ec0 53 FUNC LOCAL DEFAULT 1 try_to_free_pte_page 4981: ffffffff81065ec0 53 FUNC LOCAL DEFAULT 1 try_to_free_pmd_page So just search by name to see if the symbol is in kallsyms. 
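To make the failure mode concrete, here is a toy, self-contained model of the situation (flat arrays instead of perf's rb-trees; the three aliased names and the address come from the readelf output above, the neighbouring entries are made up):

#include <stdio.h>
#include <string.h>

struct sym { unsigned long long addr; const char *name; };

/* Three aliases at the same address, plus unrelated neighbours. */
static struct sym kallsyms[] = {
	{ 0xffffffff81065e80ULL, "some_prev_function" },
	{ 0xffffffff81065ec0ULL, "try_to_free_pud_page" },
	{ 0xffffffff81065ec0ULL, "try_to_free_pte_page" },
	{ 0xffffffff81065ec0ULL, "try_to_free_pmd_page" },
	{ 0xffffffff81065f00ULL, "some_next_function" },
};

static int find_by_name(const char *name)
{
	size_t i;

	for (i = 0; i < sizeof(kallsyms) / sizeof(kallsyms[0]); i++)
		if (!strcmp(kallsyms[i].name, name))
			return (int)i;
	return -1;
}

int main(void)
{
	/*
	 * vmlinux says the symbol at 0x...5ec0 is try_to_free_pmd_page.
	 * If the kallsyms entry found by address is try_to_free_pud_page,
	 * looking one entry back or forward never reaches it, but a
	 * lookup by name does.
	 */
	int idx = find_by_name("try_to_free_pmd_page");

	if (idx >= 0)
		printf("found %s at %#llx\n",
		       kallsyms[idx].name, kallsyms[idx].addr);
	return 0;
}

Starting from try_to_free_pud_page, a single step to the previous or next entry never lands on try_to_free_pmd_page, which is why the test now simply asks machine__find_kernel_symbol_by_name() whether kallsyms knows the name at all.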
Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-jj1vlljg7ol4i713l60rt5ai@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c index f0bfc9e..630b0b4 100644 --- a/tools/perf/tests/vmlinux-kallsyms.c +++ b/tools/perf/tests/vmlinux-kallsyms.c @@ -110,7 +110,6 @@ int test__vmlinux_matches_kallsyms(int subtest __maybe_unused) */ for (nd = rb_first(&vmlinux_map->dso->symbols[type]); nd; nd = rb_next(nd)) { struct symbol *pair, *first_pair; - bool backwards = true; sym = rb_entry(nd, struct symbol, rb_node); @@ -151,27 +150,14 @@ next_pair: continue; } else { - struct rb_node *nnd; -detour: - nnd = backwards ? rb_prev(&pair->rb_node) : - rb_next(&pair->rb_node); - if (nnd) { - struct symbol *next = rb_entry(nnd, struct symbol, rb_node); - - if (UM(next->start) == mem_start) { - pair = next; + pair = machine__find_kernel_symbol_by_name(&kallsyms, type, sym->name, NULL, NULL); + if (pair) { + if (UM(pair->start) == mem_start) goto next_pair; - } - } - if (backwards) { - backwards = false; - pair = first_pair; - goto detour; + pr_debug("%#" PRIx64 ": diff name v: %s k: %s\n", + mem_start, sym->name, pair->name); } - - pr_debug("%#" PRIx64 ": diff name v: %s k: %s\n", - mem_start, sym->name, pair->name); } } else pr_debug("%#" PRIx64 ": %s not on kallsyms\n", -- cgit v0.10.2 From 7b6982ce4b38ecc3f63be46beb7bd079aa290fd7 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 25 Jan 2016 09:55:48 +0000 Subject: perf test: Add libbpf relocation checker There's a bug in LLVM that it can generate unneeded relocation information. See [1] and [2]. Libbpf should check the target section of a relocation symbol. This patch adds a testcase which references a global variable (BPF doesn't support global variables). Before fixing libbpf, the new test case can be loaded into kernel, the global variable acts like the first map. It is incorrect. Result: # ~/perf test BPF 37: Test BPF filter : 37.1: Test basic BPF filtering : Ok 37.2: Test BPF prologue generation : Ok 37.3: Test BPF relocation checker : FAILED! # ~/perf test -v BPF ... libbpf: loading object '[bpf_relocation_test]' from buffer libbpf: section .strtab, size 126, link 0, flags 0, type=3 libbpf: section .text, size 0, link 0, flags 6, type=1 libbpf: section .data, size 0, link 0, flags 3, type=1 libbpf: section .bss, size 0, link 0, flags 3, type=8 libbpf: section func=sys_write, size 104, link 0, flags 6, type=1 libbpf: found program func=sys_write libbpf: section .relfunc=sys_write, size 16, link 10, flags 0, type=9 libbpf: section maps, size 16, link 0, flags 3, type=1 libbpf: maps in [bpf_relocation_test]: 16 bytes libbpf: section license, size 4, link 0, flags 3, type=1 libbpf: license of [bpf_relocation_test] is GPL libbpf: section version, size 4, link 0, flags 3, type=1 libbpf: kernel version of [bpf_relocation_test] is 40400 libbpf: section .symtab, size 144, link 1, flags 0, type=2 libbpf: map 0 is "my_table" libbpf: collecting relocating info for: 'func=sys_write' libbpf: relocation: insn_idx=7 Success unexpectedly: libbpf error when dealing with relocation test child finished with -1 ---- end ---- Test BPF filter subtest 2: FAILED! [1] https://llvm.org/bugs/show_bug.cgi?id=26243 [2] https://patchwork.ozlabs.org/patch/571385/ Signed-off-by: Wang Nan Tested-by: Arnaldo Carvalho de Melo Cc: "David S. 
Miller" Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: He Kuang Cc: Jiri Olsa Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Will Deacon Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1453715801-7732-2-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 5d34815..97ce869 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -618,7 +618,7 @@ clean: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32 $(call QUIET_CLEAN, core-gen) $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)FEATURE-DUMP $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex* \ $(OUTPUT)util/intel-pt-decoder/inat-tables.c $(OUTPUT)fixdep \ - $(OUTPUT)tests/llvm-src-{base,kbuild,prologue}.c + $(OUTPUT)tests/llvm-src-{base,kbuild,prologue,relocation}.c $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean $(python-clean) diff --git a/tools/perf/tests/.gitignore b/tools/perf/tests/.gitignore index bf016c4..8cc30e7 100644 --- a/tools/perf/tests/.gitignore +++ b/tools/perf/tests/.gitignore @@ -1,3 +1,4 @@ llvm-src-base.c llvm-src-kbuild.c llvm-src-prologue.c +llvm-src-relocation.c diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build index 614899b..1ba628e 100644 --- a/tools/perf/tests/Build +++ b/tools/perf/tests/Build @@ -31,7 +31,7 @@ perf-y += sample-parsing.o perf-y += parse-no-sample-id-all.o perf-y += kmod-path.o perf-y += thread-map.o -perf-y += llvm.o llvm-src-base.o llvm-src-kbuild.o llvm-src-prologue.o +perf-y += llvm.o llvm-src-base.o llvm-src-kbuild.o llvm-src-prologue.o llvm-src-relocation.o perf-y += bpf.o perf-y += topology.o perf-y += cpumap.o @@ -59,6 +59,13 @@ $(OUTPUT)tests/llvm-src-prologue.c: tests/bpf-script-test-prologue.c tests/Build $(Q)sed -e 's/"/\\"/g' -e 's/\(.*\)/"\1\\n"/g' $< >> $@ $(Q)echo ';' >> $@ +$(OUTPUT)tests/llvm-src-relocation.c: tests/bpf-script-test-relocation.c tests/Build + $(call rule_mkdir) + $(Q)echo '#include ' > $@ + $(Q)echo 'const char test_llvm__bpf_test_relocation[] =' >> $@ + $(Q)sed -e 's/"/\\"/g' -e 's/\(.*\)/"\1\\n"/g' $< >> $@ + $(Q)echo ';' >> $@ + ifeq ($(ARCH),$(filter $(ARCH),x86 arm arm64)) perf-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o endif diff --git a/tools/perf/tests/bpf-script-test-relocation.c b/tools/perf/tests/bpf-script-test-relocation.c new file mode 100644 index 0000000..93af774 --- /dev/null +++ b/tools/perf/tests/bpf-script-test-relocation.c @@ -0,0 +1,50 @@ +/* + * bpf-script-test-relocation.c + * Test BPF loader checking relocation + */ +#ifndef LINUX_VERSION_CODE +# error Need LINUX_VERSION_CODE +# error Example: for 4.2 kernel, put 'clang-opt="-DLINUX_VERSION_CODE=0x40200" into llvm section of ~/.perfconfig' +#endif +#define BPF_ANY 0 +#define BPF_MAP_TYPE_ARRAY 2 +#define BPF_FUNC_map_lookup_elem 1 +#define BPF_FUNC_map_update_elem 2 + +static void *(*bpf_map_lookup_elem)(void *map, void *key) = + (void *) BPF_FUNC_map_lookup_elem; +static void *(*bpf_map_update_elem)(void *map, void *key, void *value, int flags) = + (void *) BPF_FUNC_map_update_elem; + +struct bpf_map_def { + unsigned int type; + unsigned int key_size; + unsigned int value_size; + unsigned int max_entries; +}; + +#define SEC(NAME) __attribute__((section(NAME), used)) +struct bpf_map_def SEC("maps") my_table = { + .type = BPF_MAP_TYPE_ARRAY, + 
.key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 1, +}; + +int this_is_a_global_val; + +SEC("func=sys_write") +int bpf_func__sys_write(void *ctx) +{ + int key = 0; + int value = 0; + + /* + * Incorrect relocation. Should not allow this program be + * loaded into kernel. + */ + bpf_map_update_elem(&this_is_a_global_val, &key, &value, 0); + return 0; +} +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = LINUX_VERSION_CODE; diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index 33689a0..952ca99 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -71,6 +71,15 @@ static struct { (NR_ITERS + 1) / 4, }, #endif + { + LLVM_TESTCASE_BPF_RELOCATION, + "Test BPF relocation checker", + "[bpf_relocation_test]", + "fix 'perf test LLVM' first", + "libbpf error when dealing with relocation", + NULL, + 0, + }, }; static int do_test(struct bpf_object *obj, int (*func)(void), @@ -190,7 +199,7 @@ static int __test__bpf(int idx) ret = test_llvm__fetch_bpf_obj(&obj_buf, &obj_buf_sz, bpf_testcase_table[idx].prog_id, - true); + true, NULL); if (ret != TEST_OK || !obj_buf || !obj_buf_sz) { pr_debug("Unable to get BPF object, %s\n", bpf_testcase_table[idx].msg_compile_fail); @@ -202,14 +211,21 @@ static int __test__bpf(int idx) obj = prepare_bpf(obj_buf, obj_buf_sz, bpf_testcase_table[idx].name); - if (!obj) { + if ((!!bpf_testcase_table[idx].target_func) != (!!obj)) { + if (!obj) + pr_debug("Fail to load BPF object: %s\n", + bpf_testcase_table[idx].msg_load_fail); + else + pr_debug("Success unexpectedly: %s\n", + bpf_testcase_table[idx].msg_load_fail); ret = TEST_FAIL; goto out; } - ret = do_test(obj, - bpf_testcase_table[idx].target_func, - bpf_testcase_table[idx].expect_result); + if (obj) + ret = do_test(obj, + bpf_testcase_table[idx].target_func, + bpf_testcase_table[idx].expect_result); out: bpf__clear(); return ret; diff --git a/tools/perf/tests/llvm.c b/tools/perf/tests/llvm.c index 06f45c1..70edcdf 100644 --- a/tools/perf/tests/llvm.c +++ b/tools/perf/tests/llvm.c @@ -35,6 +35,7 @@ static int test__bpf_parsing(void *obj_buf __maybe_unused, static struct { const char *source; const char *desc; + bool should_load_fail; } bpf_source_table[__LLVM_TESTCASE_MAX] = { [LLVM_TESTCASE_BASE] = { .source = test_llvm__bpf_base_prog, @@ -48,14 +49,19 @@ static struct { .source = test_llvm__bpf_test_prologue_prog, .desc = "Compile source for BPF prologue generation test", }, + [LLVM_TESTCASE_BPF_RELOCATION] = { + .source = test_llvm__bpf_test_relocation, + .desc = "Compile source for BPF relocation test", + .should_load_fail = true, + }, }; - int test_llvm__fetch_bpf_obj(void **p_obj_buf, size_t *p_obj_buf_sz, enum test_llvm__testcase idx, - bool force) + bool force, + bool *should_load_fail) { const char *source; const char *desc; @@ -68,6 +74,8 @@ test_llvm__fetch_bpf_obj(void **p_obj_buf, source = bpf_source_table[idx].source; desc = bpf_source_table[idx].desc; + if (should_load_fail) + *should_load_fail = bpf_source_table[idx].should_load_fail; perf_config(perf_config_cb, NULL); @@ -136,14 +144,15 @@ int test__llvm(int subtest) int ret; void *obj_buf = NULL; size_t obj_buf_sz = 0; + bool should_load_fail = false; if ((subtest < 0) || (subtest >= __LLVM_TESTCASE_MAX)) return TEST_FAIL; ret = test_llvm__fetch_bpf_obj(&obj_buf, &obj_buf_sz, - subtest, false); + subtest, false, &should_load_fail); - if (ret == TEST_OK) { + if (ret == TEST_OK && !should_load_fail) { ret = test__bpf_parsing(obj_buf, obj_buf_sz); if (ret != TEST_OK) { pr_debug("Failed to 
parse test case '%s'\n", diff --git a/tools/perf/tests/llvm.h b/tools/perf/tests/llvm.h index 5150b4d..0eaa604 100644 --- a/tools/perf/tests/llvm.h +++ b/tools/perf/tests/llvm.h @@ -7,14 +7,17 @@ extern const char test_llvm__bpf_base_prog[]; extern const char test_llvm__bpf_test_kbuild_prog[]; extern const char test_llvm__bpf_test_prologue_prog[]; +extern const char test_llvm__bpf_test_relocation[]; enum test_llvm__testcase { LLVM_TESTCASE_BASE, LLVM_TESTCASE_KBUILD, LLVM_TESTCASE_BPF_PROLOGUE, + LLVM_TESTCASE_BPF_RELOCATION, __LLVM_TESTCASE_MAX, }; int test_llvm__fetch_bpf_obj(void **p_obj_buf, size_t *p_obj_buf_sz, - enum test_llvm__testcase index, bool force); + enum test_llvm__testcase index, bool force, + bool *should_load_fail); #endif -- cgit v0.10.2 From 666810e86a3b7531cce892fbeda3b2f2322e1d72 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 25 Jan 2016 09:55:49 +0000 Subject: perf bpf: Check relocation target section Libbpf should check the target section before doing relocation to ensure the relocation is correct. If not, a bug in LLVM causes an error. See [1]. Also, if an incorrect BPF script uses both global variable and map, global variable whould be treated as map and be relocated without error. This patch saves the id of the map section into obj->efile and compare target section of a relocation symbol against it during relocation. Previous patch introduces a test case about this problem. After this patch: # ~/perf test BPF 37: Test BPF filter : 37.1: Test basic BPF filtering : Ok 37.2: Test BPF prologue generation : Ok 37.3: Test BPF relocation checker : Ok # perf test -v BPF ... 37.3: Test BPF relocation checker : ... libbpf: loading object '[bpf_relocation_test]' from buffer libbpf: section .strtab, size 126, link 0, flags 0, type=3 libbpf: section .text, size 0, link 0, flags 6, type=1 libbpf: section .data, size 0, link 0, flags 3, type=1 libbpf: section .bss, size 0, link 0, flags 3, type=8 libbpf: section func=sys_write, size 104, link 0, flags 6, type=1 libbpf: found program func=sys_write libbpf: section .relfunc=sys_write, size 16, link 10, flags 0, type=9 libbpf: section maps, size 16, link 0, flags 3, type=1 libbpf: maps in [bpf_relocation_test]: 16 bytes libbpf: section license, size 4, link 0, flags 3, type=1 libbpf: license of [bpf_relocation_test] is GPL libbpf: section version, size 4, link 0, flags 3, type=1 libbpf: kernel version of [bpf_relocation_test] is 40400 libbpf: section .symtab, size 144, link 1, flags 0, type=2 libbpf: map 0 is "my_table" libbpf: collecting relocating info for: 'func=sys_write' libbpf: Program 'func=sys_write' contains non-map related relo data pointing to section 65522 bpf: failed to load buffer Compile BPF program failed. test child finished with 0 ---- end ---- Test BPF filter subtest 2: Ok [1] https://llvm.org/bugs/show_bug.cgi?id=26243 Signed-off-by: Wang Nan Acked-by: Alexei Starovoitov Tested-by: Arnaldo Carvalho de Melo Cc: "David S. 
Miller" Cc: Brendan Gregg Cc: Daniel Borkmann Cc: He Kuang Cc: Jiri Olsa Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Will Deacon Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1453715801-7732-3-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 8334a5a..7e543c3 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -201,6 +201,7 @@ struct bpf_object { Elf_Data *data; } *reloc; int nr_reloc; + int maps_shndx; } efile; /* * All loaded bpf_object is linked in a list, which is @@ -350,6 +351,7 @@ static struct bpf_object *bpf_object__new(const char *path, */ obj->efile.obj_buf = obj_buf; obj->efile.obj_buf_sz = obj_buf_sz; + obj->efile.maps_shndx = -1; obj->loaded = false; @@ -529,12 +531,12 @@ bpf_object__init_maps(struct bpf_object *obj, void *data, } static int -bpf_object__init_maps_name(struct bpf_object *obj, int maps_shndx) +bpf_object__init_maps_name(struct bpf_object *obj) { int i; Elf_Data *symbols = obj->efile.symbols; - if (!symbols || maps_shndx < 0) + if (!symbols || obj->efile.maps_shndx < 0) return -EINVAL; for (i = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) { @@ -544,7 +546,7 @@ bpf_object__init_maps_name(struct bpf_object *obj, int maps_shndx) if (!gelf_getsym(symbols, i, &sym)) continue; - if (sym.st_shndx != maps_shndx) + if (sym.st_shndx != obj->efile.maps_shndx) continue; map_name = elf_strptr(obj->efile.elf, @@ -572,7 +574,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj) Elf *elf = obj->efile.elf; GElf_Ehdr *ep = &obj->efile.ehdr; Elf_Scn *scn = NULL; - int idx = 0, err = 0, maps_shndx = -1; + int idx = 0, err = 0; /* Elf is corrupted/truncated, avoid calling elf_strptr. */ if (!elf_rawdata(elf_getscn(elf, ep->e_shstrndx), NULL)) { @@ -625,7 +627,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj) else if (strcmp(name, "maps") == 0) { err = bpf_object__init_maps(obj, data->d_buf, data->d_size); - maps_shndx = idx; + obj->efile.maps_shndx = idx; } else if (sh.sh_type == SHT_SYMTAB) { if (obj->efile.symbols) { pr_warning("bpf: multiple SYMTAB in %s\n", @@ -674,8 +676,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj) pr_warning("Corrupted ELF file: index of strtab invalid\n"); return LIBBPF_ERRNO__FORMAT; } - if (maps_shndx >= 0) - err = bpf_object__init_maps_name(obj, maps_shndx); + if (obj->efile.maps_shndx >= 0) + err = bpf_object__init_maps_name(obj); out: return err; } @@ -697,7 +699,8 @@ bpf_object__find_prog_by_idx(struct bpf_object *obj, int idx) static int bpf_program__collect_reloc(struct bpf_program *prog, size_t nr_maps, GElf_Shdr *shdr, - Elf_Data *data, Elf_Data *symbols) + Elf_Data *data, Elf_Data *symbols, + int maps_shndx) { int i, nrels; @@ -724,9 +727,6 @@ bpf_program__collect_reloc(struct bpf_program *prog, return -LIBBPF_ERRNO__FORMAT; } - insn_idx = rel.r_offset / sizeof(struct bpf_insn); - pr_debug("relocation: insn_idx=%u\n", insn_idx); - if (!gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym)) { @@ -735,6 +735,15 @@ bpf_program__collect_reloc(struct bpf_program *prog, return -LIBBPF_ERRNO__FORMAT; } + if (sym.st_shndx != maps_shndx) { + pr_warning("Program '%s' contains non-map related relo data pointing to section %u\n", + prog->section_name, sym.st_shndx); + return -LIBBPF_ERRNO__RELOC; + } + + insn_idx = rel.r_offset / sizeof(struct bpf_insn); + pr_debug("relocation: insn_idx=%u\n", insn_idx); + if (insns[insn_idx].code != (BPF_LD | BPF_IMM | BPF_DW)) { 
pr_warning("bpf: relocation: invalid relo for insns[%d].code 0x%x\n", insn_idx, insns[insn_idx].code); @@ -863,7 +872,8 @@ static int bpf_object__collect_reloc(struct bpf_object *obj) err = bpf_program__collect_reloc(prog, nr_maps, shdr, data, - obj->efile.symbols); + obj->efile.symbols, + obj->efile.maps_shndx); if (err) return err; } -- cgit v0.10.2 From 9fd4186ac19a4c8182dffc9b15dd288b50f09f76 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 25 Jan 2016 09:55:50 +0000 Subject: tools build: Allow subprojects select all feature checkers Put feature checkers not in original FEATURE_TESTS to a new list and allow subproject select all feature checkers by setting FEATURE_TESTS to 'all'. Signed-off-by: Wang Nan Cc: "David S. Miller" Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: He Kuang Cc: Jiri Olsa Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Will Deacon Link: http://lkml.kernel.org/r/1453715801-7732-4-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 02db3cd..674c47d 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -27,7 +27,7 @@ endef # the rule that uses them - an example for that is the 'bionic' # feature check. ] # -FEATURE_TESTS ?= \ +FEATURE_TESTS_BASIC := \ backtrace \ dwarf \ fortify-source \ @@ -56,6 +56,25 @@ FEATURE_TESTS ?= \ get_cpuid \ bpf +# FEATURE_TESTS_BASIC + FEATURE_TESTS_EXTRA is the complete list +# of all feature tests +FEATURE_TESTS_EXTRA := \ + bionic \ + compile-32 \ + compile-x32 \ + cplus-demangle \ + hello \ + libbabeltrace \ + liberty \ + liberty-z \ + libunwind-debug-frame + +FEATURE_TESTS ?= $(FEATURE_TESTS_BASIC) + +ifeq ($(FEATURE_TESTS),all) + FEATURE_TESTS := $(FEATURE_TESTS_BASIC) $(FEATURE_TESTS_EXTRA) +endif + FEATURE_DISPLAY ?= \ dwarf \ glibc \ -- cgit v0.10.2 From c053a1506faee399cbc2105f2131bb5a5d99eedd Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 25 Jan 2016 09:55:51 +0000 Subject: perf build: Select all feature checkers for feature-dump Set FEATURE_TESTS to 'all' so all possible feature checkers are executed. Without this setting the output feature dump file miss some feature, for example, liberity. Select all checker so we won't get an incomplete feature dump file. Signed-off-by: Wang Nan Cc: "David S. Miller" Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: He Kuang Cc: Jiri Olsa Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Will Deacon Link: http://lkml.kernel.org/r/1453715801-7732-5-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 97ce869..0ef3d97 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -165,7 +165,16 @@ ifeq ($(filter-out $(NON_CONFIG_TARGETS),$(MAKECMDGOALS)),) endif endif +# Set FEATURE_TESTS to 'all' so all possible feature checkers are executed. +# Without this setting the output feature dump file misses some features, for +# example, liberty. Select all checkers so we won't get an incomplete feature +# dump file. 
ifeq ($(config),1) +ifdef MAKECMDGOALS +ifeq ($(filter feature-dump,$(MAKECMDGOALS)),feature-dump) +FEATURE_TESTS := all +endif +endif include config/Makefile endif -- cgit v0.10.2 From b1baae89197e21cd115e9493d5a17f18fca81e6a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Jan 2016 15:37:30 -0300 Subject: perf hists browser: Skip scripting when perf.data file not available The script and data-switch context menu are only meaningful when it deals with a data file. So add a check so that it cannot be shown when perf-top is run. Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa , Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1453555902-18401-4-git-send-email-namhyung@kernel.org [ Use goto skip_scripting instead of two is_report_browser() tests ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index d07e6be..1da30f8 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -2303,6 +2303,9 @@ skip_annotation: &options[nr_options], socked_id); /* perf script support */ + if (!is_report_browser(hbt)) + goto skip_scripting; + if (browser->he_selection) { if (sort__has_thread && thread) { nr_options += add_script_opt(browser, @@ -2330,6 +2333,7 @@ skip_annotation: &options[nr_options], NULL, NULL); nr_options += add_switch_opt(browser, &actions[nr_options], &options[nr_options]); +skip_scripting: nr_options += add_exit_opt(browser, &actions[nr_options], &options[nr_options]); -- cgit v0.10.2 From 5ac76283b32b116c58e362e99542182ddcfc8262 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 26 Jan 2016 15:51:46 -0300 Subject: perf cpumap: Auto initialize cpu__max_{node,cpu} Since it was always checking if the initialization was done, use that branch to do the initialization if not done already. With this we reduce the number of exported globals from these files. Cc: Adrian Hunter Cc: Borislav Petkov Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Kan Liang Cc: Mathieu Poirier Cc: Namhyung Kim Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/20160125212955.GG22501@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index fa93509..9bcf2be 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -8,6 +8,10 @@ #include #include "asm/bug.h" +static int max_cpu_num; +static int max_node_num; +static int *cpunode_map; + static struct cpu_map *cpu_map__default_new(void) { struct cpu_map *cpus; @@ -486,6 +490,32 @@ out: pr_err("Failed to read max nodes, using default of %d\n", max_node_num); } +int cpu__max_node(void) +{ + if (unlikely(!max_node_num)) + set_max_node_num(); + + return max_node_num; +} + +int cpu__max_cpu(void) +{ + if (unlikely(!max_cpu_num)) + set_max_cpu_num(); + + return max_cpu_num; +} + +int cpu__get_node(int cpu) +{ + if (unlikely(cpunode_map == NULL)) { + pr_debug("cpu_map not initialized\n"); + return -1; + } + + return cpunode_map[cpu]; +} + static int init_cpunode_map(void) { int i; diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 71c41b9..81a2562 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -57,37 +57,11 @@ static inline bool cpu_map__empty(const struct cpu_map *map) return map ? 
map->map[0] == -1 : true; } -int max_cpu_num; -int max_node_num; -int *cpunode_map; - int cpu__setup_cpunode_map(void); -static inline int cpu__max_node(void) -{ - if (unlikely(!max_node_num)) - pr_debug("cpu_map not initialized\n"); - - return max_node_num; -} - -static inline int cpu__max_cpu(void) -{ - if (unlikely(!max_cpu_num)) - pr_debug("cpu_map not initialized\n"); - - return max_cpu_num; -} - -static inline int cpu__get_node(int cpu) -{ - if (unlikely(cpunode_map == NULL)) { - pr_debug("cpu_map not initialized\n"); - return -1; - } - - return cpunode_map[cpu]; -} +int cpu__max_node(void); +int cpu__max_cpu(void); +int cpu__get_node(int cpu); int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res, int (*f)(struct cpu_map *map, int cpu, void *data), -- cgit v0.10.2 From 14365449b6ce34cf6a3040ff8ebbb39d89d67159 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Tue, 26 Jan 2016 18:21:21 +0600 Subject: x86/asm: Remove unused L3_PAGE_OFFSET L3_PAGE_OFFSET was introduced in commit a6523748bd (paravirt/x86, 64-bit: move __PAGE_OFFSET to leave a space for hypervisor), but has no users. Signed-off-by: Alexander Kuleshov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: Andrey Ryabinin Link: http://lkml.kernel.org/r/1453810881-30622-1-git-send-email-kuleshovmail@gmail.com Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ffdc0e8..2e97468 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -38,7 +38,6 @@ #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET) -L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET) L4_START_KERNEL = pgd_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) -- cgit v0.10.2 From dd42ac8f02aea32661756554aace2095f7181d34 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Fri, 16 Oct 2015 15:20:53 +0600 Subject: clockevents: Rename last parameter of clocks_calc_mult_shift() to maxsec Last parameter of the clocks_calc_mult_shift() was renamed from minsec to maxsec in the 5fdade95 (time: Rename misnamed minsec argument of clocks_calc_mult_shift()). Signed-off-by: Alexander Kuleshov Link: http://lkml.kernel.org/r/1444987253-11018-1-git-send-email-kuleshovmail@gmail.com Signed-off-by: Thomas Gleixner diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index bdcf358..0d442e3 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -190,9 +190,9 @@ extern void clockevents_config_and_register(struct clock_event_device *dev, extern int clockevents_update_freq(struct clock_event_device *ce, u32 freq); static inline void -clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 minsec) +clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 maxsec) { - return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, minsec); + return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, maxsec); } extern void clockevents_suspend(void); -- cgit v0.10.2 From 9c808765e88efb6fa6af7e2206ef89512f1840a7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 15 Jan 2016 17:41:08 +0000 Subject: hrtimer: Add support for CLOCK_MONOTONIC_RAW The KVM/ARM timer implementation arms a hrtimer when a vcpu is blocked (usually because it is waiting for an interrupt) while its timer is going to kick in the future. 
It is essential that this timer doesn't get adjusted, or the guest will end up being woken-up at the wrong time (NTP running on the host seems to confuse the hell out of some guests). In order to allow this, let's add CLOCK_MONOTONIC_RAW support to hrtimer (it is so far only supported for posix timers). It also has the (limited) benefit of fixing de0421d53bfb ("mac80211_hwsim: shuffle code to prepare for dynamic radios"), which already uses this functionnality without realizing wasn't implemented (just being lucky...). Signed-off-by: Marc Zyngier Cc: Tomasz Nowicki Cc: Christoffer Dall Link: http://lkml.kernel.org/r/1452879670-16133-2-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 76dd4f0..a6d64af 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -151,6 +151,7 @@ enum hrtimer_base_type { HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, HRTIMER_BASE_TAI, + HRTIMER_BASE_MONOTONIC_RAW, HRTIMER_MAX_CLOCK_BASES, }; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 435b885..a125f22 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -90,12 +90,18 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, + { + .index = HRTIMER_BASE_MONOTONIC_RAW, + .clockid = CLOCK_MONOTONIC_RAW, + .get_time = &ktime_get_raw, + }, } }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, + [CLOCK_MONOTONIC_RAW] = HRTIMER_BASE_MONOTONIC_RAW, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; @@ -1268,7 +1274,10 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) if (!(active & 0x01)) continue; - basenow = ktime_add(now, base->offset); + if (unlikely(base->index == HRTIMER_BASE_MONOTONIC_RAW)) + basenow = ktime_get_raw(); + else + basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; -- cgit v0.10.2 From 9006a01829a50cfd6bbd4980910ed46e895e93d7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 15 Jan 2016 17:41:09 +0000 Subject: hrtimer: Catch illegal clockids It is way too easy to take any random clockid and feed it to the hrtimer subsystem. At best, it gets mapped to a monotonic base, but it would be better to just catch illegal values as early as possible. This patch does exactly that, mapping illegal clockids to an illegal base index, and panicing when we detect the illegal condition. Signed-off-by: Marc Zyngier Cc: Tomasz Nowicki Cc: Christoffer Dall Link: http://lkml.kernel.org/r/1452879670-16133-3-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index a125f22..cb0fe70 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -99,6 +99,9 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { + /* Make sure we catch unsupported clockids */ + [0 ... 
MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, + [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_MONOTONIC_RAW] = HRTIMER_BASE_MONOTONIC_RAW, @@ -108,7 +111,9 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - return hrtimer_clock_to_base_table[clock_id]; + int base = hrtimer_clock_to_base_table[clock_id]; + BUG_ON(base == HRTIMER_MAX_CLOCK_BASES); + return base; } /* -- cgit v0.10.2 From a6e707ddbdf150bd1c2a5c0eccc55abdc62a0039 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 15 Jan 2016 17:41:10 +0000 Subject: KVM: arm/arm64: timer: Switch to CLOCK_MONOTONIC_RAW In order to avoid NTP messing with the guest timer behind our back, use the new and improved monotonic raw version of the hrtimers. Signed-off-by: Marc Zyngier Cc: Tomasz Nowicki Cc: Christoffer Dall Link: http://lkml.kernel.org/r/1452879670-16133-4-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 69bca18..97c5815 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -48,7 +48,7 @@ static bool timer_is_armed(struct arch_timer_cpu *timer) static void timer_arm(struct arch_timer_cpu *timer, u64 ns) { timer->armed = true; - hrtimer_start(&timer->timer, ktime_add_ns(ktime_get(), ns), + hrtimer_start(&timer->timer, ktime_add_ns(ktime_get_raw(), ns), HRTIMER_MODE_ABS); } @@ -308,7 +308,7 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); - hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&timer->timer, CLOCK_MONOTONIC_RAW, HRTIMER_MODE_ABS); timer->timer.function = kvm_timer_expire; } -- cgit v0.10.2 From bbf66d897adf2bb0c310db96c97e8db6369f39e1 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 22 Jan 2016 18:31:53 +0100 Subject: clocksource: Allow unregistering the watchdog Hyper-V vmbus module registers TSC page clocksource when loaded. This is the clocksource with the highest rating and thus it becomes the watchdog making unloading of the vmbus module impossible. Separate clocksource_select_watchdog() from clocksource_enqueue_watchdog() and use it on clocksource register/rating change/unregister. After all, lobotomized monkeys may need some love too. Signed-off-by: Vitaly Kuznetsov Cc: John Stultz Cc: Dexuan Cui Cc: K. Y. Srinivasan Link: http://lkml.kernel.org/r/1453483913-25672-1-git-send-email-vkuznets@redhat.com Signed-off-by: Thomas Gleixner diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 664de53..56ece14 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -323,13 +323,42 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) /* cs is a watchdog. */ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +static void clocksource_select_watchdog(bool fallback) +{ + struct clocksource *cs, *old_wd; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + /* save current watchdog */ + old_wd = watchdog; + if (fallback) + watchdog = NULL; + + list_for_each_entry(cs, &clocksource_list, list) { + /* cs is a clocksource to be watched. */ + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) + continue; + + /* Skip current if we were requested for a fallback. 
*/ + if (fallback && cs == old_wd) + continue; + /* Pick the best watchdog. */ - if (!watchdog || cs->rating > watchdog->rating) { + if (!watchdog || cs->rating > watchdog->rating) watchdog = cs; - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - } } + /* If we failed to find a fallback restore the old one. */ + if (!watchdog) + watchdog = old_wd; + + /* If we changed the watchdog we need to reset cycles. */ + if (watchdog != old_wd) + clocksource_reset_watchdog(); + /* Check if the watchdog timer needs to be started. */ clocksource_start_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); @@ -404,6 +433,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } +static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } static inline int __clocksource_watchdog_kthread(void) { return 0; } @@ -736,6 +766,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_enqueue(cs); clocksource_enqueue_watchdog(cs); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); return 0; } @@ -758,6 +789,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) mutex_lock(&clocksource_mutex); __clocksource_change_rating(cs, rating); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); } EXPORT_SYMBOL(clocksource_change_rating); @@ -767,12 +799,12 @@ EXPORT_SYMBOL(clocksource_change_rating); */ static int clocksource_unbind(struct clocksource *cs) { - /* - * I really can't convince myself to support this on hardware - * designed by lobotomized monkeys. - */ - if (clocksource_is_watchdog(cs)) - return -EBUSY; + if (clocksource_is_watchdog(cs)) { + /* Select and try to install a replacement watchdog. */ + clocksource_select_watchdog(true); + if (clocksource_is_watchdog(cs)) + return -EBUSY; + } if (cs == curr_clocksource) { /* Select and try to install a replacement clock source */ -- cgit v0.10.2 From cf9162c290447cdf6fca7b64dd6e2200dc52f03b Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 27 Jan 2016 11:22:22 +0000 Subject: tools build: Check basic headers for test-compile feature checker An i386 binary can be linked correctly even without correct headers, which causes problems. For example: $ mv /tmp/oxygen_root/usr/include/gnu/stubs-32.h{,.bak} $ make tools/perf Auto-detecting system features: ... dwarf: [ on ] [SNIP] GEN common-cmds.h CC perf-read-vdso32 In file included from /tmp/oxygen_root/usr/include/features.h:388:0, from /tmp/oxygen_root/usr/include/stdio.h:27, from perf-read-vdso.c:1: /tmp/oxygen_root/usr/include/gnu/stubs.h:7:27: fatal error: gnu/stubs-32.h: No such file or directory # include ^ compilation terminated. ... In this patch we check not only the compiler and linker, but also basic headers in the test-compile test case, making it fail on a platform lacking correct headers. 
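For reference, the feature-test source after this change is expected to look roughly like the sketch below (the header name is stripped by this dump's formatting; judging from the printf() call and the error trace above it is stdio.h). Including and actually using a libc header forces the compiler to resolve gnu/stubs-32.h via features.h when built with -m32, so the check fails where 32-bit headers are missing instead of silently linking an empty main():

/* sketch of tools/build/feature/test-compile.c with the basic-header check */
#include <stdio.h>

int main(void)
{
	printf("Hello World!\n");
	return 0;
}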
Signed-off-by: Wang Nan Acked-by: Jiri Olsa Cc: Li Zefan Link: http://lkml.kernel.org/r/1453893742-20603-1-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/build/feature/test-compile.c b/tools/build/feature/test-compile.c index 31dbf45..c54e655 100644 --- a/tools/build/feature/test-compile.c +++ b/tools/build/feature/test-compile.c @@ -1,4 +1,6 @@ +#include int main(void) { + printf("Hello World!\n"); return 0; } -- cgit v0.10.2 From 76c4aaec41b76afb7f914aaf6080f21ab331b0c6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 28 Jan 2016 16:13:24 -0300 Subject: perf build: Fix feature-dump checks, we need to test all features I see problem with test-all case speedup, because it does not comprise checks for 32bits compilations, fix it. The problem could be noticed by calling: make -C tools/perf feature-dump That would end up misdetecting the feature-compile-x32, that, building using 'gcc -mx32' needs stub headers not present in a fedora 23 devel machine and thus fail to compile, but ended up appearing as detected, i.e. present in tools/perf/FEATURE-DUMP as 'feature-compile-x32=1'. With this fix it correctly appears as 'feature-compile-x32=0' and if we uninstall the libc devel files for 32 bits (glibc-devel.i686), then the relevant variable is flipped from 'feature-compile-32=1' to 'feature-compile-32=0'. The same things happened for bionic and libbabeltrace, that were misdetected because the are no tested in test-all.c Reported-and-Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Jiri Olsa Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-u0sjaddf1r9m8icpd98ry7fz@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 674c47d..7bff2ea 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -119,6 +119,14 @@ ifeq ($(feature-all), 1) # test-all.c passed - just set all the core feature flags to 1: # $(foreach feat,$(FEATURE_TESTS),$(call feature_set,$(feat))) + # + # test-all.c does not comprise these tests, so we need to + # for this case to get features proper values + # + $(call feature_check,compile-32) + $(call feature_check,compile-x32) + $(call feature_check,bionic) + $(call feature_check,libbabeltrace) else $(foreach feat,$(FEATURE_TESTS),$(call feature_check,$(feat))) endif -- cgit v0.10.2 From bd922477d9350a3006d73dabb241400e6c4181b0 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 28 Jan 2016 19:02:29 +0200 Subject: locking/x86: Add cc clobber for ADDL ADDL clobbers flags (such as CF) but barrier.h didn't tell this to GCC. Historically, GCC doesn't need one on x86, and always considers flags clobbered. We are probably missing the cc clobber in a *lot* of places for this reason. But even if not necessary, it's probably a good thing to add for documentation, and in case GCC semantcs ever change. Reported-by: Borislav Petkov Signed-off-by: Michael S. Tsirkin Acked-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Davidlohr Bueso Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: virtualization Link: http://lkml.kernel.org/r/1453921746-16178-2-git-send-email-mst@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index a584e1c..a65bdb1 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -15,9 +15,12 @@ * Some non-Intel clones support out of order store. wmb() ceases to be a * nop for these. */ -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) +#define mb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "mfence", \ + X86_FEATURE_XMM2) ::: "memory", "cc") +#define rmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "lfence", \ + X86_FEATURE_XMM2) ::: "memory", "cc") +#define wmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "sfence", \ + X86_FEATURE_XMM2) ::: "memory", "cc") #else #define mb() asm volatile("mfence":::"memory") #define rmb() asm volatile("lfence":::"memory") -- cgit v0.10.2 From e37cee133c72c9529f74a20d9b7eb3b6dfb928b5 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 28 Jan 2016 19:02:37 +0200 Subject: locking/x86: Drop a comment left over from X86_OOSTORE The comment about wmb being non-NOP to deal with non-Intel CPUs is a left over from before the following commit: 09df7c4c8097 ("x86: Remove CONFIG_X86_OOSTORE") It makes no sense now: in particular, wmb() is not a NOP even for regular Intel CPUs because of weird use-cases e.g. dealing with WC memory. Drop this comment. Signed-off-by: Michael S. Tsirkin Acked-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Davidlohr Bueso Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: virtualization Link: http://lkml.kernel.org/r/1453921746-16178-3-git-send-email-mst@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index a65bdb1..a291745 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -11,10 +11,6 @@ */ #ifdef CONFIG_X86_32 -/* - * Some non-Intel clones support out of order store. wmb() ceases to be a - * nop for these. - */ #define mb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "mfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") #define rmb() asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "lfence", \ -- cgit v0.10.2 From 57d9b1b43433a6ba7267c80b87d8e8f6e86edceb Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 28 Jan 2016 19:02:44 +0200 Subject: locking/x86: Tweak the comment about use of wmb() for IO On x86, we *do* still use the non-NOP rmb()/wmb() for IO barriers, but even that is generally questionable. Leave them around as historial unless somebody can point to a case where they care about the performance, but tweak the comment so people don't think they are strictly required in all cases. Signed-off-by: Michael S. Tsirkin Acked-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Davidlohr Bueso Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: virtualization Link: http://lkml.kernel.org/r/1453921746-16178-4-git-send-email-mst@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index a291745..bfb28ca 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -6,7 +6,7 @@ /* * Force strict CPU ordering. - * And yes, this is required on UP too when we're talking + * And yes, this might be required on UP too when we're talking * to devices. */ -- cgit v0.10.2 From ca59809ff6d572ae58fc6bedf7500f5a60fdbd64 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 28 Jan 2016 19:02:51 +0200 Subject: locking/x86: Use mb() around clflush() The following commit: f8e617f4582995f ("sched/idle/x86: Optimize unnecessary mwait_idle() resched IPIs") adds memory barriers around clflush(), but this seems wrong for UP since barrier() has no effect on clflush(). We really want MFENCE, so switch to mb() instead. Signed-off-by: Michael S. Tsirkin Acked-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Davidlohr Bueso Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Len Brown Cc: Linus Torvalds Cc: Mike Galbraith Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: virtualization Link: http://lkml.kernel.org/r/1453921746-16178-5-git-send-email-mst@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9f7c21c..9decee2 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -418,9 +418,9 @@ static void mwait_idle(void) if (!current_set_polling_and_test()) { trace_cpu_idle_rcuidle(1, smp_processor_id()); if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { - smp_mb(); /* quirk */ + mb(); /* quirk */ clflush((void *)¤t_thread_info()->flags); - smp_mb(); /* quirk */ + mb(); /* quirk */ } __monitor((void *)¤t_thread_info()->flags, 0, 0); -- cgit v0.10.2 From c31b34255b48d1a169693c9c70c49ad6418cfd20 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:19 -0800 Subject: selftests/x86: Extend Makefile to allow 64-bit-only tests Previously the Makefile supported 32-bit-only tests and tests that were 32-bit and 64-bit. This adds the support for tests that are only built as 64-bit binaries. There aren't any yet, but there might be a few some day. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/99789bfe65706e6df32cc7e13f656e8c9fa92031.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index d0c473f..9c81f26 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -11,8 +11,9 @@ TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall vdso_restorer TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) +TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY) BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32) -BINARIES_64 := $(TARGETS_C_BOTHBITS:%=%_64) +BINARIES_64 := $(TARGETS_C_64BIT_ALL:%=%_64) CFLAGS := -O2 -g -std=gnu99 -pthread -Wall @@ -40,7 +41,7 @@ clean: $(TARGETS_C_32BIT_ALL:%=%_32): %_32: %.c $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl -lm -$(TARGETS_C_BOTHBITS:%=%_64): %_64: %.c +$(TARGETS_C_64BIT_ALL:%=%_64): %_64: %.c $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl # x86_64 users should be encouraged to install 32-bit libraries -- cgit v0.10.2 From e21d50f3864e2a8995f5d2a41dea3f0fa07758b4 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:20 -0800 Subject: selftests/x86: Add check_initial_reg_state() This checks that ELF binaries are started with an appropriately blank register state. ( There's currently a nasty special case in the entry asm to arrange for this. I'm planning on removing the special case, and this will help make sure I don't break it. ) Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ef54f8d066b30a3eb36bbf26300eebb242185700.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 9c81f26..df4f767 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -4,7 +4,8 @@ include ../lib.mk .PHONY: all all_32 all_64 warn_32bit_failure clean -TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall +TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall \ + check_initial_reg_state TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ ldt_gdt \ @@ -66,3 +67,9 @@ endif sysret_ss_attrs_64: thunks.S ptrace_syscall_32: raw_syscall_helper_32.S test_syscall_vdso_32: thunks_32.S + +# check_initial_reg_state is special: it needs a custom entry, and it +# needs to be static so that its interpreter doesn't destroy its initial +# state. 
+check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static +check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static diff --git a/tools/testing/selftests/x86/check_initial_reg_state.c b/tools/testing/selftests/x86/check_initial_reg_state.c new file mode 100644 index 0000000..6aaed9b --- /dev/null +++ b/tools/testing/selftests/x86/check_initial_reg_state.c @@ -0,0 +1,109 @@ +/* + * check_initial_reg_state.c - check that execve sets the correct state + * Copyright (c) 2014-2016 Andrew Lutomirski + * + * This program is free software; you can redistribute it and/or modify + * it under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#define _GNU_SOURCE + +#include + +unsigned long ax, bx, cx, dx, si, di, bp, sp, flags; +unsigned long r8, r9, r10, r11, r12, r13, r14, r15; + +asm ( + ".pushsection .text\n\t" + ".type real_start, @function\n\t" + ".global real_start\n\t" + "real_start:\n\t" +#ifdef __x86_64__ + "mov %rax, ax\n\t" + "mov %rbx, bx\n\t" + "mov %rcx, cx\n\t" + "mov %rdx, dx\n\t" + "mov %rsi, si\n\t" + "mov %rdi, di\n\t" + "mov %rbp, bp\n\t" + "mov %rsp, sp\n\t" + "mov %r8, r8\n\t" + "mov %r9, r9\n\t" + "mov %r10, r10\n\t" + "mov %r11, r11\n\t" + "mov %r12, r12\n\t" + "mov %r13, r13\n\t" + "mov %r14, r14\n\t" + "mov %r15, r15\n\t" + "pushfq\n\t" + "popq flags\n\t" +#else + "mov %eax, ax\n\t" + "mov %ebx, bx\n\t" + "mov %ecx, cx\n\t" + "mov %edx, dx\n\t" + "mov %esi, si\n\t" + "mov %edi, di\n\t" + "mov %ebp, bp\n\t" + "mov %esp, sp\n\t" + "pushfl\n\t" + "popl flags\n\t" +#endif + "jmp _start\n\t" + ".size real_start, . - real_start\n\t" + ".popsection"); + +int main() +{ + int nerrs = 0; + + if (sp == 0) { + printf("[FAIL]\tTest was built incorrectly\n"); + return 1; + } + + if (ax || bx || cx || dx || si || di || bp +#ifdef __x86_64__ + || r8 || r9 || r10 || r11 || r12 || r13 || r14 || r15 +#endif + ) { + printf("[FAIL]\tAll GPRs except SP should be 0\n"); +#define SHOW(x) printf("\t" #x " = 0x%lx\n", x); + SHOW(ax); + SHOW(bx); + SHOW(cx); + SHOW(dx); + SHOW(si); + SHOW(di); + SHOW(bp); + SHOW(sp); +#ifdef __x86_64__ + SHOW(r8); + SHOW(r9); + SHOW(r10); + SHOW(r11); + SHOW(r12); + SHOW(r13); + SHOW(r14); + SHOW(r15); +#endif + nerrs++; + } else { + printf("[OK]\tAll GPRs except SP are 0\n"); + } + + if (flags != 0x202) { + printf("[FAIL]\tFLAGS is 0x%lx, but it should be 0x202\n", flags); + nerrs++; + } else { + printf("[OK]\tFLAGS is 0x202\n"); + } + + return nerrs ? 1 : 0; +} -- cgit v0.10.2 From fba324744bfd2a7948a7710d7a021d76dafb9b67 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:21 -0800 Subject: x86/syscalls: Refactor syscalltbl.sh This splits out the code to emit a syscall line. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1bfcbba991f5cfaa9291ff950a593daa972a205f.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 0e7f8ec..167965e 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -3,13 +3,21 @@ in="$1" out="$2" +emit() { + abi="$1" + nr="$2" + entry="$3" + compat="$4" + if [ -n "$compat" ]; then + echo "__SYSCALL_${abi}($nr, $entry, $compat)" + elif [ -n "$entry" ]; then + echo "__SYSCALL_${abi}($nr, $entry, $entry)" + fi +} + grep '^[0-9]' "$in" | sort -n | ( while read nr abi name entry compat; do abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` - if [ -n "$compat" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $compat)" - elif [ -n "$entry" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $entry)" - fi + emit "$abi" "$nr" "$entry" "$compat" done ) > "$out" -- cgit v0.10.2 From 32324ce15ea8cb4c8acc28acb2fd36fabf73e9db Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:22 -0800 Subject: x86/syscalls: Remove __SYSCALL_COMMON and __SYSCALL_X32 The common/64/x32 distinction has no effect other than determining which kernels actually support the syscall. Move the logic into syscalltbl.sh. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/58d4a95f40e43b894f93288b4a3633963d0ee22e.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 41283d2..974fd89 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -6,14 +6,6 @@ #include #include -#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) - -#ifdef CONFIG_X86_X32_ABI -# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) -#else -# define __SYSCALL_X32(nr, sym, compat) /* nothing */ -#endif - #define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 167965e..5ebeaf1 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -18,6 +18,21 @@ emit() { grep '^[0-9]' "$in" | sort -n | ( while read nr abi name entry compat; do abi=`echo "$abi" | tr '[a-z]' '[A-Z]'` - emit "$abi" "$nr" "$entry" "$compat" + if [ "$abi" == "COMMON" -o "$abi" == "64" ]; then + # COMMON is the same as 64, except that we don't expect X32 + # programs to use it. Our expectation has nothing to do with + # any generated code, so treat them the same. + emit 64 "$nr" "$entry" "$compat" + elif [ "$abi" == "X32" ]; then + # X32 is equivalent to 64 on an X32-compatible kernel. 
+ echo "#ifdef CONFIG_X86_X32_ABI" + emit 64 "$nr" "$entry" "$compat" + echo "#endif" + elif [ "$abi" == "I386" ]; then + emit "$abi" "$nr" "$entry" "$compat" + else + echo "Unknown abi $abi" >&2 + exit 1 + fi done ) > "$out" diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index f2edafb..29db3b3 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -5,12 +5,6 @@ #include #define __SYSCALL_64(nr, sym, compat) [nr] = 1, -#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1, -#ifdef CONFIG_X86_X32_ABI -# define __SYSCALL_X32(nr, sym, compat) [nr] = 1, -#else -# define __SYSCALL_X32(nr, sym, compat) /* nothing */ -#endif static char syscalls_64[] = { #include }; diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index b74ea6c..71a497c 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -35,9 +35,6 @@ #define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) -#define __SYSCALL_X32(nr, sym, compat) /* Not supported */ - #define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index ce7e360..5edf4f4 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -15,8 +15,6 @@ static char syscalls[] = { }; #else #define __SYSCALL_64(nr, sym, compat) [nr] = 1, -#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1, -#define __SYSCALL_X32(nr, sym, compat) /* Not supported */ static char syscalls[] = { #include }; -- cgit v0.10.2 From 3e65654e3db6df6aba9c5b895f8b8e6a8d8eb508 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:23 -0800 Subject: x86/syscalls: Move compat syscall entry handling into syscalltbl.sh Rather than duplicating the compat entry handling in all consumers of syscalls_BITS.h, handle it directly in syscalltbl.sh. Now we generate entries in syscalls_32.h like: __SYSCALL_I386(5, sys_open) __SYSCALL_I386(5, compat_sys_open) and all of its consumers implicitly get the right entry point. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b7c2b501dc0e6e43050e916b95807c3e2e16e9bb.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 9a66498..3e28297 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -6,17 +6,11 @@ #include #include -#ifdef CONFIG_IA32_EMULATION -#define SYM(sym, compat) compat -#else -#define SYM(sym, compat) sym -#endif - -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), +#define __SYSCALL_I386(nr, sym) [nr] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 974fd89..3781989 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -6,11 +6,11 @@ #include #include -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym, compat) [nr] = sym, +#define __SYSCALL_64(nr, sym) [nr] = sym, extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index 5ebeaf1..b81479c 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -8,10 +8,24 @@ emit() { nr="$2" entry="$3" compat="$4" - if [ -n "$compat" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $compat)" - elif [ -n "$entry" ]; then - echo "__SYSCALL_${abi}($nr, $entry, $entry)" + + if [ "$abi" == "64" -a -n "$compat" ]; then + echo "a compat entry for a 64-bit syscall makes no sense" >&2 + exit 1 + fi + + if [ -z "$compat" ]; then + if [ -n "$entry" ]; then + echo "__SYSCALL_${abi}($nr, $entry)" + fi + else + echo "#ifdef CONFIG_X86_32" + if [ -n "$entry" ]; then + echo "__SYSCALL_${abi}($nr, $entry)" + fi + echo "#else" + echo "__SYSCALL_${abi}($nr, $compat)" + echo "#endif" fi } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 6ce3902..abec4c9 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -7,7 +7,7 @@ #include #include "../../../drivers/lguest/lg.h" -#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +#define __SYSCALL_I386(nr, sym) [nr] = 1, static char syscalls[] = { #include }; diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 29db3b3..9677bf9 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -4,11 +4,11 @@ #include -#define __SYSCALL_64(nr, sym, compat) [nr] = 1, +#define __SYSCALL_64(nr, sym) [nr] = 1, static char syscalls_64[] = { #include }; -#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +#define __SYSCALL_I386(nr, sym) [nr] = 1, static char syscalls_ia32[] = 
{ #include }; diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 439c099..d4669a6 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -25,11 +25,11 @@ #define old_mmap sys_old_mmap -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym, +#define __SYSCALL_I386(nr, sym) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 71a497c..6ee5268 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -35,11 +35,11 @@ #define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym, compat) [ nr ] = sym, +#define __SYSCALL_64(nr, sym) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index 5edf4f4..6c9a9c1 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -9,12 +9,12 @@ #include #ifdef __i386__ -#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +#define __SYSCALL_I386(nr, sym) [nr] = 1, static char syscalls[] = { #include }; #else -#define __SYSCALL_64(nr, sym, compat) [nr] = 1, +#define __SYSCALL_64(nr, sym) [nr] = 1, static char syscalls[] = { #include }; -- cgit v0.10.2 From cfcbadb49dabb05efa23e1a0f95f3391c0a815bc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:24 -0800 Subject: x86/syscalls: Add syscall entry qualifiers This will let us specify something like 'sys_xyz/foo' instead of 'sys_xyz' in the syscall table, where the 'foo' qualifier conveys some extra information to the C code. The intent is to allow things like sys_execve/ptregs to indicate that sys_execve() touches pt_regs. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2de06e33dce62556b3ec662006fcb295504e296e.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 3e28297..8f895ee 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -6,11 +6,11 @@ #include #include -#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym) [nr] = sym, +#define __SYSCALL_I386(nr, sym, qual) [nr] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 3781989..a1d4087 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -6,11 +6,11 @@ #include #include -#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym) [nr] = sym, +#define __SYSCALL_64(nr, sym, qual) [nr] = sym, extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh index b81479c..cd3d301 100644 --- a/arch/x86/entry/syscalls/syscalltbl.sh +++ b/arch/x86/entry/syscalls/syscalltbl.sh @@ -3,6 +3,19 @@ in="$1" out="$2" +syscall_macro() { + abi="$1" + nr="$2" + entry="$3" + + # Entry can be either just a function name or "function/qualifier" + real_entry="${entry%%/*}" + qualifier="${entry:${#real_entry}}" # Strip the function name + qualifier="${qualifier:1}" # Strip the slash, if any + + echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier)" +} + emit() { abi="$1" nr="$2" @@ -16,15 +29,15 @@ emit() { if [ -z "$compat" ]; then if [ -n "$entry" ]; then - echo "__SYSCALL_${abi}($nr, $entry)" + syscall_macro "$abi" "$nr" "$entry" fi else echo "#ifdef CONFIG_X86_32" if [ -n "$entry" ]; then - echo "__SYSCALL_${abi}($nr, $entry)" + syscall_macro "$abi" "$nr" "$entry" fi echo "#else" - echo "__SYSCALL_${abi}($nr, $compat)" + syscall_macro "$abi" "$nr" "$compat" echo "#endif" fi } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index abec4c9..fdeb0ce 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -7,7 +7,7 @@ #include #include "../../../drivers/lguest/lg.h" -#define __SYSCALL_I386(nr, sym) [nr] = 1, +#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include }; diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 9677bf9..d875f97 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -4,11 +4,11 @@ #include -#define __SYSCALL_64(nr, sym) [nr] = 1, +#define __SYSCALL_64(nr, sym, qual) [nr] = 1, static char syscalls_64[] = { #include }; -#define __SYSCALL_I386(nr, sym) [nr] = 1, +#define __SYSCALL_I386(nr, sym, qual) [nr] = 
1, static char syscalls_ia32[] = { #include }; diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index d4669a6..bfce503 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -25,11 +25,11 @@ #define old_mmap sys_old_mmap -#define __SYSCALL_I386(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym) [ nr ] = sym, +#define __SYSCALL_I386(nr, sym, qual) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 6ee5268..f306413 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -35,11 +35,11 @@ #define stub_execveat sys_execveat #define stub_rt_sigreturn sys_rt_sigreturn -#define __SYSCALL_64(nr, sym) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym) [ nr ] = sym, +#define __SYSCALL_64(nr, sym, qual) [ nr ] = sym, extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c index 6c9a9c1..470564b 100644 --- a/arch/x86/um/user-offsets.c +++ b/arch/x86/um/user-offsets.c @@ -9,12 +9,12 @@ #include #ifdef __i386__ -#define __SYSCALL_I386(nr, sym) [nr] = 1, +#define __SYSCALL_I386(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include }; #else -#define __SYSCALL_64(nr, sym) [nr] = 1, +#define __SYSCALL_64(nr, sym, qual) [nr] = 1, static char syscalls[] = { #include }; -- cgit v0.10.2 From 302f5b260c322696cbeb962a263a4d2d99864aed Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:25 -0800 Subject: x86/entry/64: Always run ptregs-using syscalls on the slow path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 64-bit syscalls currently have an optimization in which they are called with partial pt_regs. A small handful require full pt_regs. In the 32-bit and compat cases, I cleaned this up by forcing full pt_regs for all syscalls. The performance hit doesn't really matter as the affected system calls are fundamentally heavy and this is the 32-bit compat case. I want to clean up the 64-bit case as well, but I don't want to hurt fast path performance. To do that, I want to force the syscalls that use pt_regs onto the slow path. This will enable us to make slow path syscalls be real ABI-compliant C functions. Use the new syscall entry qualification machinery for this. 'stub_clone' is now 'stub_clone/ptregs'. The next patch will eliminate the stubs, and we'll just have 'sys_clone/ptregs'. As of this patch, two-phase entry tracing is no longer used. It has served its purpose (namely a huge speedup on some workloads prior to more general opportunistic SYSRET support), and once the dust settles I'll send patches to back it out. 
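The qualifier mechanics are nothing more than token pasting. Below is a self-contained userspace sketch of the same pattern; the table entries, the printf() reporting and the bodies of the __SYSCALL_64_QUAL_* helpers are illustrative stand-ins, not the kernel's definitions:

#include <stdio.h>

/* Stand-ins for two generated syscalls_64.h entries: one plain syscall and
 * one carrying the new "ptregs" qualifier from the syscall table. */
#define SYSCALL_TABLE				\
	__SYSCALL_64(12, sys_brk, )		\
	__SYSCALL_64(56, stub_clone, ptregs)

/* Qualifier dispatch, modeled on the __SYSCALL_64_QUAL_##qual trick used by
 * entry_64.S and arch/x86/entry/syscall_64.c in this series. */
#define __SYSCALL_64_QUAL_(sym)		#sym
#define __SYSCALL_64_QUAL_ptregs(sym)	"ptregs_" #sym
#define __SYSCALL_64(nr, sym, qual)	\
	printf("%3d -> %s\n", (nr), __SYSCALL_64_QUAL_##qual(sym));

int main(void)
{
	SYSCALL_TABLE	/* prints which entry point each slot resolves to */
	return 0;
}

Run on its own this prints " 12 -> sys_brk" and " 56 -> ptregs_stub_clone", mirroring how a /ptregs syscall is redirected to a ptregs_* stub while unqualified syscalls keep their direct entry points.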
The implementation is heavily based on a patch from Brian Gerst: http://lkml.kernel.org/g/1449666173-15366-1-git-send-email-brgerst@gmail.com Originally-From: Brian Gerst Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: Frédéric Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Linux Kernel Mailing List Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b9beda88460bcefec6e7d792bd44eca9b760b0c4.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9d34d3c..f1c8f15 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -182,7 +182,15 @@ entry_SYSCALL_64_fastpath: #endif ja 1f /* return -ENOSYS (already in pt_regs->ax) */ movq %r10, %rcx + + /* + * This call instruction is handled specially in stub_ptregs_64. + * It might end up jumping to the slow path. If it jumps, RAX is + * clobbered. + */ call *sys_call_table(, %rax, 8) +.Lentry_SYSCALL_64_after_fastpath_call: + movq %rax, RAX(%rsp) 1: /* @@ -235,25 +243,13 @@ GLOBAL(int_ret_from_sys_call_irqs_off) /* Do syscall entry tracing */ tracesys: - movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - call syscall_trace_enter_phase1 - test %rax, %rax - jnz tracesys_phase2 /* if needed, run the slow path */ - RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ - movq ORIG_RAX(%rsp), %rax - jmp entry_SYSCALL_64_fastpath /* and return to the fast path */ - -tracesys_phase2: SAVE_EXTRA_REGS movq %rsp, %rdi - movl $AUDIT_ARCH_X86_64, %esi - movq %rax, %rdx - call syscall_trace_enter_phase2 + call syscall_trace_enter /* * Reload registers from stack in case ptrace changed them. - * We don't reload %rax because syscall_trace_entry_phase2() returned + * We don't reload %rax because syscall_trace_enter() returned * the value it wants us to use in the table lookup. */ RESTORE_C_REGS_EXCEPT_RAX @@ -355,6 +351,38 @@ opportunistic_sysret_failed: jmp restore_c_regs_and_iret END(entry_SYSCALL_64) +ENTRY(stub_ptregs_64) + /* + * Syscalls marked as needing ptregs land here. + * If we are on the fast path, we need to save the extra regs. + * If we are on the slow path, the extra regs are already saved. + * + * RAX stores a pointer to the C function implementing the syscall. 
+ */ + cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp) + jne 1f + + /* Called from fast path -- pop return address and jump to slow path */ + popq %rax + jmp tracesys /* called from fast path */ + +1: + /* Called from C */ + jmp *%rax /* called from C */ +END(stub_ptregs_64) + +.macro ptregs_stub func +ENTRY(ptregs_\func) + leaq \func(%rip), %rax + jmp stub_ptregs_64 +END(ptregs_\func) +.endm + +/* Instantiate ptregs_stub for each ptregs-using syscall */ +#define __SYSCALL_64_QUAL_(sym) +#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym +#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym) +#include .macro FORK_LIKE func ENTRY(stub_\func) diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index a1d4087..9dbc5ab 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -6,11 +6,14 @@ #include #include -#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; +#define __SYSCALL_64_QUAL_(sym) sym +#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym + +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); #include #undef __SYSCALL_64 -#define __SYSCALL_64(nr, sym, qual) [nr] = sym, +#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym), extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index dc1040a..5de342a 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -21,7 +21,7 @@ 12 common brk sys_brk 13 64 rt_sigaction sys_rt_sigaction 14 common rt_sigprocmask sys_rt_sigprocmask -15 64 rt_sigreturn stub_rt_sigreturn +15 64 rt_sigreturn stub_rt_sigreturn/ptregs 16 64 ioctl sys_ioctl 17 common pread64 sys_pread64 18 common pwrite64 sys_pwrite64 @@ -62,10 +62,10 @@ 53 common socketpair sys_socketpair 54 64 setsockopt sys_setsockopt 55 64 getsockopt sys_getsockopt -56 common clone stub_clone -57 common fork stub_fork -58 common vfork stub_vfork -59 64 execve stub_execve +56 common clone stub_clone/ptregs +57 common fork stub_fork/ptregs +58 common vfork stub_vfork/ptregs +59 64 execve stub_execve/ptregs 60 common exit sys_exit 61 common wait4 sys_wait4 62 common kill sys_kill @@ -328,7 +328,7 @@ 319 common memfd_create sys_memfd_create 320 common kexec_file_load sys_kexec_file_load 321 common bpf sys_bpf -322 64 execveat stub_execveat +322 64 execveat stub_execveat/ptregs 323 common userfaultfd sys_userfaultfd 324 common membarrier sys_membarrier 325 common mlock2 sys_mlock2 @@ -346,7 +346,7 @@ 517 x32 recvfrom compat_sys_recvfrom 518 x32 sendmsg compat_sys_sendmsg 519 x32 recvmsg compat_sys_recvmsg -520 x32 execve stub_x32_execve +520 x32 execve stub_x32_execve/ptregs 521 x32 ptrace compat_sys_ptrace 522 x32 rt_sigpending compat_sys_rt_sigpending 523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait @@ -371,4 +371,4 @@ 542 x32 getsockopt compat_sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit -545 x32 execveat stub_x32_execveat +545 x32 execveat stub_x32_execveat/ptregs -- cgit v0.10.2 From 46eabf06c04a6847a694a0c1413d4ac57e5b058a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:26 -0800 Subject: x86/entry/64: Call all native 
slow-path syscalls with full pt-regs This removes all of the remaining asm syscall stubs except for stub_ptregs_64. Entries in the main syscall table are now all callable from C. The resulting asm is every bit as ridiculous as it looks. The next few patches will clean it up. This patch is here to let reviewers rest their brains and for bisection. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a6b3801be0d505d50aefabda02d3b93efbfc9c73.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f1c8f15..f7050a5 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -253,7 +253,6 @@ tracesys: * the value it wants us to use in the table lookup. */ RESTORE_C_REGS_EXCEPT_RAX - RESTORE_EXTRA_REGS #if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max, %rax #else @@ -264,6 +263,7 @@ tracesys: movq %r10, %rcx /* fixup for C */ call *sys_call_table(, %rax, 8) movq %rax, RAX(%rsp) + RESTORE_EXTRA_REGS 1: /* Use IRET because user could have changed pt_regs->foo */ @@ -384,83 +384,6 @@ END(ptregs_\func) #define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym) #include - .macro FORK_LIKE func -ENTRY(stub_\func) - SAVE_EXTRA_REGS 8 - jmp sys_\func -END(stub_\func) - .endm - - FORK_LIKE clone - FORK_LIKE fork - FORK_LIKE vfork - -ENTRY(stub_execve) - call sys_execve -return_from_execve: - testl %eax, %eax - jz 1f - /* exec failed, can use fast SYSRET code path in this case */ - ret -1: - /* must use IRET code path (pt_regs->cs may have changed) */ - addq $8, %rsp - ZERO_EXTRA_REGS - movq %rax, RAX(%rsp) - jmp int_ret_from_sys_call -END(stub_execve) -/* - * Remaining execve stubs are only 7 bytes long. - * ENTRY() often aligns to 16 bytes, which in this case has no benefits. - */ - .align 8 -GLOBAL(stub_execveat) - call sys_execveat - jmp return_from_execve -END(stub_execveat) - -#if defined(CONFIG_X86_X32_ABI) - .align 8 -GLOBAL(stub_x32_execve) - call compat_sys_execve - jmp return_from_execve -END(stub_x32_execve) - .align 8 -GLOBAL(stub_x32_execveat) - call compat_sys_execveat - jmp return_from_execve -END(stub_x32_execveat) -#endif - -/* - * sigreturn is special because it needs to restore all registers on return. - * This cannot be done with SYSRET, so use the IRET return path instead. - */ -ENTRY(stub_rt_sigreturn) - /* - * SAVE_EXTRA_REGS result is not normally needed: - * sigreturn overwrites all pt_regs->GPREGS. - * But sigreturn can fail (!), and there is no easy way to detect that. - * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error, - * we SAVE_EXTRA_REGS here. - */ - SAVE_EXTRA_REGS 8 - call sys_rt_sigreturn -return_from_stub: - addq $8, %rsp - RESTORE_EXTRA_REGS - movq %rax, RAX(%rsp) - jmp int_ret_from_sys_call -END(stub_rt_sigreturn) - -#ifdef CONFIG_X86_X32_ABI -ENTRY(stub_x32_rt_sigreturn) - SAVE_EXTRA_REGS 8 - call sys32_x32_rt_sigreturn - jmp return_from_stub -END(stub_x32_rt_sigreturn) -#endif - /* * A newly forked process directly context switches into this address. 
* diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 5de342a..dcf107c 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -21,7 +21,7 @@ 12 common brk sys_brk 13 64 rt_sigaction sys_rt_sigaction 14 common rt_sigprocmask sys_rt_sigprocmask -15 64 rt_sigreturn stub_rt_sigreturn/ptregs +15 64 rt_sigreturn sys_rt_sigreturn/ptregs 16 64 ioctl sys_ioctl 17 common pread64 sys_pread64 18 common pwrite64 sys_pwrite64 @@ -62,10 +62,10 @@ 53 common socketpair sys_socketpair 54 64 setsockopt sys_setsockopt 55 64 getsockopt sys_getsockopt -56 common clone stub_clone/ptregs -57 common fork stub_fork/ptregs -58 common vfork stub_vfork/ptregs -59 64 execve stub_execve/ptregs +56 common clone sys_clone/ptregs +57 common fork sys_fork/ptregs +58 common vfork sys_vfork/ptregs +59 64 execve sys_execve/ptregs 60 common exit sys_exit 61 common wait4 sys_wait4 62 common kill sys_kill @@ -328,7 +328,7 @@ 319 common memfd_create sys_memfd_create 320 common kexec_file_load sys_kexec_file_load 321 common bpf sys_bpf -322 64 execveat stub_execveat/ptregs +322 64 execveat sys_execveat/ptregs 323 common userfaultfd sys_userfaultfd 324 common membarrier sys_membarrier 325 common mlock2 sys_mlock2 @@ -339,14 +339,14 @@ # for native 64-bit operation. # 512 x32 rt_sigaction compat_sys_rt_sigaction -513 x32 rt_sigreturn stub_x32_rt_sigreturn +513 x32 rt_sigreturn sys32_x32_rt_sigreturn 514 x32 ioctl compat_sys_ioctl 515 x32 readv compat_sys_readv 516 x32 writev compat_sys_writev 517 x32 recvfrom compat_sys_recvfrom 518 x32 sendmsg compat_sys_sendmsg 519 x32 recvmsg compat_sys_recvmsg -520 x32 execve stub_x32_execve/ptregs +520 x32 execve compat_sys_execve/ptregs 521 x32 ptrace compat_sys_ptrace 522 x32 rt_sigpending compat_sys_rt_sigpending 523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait @@ -371,4 +371,4 @@ 542 x32 getsockopt compat_sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit -545 x32 execveat stub_x32_execveat/ptregs +545 x32 execveat compat_sys_execveat/ptregs -- cgit v0.10.2 From 24d978b76ffd20ecff8a8d1c21b16fe740f8b119 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:27 -0800 Subject: x86/entry/64: Stop using int_ret_from_sys_call in ret_from_fork ret_from_fork is now open-coded and is no longer tangled up with the syscall code. This isn't so bad -- this adds very little code, and IMO the result is much easier to understand. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a0747e2a5e47084655a1e96351c545b755c41fa7.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f7050a5..cb5d940 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -390,7 +390,6 @@ END(ptregs_\func) * rdi: prev task we switched from */ ENTRY(ret_from_fork) - LOCK ; btr $TIF_FORK, TI_flags(%r8) pushq $0x0002 @@ -398,28 +397,32 @@ ENTRY(ret_from_fork) call schedule_tail /* rdi: 'prev' task parameter */ - RESTORE_EXTRA_REGS - testb $3, CS(%rsp) /* from kernel_thread? */ + jnz 1f /* - * By the time we get here, we have no idea whether our pt_regs, - * ti flags, and ti status came from the 64-bit SYSCALL fast path, - * the slow path, or one of the 32-bit compat paths. 
- * Use IRET code path to return, since it can safely handle - * all of the above. + * We came from kernel_thread. This code path is quite twisted, and + * someone should clean it up. + * + * copy_thread_tls stashes the function pointer in RBX and the + * parameter to be passed in RBP. The called function is permitted + * to call do_execve and thereby jump to user mode. */ - jnz int_ret_from_sys_call + movq RBP(%rsp), %rdi + call *RBX(%rsp) + movl $0, RAX(%rsp) /* - * We came from kernel_thread - * nb: we depend on RESTORE_EXTRA_REGS above + * Fall through as though we're exiting a syscall. This makes a + * twisted sort of sense if we just called do_execve. */ - movq %rbp, %rdi - call *%rbx - movl $0, RAX(%rsp) - RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call + +1: + movq %rsp, %rdi + call syscall_return_slowpath /* returns with IRQs disabled */ + TRACE_IRQS_ON /* user mode is traced as IRQS on */ + SWAPGS + jmp restore_regs_and_iret END(ret_from_fork) /* -- cgit v0.10.2 From 1e423bff959e48166f5b7efca01fdb0dbdf05846 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 28 Jan 2016 15:11:28 -0800 Subject: x86/entry/64: Migrate the 64-bit syscall slow path to C This is more complicated than the 32-bit and compat cases because it preserves an asm fast path for the case where the callee-saved regs aren't needed in pt_regs and no entry or exit work needs to be done. This appears to slow down fastpath syscalls by no more than one cycle on my Skylake laptop. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ce2335a4d42dc164b24132ee5e8c7716061f947b.1454022279.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 0366374..75175f9 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -344,6 +344,32 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) prepare_exit_to_usermode(regs); } +#ifdef CONFIG_X86_64 +__visible void do_syscall_64(struct pt_regs *regs) +{ + struct thread_info *ti = pt_regs_to_thread_info(regs); + unsigned long nr = regs->orig_ax; + + local_irq_enable(); + + if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) + nr = syscall_trace_enter(regs); + + /* + * NB: Native and x32 syscalls are dispatched from the same + * table. The only functional difference is the x32 bit in + * regs->orig_ax, which changes the behavior of some syscalls. + */ + if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) { + regs->ax = sys_call_table[nr & __SYSCALL_MASK]( + regs->di, regs->si, regs->dx, + regs->r10, regs->r8, regs->r9); + } + + syscall_return_slowpath(regs); +} +#endif + #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) /* * Does a 32-bit syscall. Called with IRQs on and does all entry and diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index cb5d940..567aa52 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -145,17 +145,11 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + TRACE_IRQS_OFF + /* Construct struct pt_regs on stack */ pushq $__USER_DS /* pt_regs->ss */ pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ - /* - * Re-enable interrupts. 
- * We use 'rsp_scratch' as a scratch space, hence irq-off block above - * must execute atomically in the face of possible interrupt-driven - * task preemption. We must enable interrupts only after we're done - * with using rsp_scratch: - */ - ENABLE_INTERRUPTS(CLBR_NONE) pushq %r11 /* pt_regs->flags */ pushq $__USER_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ @@ -171,9 +165,21 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) pushq %r11 /* pt_regs->r11 */ sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz tracesys + /* + * If we need to do entry work or if we guess we'll need to do + * exit work, go straight to the slow path. + */ + testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz entry_SYSCALL64_slow_path + entry_SYSCALL_64_fastpath: + /* + * Easy case: enable interrupts and issue the syscall. If the syscall + * needs pt_regs, we'll call a stub that disables interrupts again + * and jumps to the slow path. + */ + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) #if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max, %rax #else @@ -193,88 +199,43 @@ entry_SYSCALL_64_fastpath: movq %rax, RAX(%rsp) 1: -/* - * Syscall return path ending with SYSRET (fast path). - * Has incompletely filled pt_regs. - */ - LOCKDEP_SYS_EXIT - /* - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - DISABLE_INTERRUPTS(CLBR_NONE) /* - * We must check ti flags with interrupts (or at least preemption) - * off because we must *never* return to userspace without - * processing exit work that is enqueued if we're preempted here. - * In particular, returning to userspace with any of the one-shot - * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is - * very bad. + * If we get here, then we know that pt_regs is clean for SYSRET64. + * If we see that no exit work is required (which we are required + * to check with IRQs off), then we can go straight to SYSRET64. */ + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ + jnz 1f - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RIP(%rsp), %rcx - movq EFLAGS(%rsp), %r11 + LOCKDEP_SYS_EXIT + TRACE_IRQS_ON /* user mode is traced as IRQs on */ + RESTORE_C_REGS movq RSP(%rsp), %rsp - /* - * 64-bit SYSRET restores rip from rcx, - * rflags from r11 (but RF and VM bits are forced to 0), - * cs and ss are loaded from MSRs. - * Restoration of rflags re-enables interrupts. - * - * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss - * descriptor is not reinitialized. This means that we should - * avoid SYSRET with SS == NULL, which could happen if we schedule, - * exit the kernel, and re-enter using an interrupt vector. (All - * interrupt entries on x86_64 set SS to NULL.) We prevent that - * from happening by reloading SS in __switch_to. (Actually - * detecting the failure in 64-bit userspace is tricky but can be - * done.) - */ USERGS_SYSRET64 -GLOBAL(int_ret_from_sys_call_irqs_off) +1: + /* + * The fast path looked good when we started, but something changed + * along the way and we need to switch to the slow path. Calling + * raise(3) will trigger this, for example. IRQs are off. 
+ */ TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - jmp int_ret_from_sys_call - - /* Do syscall entry tracing */ -tracesys: SAVE_EXTRA_REGS movq %rsp, %rdi - call syscall_trace_enter - - /* - * Reload registers from stack in case ptrace changed them. - * We don't reload %rax because syscall_trace_enter() returned - * the value it wants us to use in the table lookup. - */ - RESTORE_C_REGS_EXCEPT_RAX -#if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max, %rax -#else - andl $__SYSCALL_MASK, %eax - cmpl $__NR_syscall_max, %eax -#endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ - movq %r10, %rcx /* fixup for C */ - call *sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) - RESTORE_EXTRA_REGS -1: - /* Use IRET because user could have changed pt_regs->foo */ + call syscall_return_slowpath /* returns with IRQs disabled */ + jmp return_from_SYSCALL_64 -/* - * Syscall return path ending with IRET. - * Has correct iret frame. - */ -GLOBAL(int_ret_from_sys_call) +entry_SYSCALL64_slow_path: + /* IRQs are off. */ SAVE_EXTRA_REGS movq %rsp, %rdi - call syscall_return_slowpath /* returns with IRQs disabled */ + call do_syscall_64 /* returns with IRQs disabled */ + +return_from_SYSCALL_64: RESTORE_EXTRA_REGS TRACE_IRQS_IRETQ /* we're about to change IF */ @@ -364,7 +325,7 @@ ENTRY(stub_ptregs_64) /* Called from fast path -- pop return address and jump to slow path */ popq %rax - jmp tracesys /* called from fast path */ + jmp entry_SYSCALL64_slow_path /* called from fast path */ 1: /* Called from C */ -- cgit v0.10.2 From 8bf862739a7786ae72409220914df960a0aa80d8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 27 Jan 2016 12:37:52 +0100 Subject: wext: fix message delay/ordering Beniamino reported that he was getting an RTM_NEWLINK message for a given interface, after the RTM_DELLINK for it. It turns out that the message is a wireless extensions message, which was sent because the interface had been connected and disconnection while it was deleted caused a wext message. For its netlink messages, wext uses RTM_NEWLINK, but the message is without all the regular rtnetlink attributes, so "ip monitor link" prints just rudimentary information: 5: wlan1: mtu 1500 qdisc mq state DOWN group default link/ether 02:00:00:00:01:00 brd ff:ff:ff:ff:ff:ff Deleted 5: wlan1: mtu 1500 qdisc noop state DOWN group default link/ether 02:00:00:00:01:00 brd ff:ff:ff:ff:ff:ff 5: wlan1: link/ether (from my hwsim reproduction) This can cause userspace to get confused since it doesn't expect an RTM_NEWLINK message after RTM_DELLINK. The reason for this is that wext schedules a worker to send out the messages, and the scheduling delay can cause the messages to get out to userspace in different order. To fix this, have wext register a netdevice notifier and flush out any pending messages when netdevice state changes. This fixes any ordering whenever the original message wasn't sent by a notifier itself. 
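For context, the events whose ordering is being fixed are produced roughly like this (a hedged driver-side sketch; the function name and payload are illustrative, only wireless_send_event() and its queue-then-schedule_work() behaviour come from the wext core):

#include <linux/if_arp.h>
#include <linux/netdevice.h>
#include <linux/string.h>
#include <linux/wireless.h>
#include <net/iw_handler.h>

/* A driver reporting "no longer associated": the event is queued on
 * net->wext_nlevents and sent later from a worker, which is why it can
 * trail the RTM_DELLINK generated synchronously for the same device. */
static void example_report_disassoc(struct net_device *dev)
{
	union iwreq_data wrqu;

	memset(&wrqu, 0, sizeof(wrqu));
	wrqu.ap_addr.sa_family = ARPHRD_ETHER;	/* zeroed BSSID == disconnected */
	wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL);
}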
Cc: stable@vger.kernel.org Reported-by: Beniamino Galvani Signed-off-by: Johannes Berg diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index c8717c1..87dd619 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -342,6 +342,39 @@ static const int compat_event_type_size[] = { /* IW event code */ +static void wireless_nlevent_flush(void) +{ + struct sk_buff *skb; + struct net *net; + + ASSERT_RTNL(); + + for_each_net(net) { + while ((skb = skb_dequeue(&net->wext_nlevents))) + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, + GFP_KERNEL); + } +} + +static int wext_netdev_notifier_call(struct notifier_block *nb, + unsigned long state, void *ptr) +{ + /* + * When a netdev changes state in any way, flush all pending messages + * to avoid them going out in a strange order, e.g. RTM_NEWLINK after + * RTM_DELLINK, or with IFF_UP after without IFF_UP during dev_close() + * or similar - all of which could otherwise happen due to delays from + * schedule_work(). + */ + wireless_nlevent_flush(); + + return NOTIFY_OK; +} + +static struct notifier_block wext_netdev_notifier = { + .notifier_call = wext_netdev_notifier_call, +}; + static int __net_init wext_pernet_init(struct net *net) { skb_queue_head_init(&net->wext_nlevents); @@ -360,7 +393,12 @@ static struct pernet_operations wext_pernet_ops = { static int __init wireless_nlevent_init(void) { - return register_pernet_subsys(&wext_pernet_ops); + int err = register_pernet_subsys(&wext_pernet_ops); + + if (err) + return err; + + return register_netdevice_notifier(&wext_netdev_notifier); } subsys_initcall(wireless_nlevent_init); @@ -368,17 +406,8 @@ subsys_initcall(wireless_nlevent_init); /* Process events generated by the wireless layer or the driver. */ static void wireless_nlevent_process(struct work_struct *work) { - struct sk_buff *skb; - struct net *net; - rtnl_lock(); - - for_each_net(net) { - while ((skb = skb_dequeue(&net->wext_nlevents))) - rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, - GFP_KERNEL); - } - + wireless_nlevent_flush(); rtnl_unlock(); } -- cgit v0.10.2 From cb150b9d23be6ee7f3a0fff29784f1c5b5ac514d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 27 Jan 2016 13:29:34 +0100 Subject: cfg80211/wext: fix message ordering Since cfg80211 frequently takes actions from its netdev notifier call, wireless extensions messages could still be ordered badly since the wext netdev notifier, since wext is built into the kernel, runs before the cfg80211 netdev notifier. For example, the following can happen: 5: wlan1: mtu 1500 qdisc mq state DOWN group default link/ether 02:00:00:00:01:00 brd ff:ff:ff:ff:ff:ff 5: wlan1: link/ether when setting the interface down causes the wext message. To also fix this, export the wireless_nlevent_flush() function and also call it from the cfg80211 notifier. 
Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index 8f81bbb..e0f4109 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -439,6 +439,12 @@ int dev_get_wireless_info(char *buffer, char **start, off_t offset, int length); /* Send a single event to user space */ void wireless_send_event(struct net_device *dev, unsigned int cmd, union iwreq_data *wrqu, const char *extra); +#ifdef CONFIG_WEXT_CORE +/* flush all previous wext events - if work is done from netdev notifiers */ +void wireless_nlevent_flush(void); +#else +static inline void wireless_nlevent_flush(void) {} +#endif /* We may need a function to send a stream of events to user space. * More on that later... */ diff --git a/net/wireless/core.c b/net/wireless/core.c index b091551..8f0bac7 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1147,6 +1147,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, return NOTIFY_DONE; } + wireless_nlevent_flush(); + return NOTIFY_OK; } diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 87dd619..b50ee5d 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -342,7 +342,7 @@ static const int compat_event_type_size[] = { /* IW event code */ -static void wireless_nlevent_flush(void) +void wireless_nlevent_flush(void) { struct sk_buff *skb; struct net *net; @@ -355,6 +355,7 @@ static void wireless_nlevent_flush(void) GFP_KERNEL); } } +EXPORT_SYMBOL_GPL(wireless_nlevent_flush); static int wext_netdev_notifier_call(struct notifier_block *nb, unsigned long state, void *ptr) -- cgit v0.10.2 From f39ea2690bd61efec97622c48323f40ed6e16317 Mon Sep 17 00:00:00 2001 From: Chris Bainbridge Date: Wed, 27 Jan 2016 15:46:18 +0000 Subject: mac80211: fix use of uninitialised values in RX aggregation Use kzalloc instead of kmalloc for struct tid_ampdu_rx to initialize the "removed" field (all others are initialized manually). That fixes: UBSAN: Undefined behaviour in net/mac80211/rx.c:932:29 load of value 2 is not a valid value for type '_Bool' CPU: 3 PID: 1134 Comm: kworker/u16:7 Not tainted 4.5.0-rc1+ #265 Workqueue: phy0 rt2x00usb_work_rxdone 0000000000000004 ffff880254a7ba50 ffffffff8181d866 0000000000000007 ffff880254a7ba78 ffff880254a7ba68 ffffffff8188422d ffffffff8379b500 ffff880254a7bab8 ffffffff81884747 0000000000000202 0000000348620032 Call Trace: [] dump_stack+0x45/0x5f [] ubsan_epilogue+0xd/0x40 [] __ubsan_handle_load_invalid_value+0x67/0x70 [] ieee80211_sta_reorder_release.isra.16+0x5ed/0x730 [] ieee80211_prepare_and_rx_handle+0xd04/0x1c00 [] __ieee80211_rx_handle_packet+0x1f3/0x750 [] ieee80211_rx_napi+0x447/0x990 While at it, convert to use sizeof(*tid_agg_rx) instead. 
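The allocation idiom involved, shown on a hypothetical structure (example_state and its fields are illustrative, not the real struct tid_ampdu_rx):

#include <linux/slab.h>

struct example_state {
	int timeout;		/* set explicitly after allocation */
	bool removed;		/* only ever tested, never explicitly written */
};

static struct example_state *example_alloc(void)
{
	/*
	 * kmalloc() would leave .removed holding slab garbage until somebody
	 * writes it, which is the kind of indeterminate load UBSAN flagged.
	 * kzalloc() zero-fills the object, and sizeof(*st) keeps the size
	 * expression correct even if the pointee type of 'st' ever changes.
	 */
	struct example_state *st = kzalloc(sizeof(*st), GFP_KERNEL);

	if (st)
		st->timeout = 100;
	return st;
}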
Fixes: 788211d81bfdf ("mac80211: fix RX A-MPDU session reorder timer deletion") Cc: stable@vger.kernel.org Signed-off-by: Chris Bainbridge [reword commit message, use sizeof(*tid_agg_rx)] Signed-off-by: Johannes Berg diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 10ad4ac..367784b 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -291,7 +291,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, } /* prepare A-MPDU MLME for Rx aggregation */ - tid_agg_rx = kmalloc(sizeof(struct tid_ampdu_rx), GFP_KERNEL); + tid_agg_rx = kzalloc(sizeof(*tid_agg_rx), GFP_KERNEL); if (!tid_agg_rx) goto end; -- cgit v0.10.2 From 5a155bb77a673dda941121142d686c3f47b49981 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 29 Jan 2016 05:57:30 +0000 Subject: perf build: Remove all condition feature check {C,LD}FLAGS 'make feature-dump' should give a stable result, so even 'NO_SOMETHING=1' is given (for babeltrace, if LIBBABELTRACE=1 is not given), we should try to detect those feature and {C,LD}FLAGS. Build or not should be controled independent. Signed-off-by: Wang Nan Cc: Jiri Olsa Cc: Li Zefan Link: http://lkml.kernel.org/r/1454047050-204993-1-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index 511141b..0045a5d 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -61,50 +61,45 @@ endif ifeq ($(LIBUNWIND_LIBS),) NO_LIBUNWIND := 1 -else - # - # For linking with debug library, run like: - # - # make DEBUG=1 LIBUNWIND_DIR=/opt/libunwind/ - # - ifdef LIBUNWIND_DIR - LIBUNWIND_CFLAGS = -I$(LIBUNWIND_DIR)/include - LIBUNWIND_LDFLAGS = -L$(LIBUNWIND_DIR)/lib - endif - LIBUNWIND_LDFLAGS += $(LIBUNWIND_LIBS) - - # Set per-feature check compilation flags - FEATURE_CHECK_CFLAGS-libunwind = $(LIBUNWIND_CFLAGS) - FEATURE_CHECK_LDFLAGS-libunwind = $(LIBUNWIND_LDFLAGS) - FEATURE_CHECK_CFLAGS-libunwind-debug-frame = $(LIBUNWIND_CFLAGS) - FEATURE_CHECK_LDFLAGS-libunwind-debug-frame = $(LIBUNWIND_LDFLAGS) endif +# +# For linking with debug library, run like: +# +# make DEBUG=1 LIBUNWIND_DIR=/opt/libunwind/ +# +ifdef LIBUNWIND_DIR + LIBUNWIND_CFLAGS = -I$(LIBUNWIND_DIR)/include + LIBUNWIND_LDFLAGS = -L$(LIBUNWIND_DIR)/lib +endif +LIBUNWIND_LDFLAGS += $(LIBUNWIND_LIBS) + +# Set per-feature check compilation flags +FEATURE_CHECK_CFLAGS-libunwind = $(LIBUNWIND_CFLAGS) +FEATURE_CHECK_LDFLAGS-libunwind = $(LIBUNWIND_LDFLAGS) +FEATURE_CHECK_CFLAGS-libunwind-debug-frame = $(LIBUNWIND_CFLAGS) +FEATURE_CHECK_LDFLAGS-libunwind-debug-frame = $(LIBUNWIND_LDFLAGS) ifeq ($(NO_PERF_REGS),0) CFLAGS += -DHAVE_PERF_REGS_SUPPORT endif -ifndef NO_LIBELF - # for linking with debug library, run like: - # make DEBUG=1 LIBDW_DIR=/opt/libdw/ - ifdef LIBDW_DIR - LIBDW_CFLAGS := -I$(LIBDW_DIR)/include - LIBDW_LDFLAGS := -L$(LIBDW_DIR)/lib - endif - FEATURE_CHECK_CFLAGS-libdw-dwarf-unwind := $(LIBDW_CFLAGS) - FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind := $(LIBDW_LDFLAGS) -ldw +# for linking with debug library, run like: +# make DEBUG=1 LIBDW_DIR=/opt/libdw/ +ifdef LIBDW_DIR + LIBDW_CFLAGS := -I$(LIBDW_DIR)/include + LIBDW_LDFLAGS := -L$(LIBDW_DIR)/lib endif +FEATURE_CHECK_CFLAGS-libdw-dwarf-unwind := $(LIBDW_CFLAGS) +FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind := $(LIBDW_LDFLAGS) -ldw -ifdef LIBBABELTRACE - # for linking with debug library, run like: - # make DEBUG=1 LIBBABELTRACE_DIR=/opt/libbabeltrace/ - ifdef LIBBABELTRACE_DIR - LIBBABELTRACE_CFLAGS := -I$(LIBBABELTRACE_DIR)/include - 
LIBBABELTRACE_LDFLAGS := -L$(LIBBABELTRACE_DIR)/lib - endif - FEATURE_CHECK_CFLAGS-libbabeltrace := $(LIBBABELTRACE_CFLAGS) - FEATURE_CHECK_LDFLAGS-libbabeltrace := $(LIBBABELTRACE_LDFLAGS) -lbabeltrace-ctf +# for linking with debug library, run like: +# make DEBUG=1 LIBBABELTRACE_DIR=/opt/libbabeltrace/ +ifdef LIBBABELTRACE_DIR + LIBBABELTRACE_CFLAGS := -I$(LIBBABELTRACE_DIR)/include + LIBBABELTRACE_LDFLAGS := -L$(LIBBABELTRACE_DIR)/lib endif +FEATURE_CHECK_CFLAGS-libbabeltrace := $(LIBBABELTRACE_CFLAGS) +FEATURE_CHECK_LDFLAGS-libbabeltrace := $(LIBBABELTRACE_LDFLAGS) -lbabeltrace-ctf FEATURE_CHECK_CFLAGS-bpf = -I. -I$(srctree)/tools/include -I$(srctree)/arch/$(ARCH)/include/uapi -I$(srctree)/include/uapi # include ARCH specific config @@ -145,28 +140,26 @@ ifdef PARSER_DEBUG $(call detected_var,PARSER_DEBUG_FLEX) endif -ifndef NO_LIBPYTHON - # Try different combinations to accommodate systems that only have - # python[2][-config] in weird combinations but always preferring - # python2 and python2-config as per pep-0394. If we catch a - # python[-config] in version 3, the version check will kill it. - PYTHON2 := $(if $(call get-executable,python2),python2,python) - override PYTHON := $(call get-executable-or-default,PYTHON,$(PYTHON2)) - PYTHON2_CONFIG := \ - $(if $(call get-executable,$(PYTHON)-config),$(PYTHON)-config,python-config) - override PYTHON_CONFIG := \ - $(call get-executable-or-default,PYTHON_CONFIG,$(PYTHON2_CONFIG)) +# Try different combinations to accommodate systems that only have +# python[2][-config] in weird combinations but always preferring +# python2 and python2-config as per pep-0394. If we catch a +# python[-config] in version 3, the version check will kill it. +PYTHON2 := $(if $(call get-executable,python2),python2,python) +override PYTHON := $(call get-executable-or-default,PYTHON,$(PYTHON2)) +PYTHON2_CONFIG := \ + $(if $(call get-executable,$(PYTHON)-config),$(PYTHON)-config,python-config) +override PYTHON_CONFIG := \ + $(call get-executable-or-default,PYTHON_CONFIG,$(PYTHON2_CONFIG)) - PYTHON_CONFIG_SQ := $(call shell-sq,$(PYTHON_CONFIG)) +PYTHON_CONFIG_SQ := $(call shell-sq,$(PYTHON_CONFIG)) - PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null) - PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null) +PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null) +PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null) - FEATURE_CHECK_CFLAGS-libpython := $(PYTHON_EMBED_CCOPTS) - FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS) - FEATURE_CHECK_CFLAGS-libpython-version := $(PYTHON_EMBED_CCOPTS) - FEATURE_CHECK_LDFLAGS-libpython-version := $(PYTHON_EMBED_LDOPTS) -endif +FEATURE_CHECK_CFLAGS-libpython := $(PYTHON_EMBED_CCOPTS) +FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS) +FEATURE_CHECK_CFLAGS-libpython-version := $(PYTHON_EMBED_CCOPTS) +FEATURE_CHECK_LDFLAGS-libpython-version := $(PYTHON_EMBED_LDOPTS) CFLAGS += -fno-omit-frame-pointer CFLAGS += -ggdb3 -- cgit v0.10.2 From 79191c89a049a9c525ce22a7d1e5674699c58818 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 29 Jan 2016 11:51:09 +0000 Subject: perf build: Use feature dump file for build-test To prevent the feature check tests to run repeately, one time per 'tests/make' target/test, this patch utilizes the previously introduced 'feature-dump' make target and FEATURES_DUMP variable, making sure that the feature checkers run only once when doing build-test for normal test cases. 
However, since standard users doesn't reuse features dump result, we'd better give an option to check their behaviors. The above feature should be used to make build-test faster only. Only utilize it for build-test. Signed-off-by: Wang Nan Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1454068269-235999-1-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Makefile b/tools/perf/Makefile index dcd9a70..e4ff0bd 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -78,7 +78,7 @@ clean: # The build-test target is not really parallel, don't print the jobs info: # build-test: - @$(MAKE) SHUF=1 -f tests/make --no-print-directory + @$(MAKE) SHUF=1 -f tests/make REUSE_FEATURES_DUMP=1 --no-print-directory # # All other targets get passed through: diff --git a/tools/perf/tests/make b/tools/perf/tests/make index f918015..7f663f4 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -15,6 +15,7 @@ else PERF := . PERF_O := $(PERF) O_OPT := +FULL_O := $(shell readlink -f $(PERF_O) || echo $(PERF_O)) ifneq ($(O),) FULL_O := $(shell readlink -f $(O) || echo $(O)) @@ -313,11 +314,43 @@ make_kernelsrc_tools: (make -C ../../tools $(PARALLEL_OPT) $(K_O_OPT) perf) > $@ 2>&1 && \ test -x $(KERNEL_O)/tools/perf/perf && rm -f $@ || (cat $@ ; false) +FEATURES_DUMP_FILE := $(FULL_O)/BUILD_TEST_FEATURE_DUMP +FEATURES_DUMP_FILE_STATIC := $(FULL_O)/BUILD_TEST_FEATURE_DUMP_STATIC + all: $(run) $(run_O) tarpkg make_kernelsrc make_kernelsrc_tools @echo OK + @rm -f $(FEATURES_DUMP_FILE) $(FEATURES_DUMP_FILE_STATIC) out: $(run_O) @echo OK + @rm -f $(FEATURES_DUMP_FILE) $(FEATURES_DUMP_FILE_STATIC) + +ifeq ($(REUSE_FEATURES_DUMP),1) +$(FEATURES_DUMP_FILE): + $(call clean) + @cmd="cd $(PERF) && make FEATURE_DUMP_COPY=$@ $(O_OPT) feature-dump"; \ + echo "- $@: $$cmd" && echo $$cmd && \ + ( eval $$cmd ) > /dev/null 2>&1 + +$(FEATURES_DUMP_FILE_STATIC): + $(call clean) + @cmd="cd $(PERF) && make FEATURE_DUMP_COPY=$@ $(O_OPT) LDFLAGS='-static' feature-dump"; \ + echo "- $@: $$cmd" && echo $$cmd && \ + ( eval $$cmd ) > /dev/null 2>&1 + +# Add feature dump dependency for run/run_O targets +$(foreach t,$(run) $(run_O),$(eval \ + $(t): $(if $(findstring make_static,$(t)),\ + $(FEATURES_DUMP_FILE_STATIC),\ + $(FEATURES_DUMP_FILE)))) + +# Append 'FEATURES_DUMP=' option to all test cases. For example: +# make_no_libbpf: NO_LIBBPF=1 --> NO_LIBBPF=1 FEATURES_DUMP=/a/b/BUILD_TEST_FEATURE_DUMP +# make_static: LDFLAGS=-static --> LDFLAGS=-static FEATURES_DUMP=/a/b/BUILD_TEST_FEATURE_DUMP_STATIC +$(foreach t,$(run),$(if $(findstring make_static,$(t)),\ + $(eval $(t) := $($(t)) FEATURES_DUMP=$(FEATURES_DUMP_FILE_STATIC)),\ + $(eval $(t) := $($(t)) FEATURES_DUMP=$(FEATURES_DUMP_FILE)))) +endif .PHONY: all $(run) $(run_O) tarpkg clean make_kernelsrc make_kernelsrc_tools endif # ifndef MK -- cgit v0.10.2 From a639a623904cc526cebd7679debf86e5c8e5590b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 29 Jan 2016 14:49:31 -0300 Subject: perf tools: Speed up build-tests by reducing the number of builds tested The 'tools/perf/test/make' makefile has in its default, 'all' target builds that will pollute the source code directory, i.e. that will not use O= variable. The 'build-test' should be run as often as possible, preferrably after each non strictly non-code commit, so speed it up by selecting just the O= targets. 
Furthermore, it tests both Makefile.perf, which is normally driven by the main Makefile, and the Makefile itself; reduce the time in half by having 'build-test' exercise just MK=Makefile, the most common case. Please run: make -C tools/perf -f tests/make from time to time to also test the in-place builds. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-jrt9utscsiqkmjy3ccufostd@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Makefile b/tools/perf/Makefile index e4ff0bd..4b68f46 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -75,10 +75,17 @@ clean: $(make) # -# The build-test target is not really parallel, don't print the jobs info: +# The build-test target is not really parallel, don't print the jobs info, +# it also uses only the tests/make targets that don't pollute the source +# repository, i.e. that uses O= or builds the tarpkg outside the source +# repo directories. +# +# For a full test, use: +# +# make -C tools/perf -f tests/make # build-test: - @$(MAKE) SHUF=1 -f tests/make REUSE_FEATURES_DUMP=1 --no-print-directory + @$(MAKE) SHUF=1 -f tests/make REUSE_FEATURES_DUMP=1 MK=Makefile --no-print-directory tarpkg out # # All other targets get passed through: -- cgit v0.10.2 From 14a05e13a044c1cd6aaa3eb1a5fcdad7b4f6c990 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Thu, 14 Jan 2016 14:46:15 -0700 Subject: perf auxtrace: Add perf_evlist pointer to *info_priv_size() On some architectures the size of the private header may depend on the number of tracers used in the session. As such, add a "struct perf_evlist *" parameter, which should contain all the required information. Also adjust the existing clients of the interface to take the new parameter into account.
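A hedged sketch of the kind of per-arch callback this change enables; the constants and the use of evlist->nr_entries are assumptions for illustration (it presumes perf's util/auxtrace.h and util/evlist.h), while the x86 implementations in the diff below simply ignore the new argument:

/* Hypothetical sizes, not taken from any real PMU driver. */
#define EXAMPLE_PRIV_HDR_SIZE		64
#define EXAMPLE_PRIV_PER_EVENT_SIZE	16

/* A private header that grows with the number of events in the session,
 * something the old zero-extra-argument callback could not express. */
static size_t example_info_priv_size(struct auxtrace_record *itr __maybe_unused,
				     struct perf_evlist *evlist)
{
	return EXAMPLE_PRIV_HDR_SIZE +
	       evlist->nr_entries * EXAMPLE_PRIV_PER_EVENT_SIZE;
}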
Signed-off-by: Mathieu Poirier Acked-by: Adrian Hunter Cc: Al Grant Cc: Chunyan Zhang Cc: linux-arm-kernel@lists.infradead.org Cc: linux-doc@vger.kernel.org Cc: Mike Leach Cc: Peter Zijlstra Cc: Rabin Vincent Cc: Tor Jeremiassen Link: http://lkml.kernel.org/r/1452807977-8069-22-git-send-email-mathieu.poirier@linaro.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/arch/x86/util/intel-bts.c b/tools/perf/arch/x86/util/intel-bts.c index 8d8150f..d66f9ad 100644 --- a/tools/perf/arch/x86/util/intel-bts.c +++ b/tools/perf/arch/x86/util/intel-bts.c @@ -60,7 +60,9 @@ struct branch { u64 misc; }; -static size_t intel_bts_info_priv_size(struct auxtrace_record *itr __maybe_unused) +static size_t +intel_bts_info_priv_size(struct auxtrace_record *itr __maybe_unused, + struct perf_evlist *evlist __maybe_unused) { return INTEL_BTS_AUXTRACE_PRIV_SIZE; } diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index f05daac..6f7d453 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -273,7 +273,9 @@ intel_pt_pmu_default_config(struct perf_pmu *intel_pt_pmu) return attr; } -static size_t intel_pt_info_priv_size(struct auxtrace_record *itr __maybe_unused) +static size_t +intel_pt_info_priv_size(struct auxtrace_record *itr __maybe_unused, + struct perf_evlist *evlist __maybe_unused) { return INTEL_PT_AUXTRACE_PRIV_SIZE; } diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 360fda0..ec164fe 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -478,10 +478,11 @@ void auxtrace_heap__pop(struct auxtrace_heap *heap) heap_array[last].ordinal); } -size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr) +size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr, + struct perf_evlist *evlist) { if (itr) - return itr->info_priv_size(itr); + return itr->info_priv_size(itr, evlist); return 0; } @@ -852,7 +853,7 @@ int perf_event__synthesize_auxtrace_info(struct auxtrace_record *itr, int err; pr_debug2("Synthesizing auxtrace information\n"); - priv_size = auxtrace_record__info_priv_size(itr); + priv_size = auxtrace_record__info_priv_size(itr, session->evlist); ev = zalloc(sizeof(struct auxtrace_info_event) + priv_size); if (!ev) return -ENOMEM; diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index b86f90db..e5a8e2d 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -293,7 +293,8 @@ struct auxtrace_record { int (*recording_options)(struct auxtrace_record *itr, struct perf_evlist *evlist, struct record_opts *opts); - size_t (*info_priv_size)(struct auxtrace_record *itr); + size_t (*info_priv_size)(struct auxtrace_record *itr, + struct perf_evlist *evlist); int (*info_fill)(struct auxtrace_record *itr, struct perf_session *session, struct auxtrace_info_event *auxtrace_info, @@ -429,7 +430,8 @@ int auxtrace_parse_snapshot_options(struct auxtrace_record *itr, int auxtrace_record__options(struct auxtrace_record *itr, struct perf_evlist *evlist, struct record_opts *opts); -size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr); +size_t auxtrace_record__info_priv_size(struct auxtrace_record *itr, + struct perf_evlist *evlist); int auxtrace_record__info_fill(struct auxtrace_record *itr, struct perf_session *session, struct auxtrace_info_event *auxtrace_info, -- cgit v0.10.2 From fd786fac78affe4a005065bc2b6f90d8f8953961 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 29 Jan 2016 17:40:51 +0000 Subject: perf buildid: 
Fix cpumode of buildid event There is a nasty confusion that, for a kernel module, dso->kernel is not necessarily DSO_TYPE_KERNEL or DSO_TYPE_GUEST_KERNEL. These two enums are for vmlinux. See thread [1]. We tried to fix this part but it is costly. machine__write_buildid_table() is another unfortunate function that falls into this trap: when issuing a buildid event for a kernel module, the cpumode it gives to the event is PERF_RECORD_MISC_USER, not PERF_RECORD_MISC_KERNEL. However, even with this bug, most of the time it doesn't cause a real problem. I found this issue when trying to use a perf before commit 3d39ac538629 ("perf machine: No need to have two DSOs lists") to parse a perf.data generated by the newest perf. [1] https://lkml.org/lkml/2015/9/21/908 Signed-off-by: Wang Nan Cc: Jiri Olsa Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1454089251-203152-1-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index 6a7e273..b28100e 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -211,6 +211,7 @@ static int machine__write_buildid_table(struct machine *machine, int fd) dsos__for_each_with_build_id(pos, &machine->dsos.head) { const char *name; size_t name_len; + bool in_kernel = false; if (!pos->hit) continue; @@ -227,8 +228,11 @@ static int machine__write_buildid_table(struct machine *machine, int fd) name_len = pos->long_name_len + 1; } + in_kernel = pos->kernel || + is_kernel_module(name, + PERF_RECORD_MISC_CPUMODE_UNKNOWN); err = write_buildid(name, name_len, pos->build_id, machine->pid, - pos->kernel ? kmisc : umisc, fd); + in_kernel ? kmisc : umisc, fd); if (err) break; } -- cgit v0.10.2 From 6a7d550e8b2eeb380ab85d9bc53571123b98345b Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 25 Jan 2016 09:55:53 +0000 Subject: perf test: Check environment before start real BPF test Copying perf to an old kernel system results in: # perf test bpf 37: Test BPF filter : 37.1: Test basic BPF filtering : FAILED! 37.2: Test BPF prologue generation : Skip However, when the kernel doesn't support a test case it should return 'Skip'; 'FAILED!' should be reserved for when the kernel supports a feature that then fails to work as advertised. This patch checks the environment before the real test case.
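The skip-versus-fail policy being applied, as a small hypothetical test skeleton. probe_kernel_support() and run_real_test() are placeholders for the check_env() probe and the real work in the diff below; the return values mirror the TEST_OK/TEST_FAIL/TEST_SKIP convention of tools/perf/tests/tests.h:

#include <stdbool.h>

#define TEST_OK		0
#define TEST_FAIL	(-1)
#define TEST_SKIP	(-2)

/* Stands in for check_env(): try the feature once, cheaply. */
static bool probe_kernel_support(void)
{
	return false;
}

static int run_real_test(void)
{
	return 0;
}

int test__example(int subtest)
{
	(void)subtest;

	if (!probe_kernel_support())
		return TEST_SKIP;	/* kernel lacks the feature: not a perf failure */

	return run_real_test() == 0 ? TEST_OK : TEST_FAIL;
}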
Signed-off-by: Wang Nan Suggested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: He Kuang Cc: Jiri Olsa Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Will Deacon Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1453715801-7732-7-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index 952ca99..4aed5cb 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -1,7 +1,11 @@ #include #include +#include #include #include +#include +#include +#include #include "tests.h" #include "llvm.h" #include "debug.h" @@ -243,6 +247,36 @@ const char *test__bpf_subtest_get_desc(int i) return bpf_testcase_table[i].desc; } +static int check_env(void) +{ + int err; + unsigned int kver_int; + char license[] = "GPL"; + + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }; + + err = fetch_kernel_version(&kver_int, NULL, 0); + if (err) { + pr_debug("Unable to get kernel version\n"); + return err; + } + + err = bpf_load_program(BPF_PROG_TYPE_KPROBE, insns, + sizeof(insns) / sizeof(insns[0]), + license, kver_int, NULL, 0); + if (err < 0) { + pr_err("Missing basic BPF support, skip this test: %s\n", + strerror(errno)); + return err; + } + close(err); + + return 0; +} + int test__bpf(int i) { int err; @@ -255,6 +289,9 @@ int test__bpf(int i) return TEST_SKIP; } + if (check_env()) + return TEST_SKIP; + err = __test__bpf(i); return err; } -- cgit v0.10.2 From 8fd34e1cce180eb0c726e7ed88f7b70c11c38e21 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 25 Jan 2016 09:55:55 +0000 Subject: perf test: Improve bp_signal Will Deacon [1] has some question on patch [2]. This patch improves test__bp_signal so we can test: 1. A watchpoint and a breakpoint that fire on the same instruction 2. Nested signals Test result: On x86_64 and ARM64 (result are similar with patch [2] on ARM64): # ./perf test -v signal 17: Test breakpoint overflow signal handler : --- start --- test child forked, pid 10213 count1 1, count2 3, count3 2, overflow 3, overflows_2 3 test child finished with 0 ---- end ---- Test breakpoint overflow signal handler: Ok So at least 2 cases Will doubted are handled correctly. [1] http://lkml.kernel.org/g/20160104165535.GI1616@arm.com [2] http://lkml.kernel.org/g/1450921362-198371-1-git-send-email-wangnan0@huawei.com Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: He Kuang Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Will Deacon Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1453715801-7732-9-git-send-email-wangnan0@huawei.com Signed-off-by: Jiri Olsa Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/tests/bp_signal.c b/tools/perf/tests/bp_signal.c index fb80c9e..1d1bb48 100644 --- a/tools/perf/tests/bp_signal.c +++ b/tools/perf/tests/bp_signal.c @@ -29,14 +29,59 @@ static int fd1; static int fd2; +static int fd3; static int overflows; +static int overflows_2; + +volatile long the_var; + + +/* + * Use ASM to ensure watchpoint and breakpoint can be triggered + * at one instruction. 
+ */ +#if defined (__x86_64__) +extern void __test_function(volatile long *ptr); +asm ( + ".globl __test_function\n" + "__test_function:\n" + "incq (%rdi)\n" + "ret\n"); +#elif defined (__aarch64__) +extern void __test_function(volatile long *ptr); +asm ( + ".globl __test_function\n" + "__test_function:\n" + "str x30, [x0]\n" + "ret\n"); + +#else +static void __test_function(volatile long *ptr) +{ + *ptr = 0x1234; +} +#endif __attribute__ ((noinline)) static int test_function(void) { + __test_function(&the_var); + the_var++; return time(NULL); } +static void sig_handler_2(int signum __maybe_unused, + siginfo_t *oh __maybe_unused, + void *uc __maybe_unused) +{ + overflows_2++; + if (overflows_2 > 10) { + ioctl(fd1, PERF_EVENT_IOC_DISABLE, 0); + ioctl(fd2, PERF_EVENT_IOC_DISABLE, 0); + ioctl(fd3, PERF_EVENT_IOC_DISABLE, 0); + } +} + static void sig_handler(int signum __maybe_unused, siginfo_t *oh __maybe_unused, void *uc __maybe_unused) @@ -54,10 +99,11 @@ static void sig_handler(int signum __maybe_unused, */ ioctl(fd1, PERF_EVENT_IOC_DISABLE, 0); ioctl(fd2, PERF_EVENT_IOC_DISABLE, 0); + ioctl(fd3, PERF_EVENT_IOC_DISABLE, 0); } } -static int bp_event(void *fn, int setup_signal) +static int __event(bool is_x, void *addr, int signal) { struct perf_event_attr pe; int fd; @@ -67,8 +113,8 @@ static int bp_event(void *fn, int setup_signal) pe.size = sizeof(struct perf_event_attr); pe.config = 0; - pe.bp_type = HW_BREAKPOINT_X; - pe.bp_addr = (unsigned long) fn; + pe.bp_type = is_x ? HW_BREAKPOINT_X : HW_BREAKPOINT_W; + pe.bp_addr = (unsigned long) addr; pe.bp_len = sizeof(long); pe.sample_period = 1; @@ -86,17 +132,25 @@ static int bp_event(void *fn, int setup_signal) return TEST_FAIL; } - if (setup_signal) { - fcntl(fd, F_SETFL, O_RDWR|O_NONBLOCK|O_ASYNC); - fcntl(fd, F_SETSIG, SIGIO); - fcntl(fd, F_SETOWN, getpid()); - } + fcntl(fd, F_SETFL, O_RDWR|O_NONBLOCK|O_ASYNC); + fcntl(fd, F_SETSIG, signal); + fcntl(fd, F_SETOWN, getpid()); ioctl(fd, PERF_EVENT_IOC_RESET, 0); return fd; } +static int bp_event(void *addr, int signal) +{ + return __event(true, addr, signal); +} + +static int wp_event(void *addr, int signal) +{ + return __event(false, addr, signal); +} + static long long bp_count(int fd) { long long count; @@ -114,7 +168,7 @@ static long long bp_count(int fd) int test__bp_signal(int subtest __maybe_unused) { struct sigaction sa; - long long count1, count2; + long long count1, count2, count3; /* setup SIGIO signal handler */ memset(&sa, 0, sizeof(struct sigaction)); @@ -126,21 +180,52 @@ int test__bp_signal(int subtest __maybe_unused) return TEST_FAIL; } + sa.sa_sigaction = (void *) sig_handler_2; + if (sigaction(SIGUSR1, &sa, NULL) < 0) { + pr_debug("failed setting up signal handler 2\n"); + return TEST_FAIL; + } + /* * We create following events: * - * fd1 - breakpoint event on test_function with SIGIO + * fd1 - breakpoint event on __test_function with SIGIO * signal configured. We should get signal * notification each time the breakpoint is hit * - * fd2 - breakpoint event on sig_handler without SIGIO + * fd2 - breakpoint event on sig_handler with SIGUSR1 + * configured. We should get SIGUSR1 each time when + * breakpoint is hit + * + * fd3 - watchpoint event on __test_function with SIGIO * configured. 
* * Following processing should happen: - * - execute test_function - * - fd1 event breakpoint hit -> count1 == 1 - * - SIGIO is delivered -> overflows == 1 - * - fd2 event breakpoint hit -> count2 == 1 + * Exec: Action: Result: + * incq (%rdi) - fd1 event breakpoint hit -> count1 == 1 + * - SIGIO is delivered + * sig_handler - fd2 event breakpoint hit -> count2 == 1 + * - SIGUSR1 is delivered + * sig_handler_2 -> overflows_2 == 1 (nested signal) + * sys_rt_sigreturn - return from sig_handler_2 + * overflows++ -> overflows = 1 + * sys_rt_sigreturn - return from sig_handler + * incq (%rdi) - fd3 event watchpoint hit -> count3 == 1 (wp and bp in one insn) + * - SIGIO is delivered + * sig_handler - fd2 event breakpoint hit -> count2 == 2 + * - SIGUSR1 is delivered + * sig_handler_2 -> overflows_2 == 2 (nested signal) + * sys_rt_sigreturn - return from sig_handler_2 + * overflows++ -> overflows = 2 + * sys_rt_sigreturn - return from sig_handler + * the_var++ - fd3 event watchpoint hit -> count3 == 2 (standalone watchpoint) + * - SIGIO is delivered + * sig_handler - fd2 event breakpoint hit -> count2 == 3 + * - SIGUSR1 is delivered + * sig_handler_2 -> overflows_2 == 3 (nested signal) + * sys_rt_sigreturn - return from sig_handler_2 + * overflows++ -> overflows == 3 + * sys_rt_sigreturn - return from sig_handler * * The test case check following error conditions: * - we get stuck in signal handler because of debug @@ -152,11 +237,13 @@ int test__bp_signal(int subtest __maybe_unused) * */ - fd1 = bp_event(test_function, 1); - fd2 = bp_event(sig_handler, 0); + fd1 = bp_event(__test_function, SIGIO); + fd2 = bp_event(sig_handler, SIGUSR1); + fd3 = wp_event((void *)&the_var, SIGIO); ioctl(fd1, PERF_EVENT_IOC_ENABLE, 0); ioctl(fd2, PERF_EVENT_IOC_ENABLE, 0); + ioctl(fd3, PERF_EVENT_IOC_ENABLE, 0); /* * Kick off the test by trigering 'fd1' @@ -166,15 +253,18 @@ int test__bp_signal(int subtest __maybe_unused) ioctl(fd1, PERF_EVENT_IOC_DISABLE, 0); ioctl(fd2, PERF_EVENT_IOC_DISABLE, 0); + ioctl(fd3, PERF_EVENT_IOC_DISABLE, 0); count1 = bp_count(fd1); count2 = bp_count(fd2); + count3 = bp_count(fd3); close(fd1); close(fd2); + close(fd3); - pr_debug("count1 %lld, count2 %lld, overflow %d\n", - count1, count2, overflows); + pr_debug("count1 %lld, count2 %lld, count3 %lld, overflow %d, overflows_2 %d\n", + count1, count2, count3, overflows, overflows_2); if (count1 != 1) { if (count1 == 11) @@ -183,12 +273,18 @@ int test__bp_signal(int subtest __maybe_unused) pr_debug("failed: wrong count for bp1%lld\n", count1); } - if (overflows != 1) + if (overflows != 3) pr_debug("failed: wrong overflow hit\n"); - if (count2 != 1) + if (overflows_2 != 3) + pr_debug("failed: wrong overflow_2 hit\n"); + + if (count2 != 3) pr_debug("failed: wrong count for bp2\n"); - return count1 == 1 && overflows == 1 && count2 == 1 ? + if (count3 != 2) + pr_debug("failed: wrong count for bp3\n"); + + return count1 == 1 && overflows == 3 && count2 == 3 && overflows_2 == 3 && count3 == 2 ? TEST_OK : TEST_FAIL; } -- cgit v0.10.2 From 37b20151efe002a4a43532d3791d11d39d080248 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 25 Jan 2016 09:56:13 +0000 Subject: perf tools: Move timestamp creation to util Timestamp generation becomes a public available helper. Which will be used by 'perf record', help it output to split output file based on time. For example: perf.data.2015122620363710 perf.data.2015122620364092 perf.data.2015122620365423 ... 
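A sketch of the kind of caller the helper is being exported for; the helper name below is hypothetical (the real 'perf record' output-switching code is not part of this patch) and it only relies on the fetch_current_timestamp() and scnprintf() signatures visible in the diff below:

/* Hypothetical switch-output naming helper: derive names like
 * perf.data.2015122620363710 from the current time. */
static int example_switch_output_name(char *path, size_t sz)
{
	char timestamp[32];

	if (fetch_current_timestamp(timestamp, sizeof(timestamp)))
		return -1;

	scnprintf(path, sz, "perf.data.%s", timestamp);
	return 0;
}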
Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: He Kuang Cc: Jiri Olsa Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Will Deacon Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1453715801-7732-27-git-send-email-wangnan0@huawei.com Signed-off-by: He Kuang Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c index d93bff7..632efc6 100644 --- a/tools/perf/builtin-buildid-cache.c +++ b/tools/perf/builtin-buildid-cache.c @@ -38,19 +38,7 @@ static int build_id_cache__kcore_buildid(const char *proc_dir, char *sbuildid) static int build_id_cache__kcore_dir(char *dir, size_t sz) { - struct timeval tv; - struct tm tm; - char dt[32]; - - if (gettimeofday(&tv, NULL) || !localtime_r(&tv.tv_sec, &tm)) - return -1; - - if (!strftime(dt, sizeof(dt), "%Y%m%d%H%M%S", &tm)) - return -1; - - scnprintf(dir, sz, "%s%02u", dt, (unsigned)tv.tv_usec / 10000); - - return 0; + return fetch_current_timestamp(dir, sz); } static bool same_kallsyms_reloc(const char *from_dir, char *to_dir) diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 7a2da7e..b9e2843 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -701,3 +701,20 @@ bool is_regular_file(const char *file) return S_ISREG(st.st_mode); } + +int fetch_current_timestamp(char *buf, size_t sz) +{ + struct timeval tv; + struct tm tm; + char dt[32]; + + if (gettimeofday(&tv, NULL) || !localtime_r(&tv.tv_sec, &tm)) + return -1; + + if (!strftime(dt, sizeof(dt), "%Y%m%d%H%M%S", &tm)) + return -1; + + scnprintf(buf, sz, "%s%02u", dt, (unsigned)tv.tv_usec / 10000); + + return 0; +} diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 61650f0..a861581 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -344,5 +344,6 @@ int fetch_kernel_version(unsigned int *puint, const char *perf_tip(const char *dirpath); bool is_regular_file(const char *file); +int fetch_current_timestamp(char *buf, size_t sz); #endif /* GIT_COMPAT_UTIL_H */ -- cgit v0.10.2 From d2db9a98c3058a45780f7fcd0cc8584858cf6b29 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 25 Jan 2016 09:56:19 +0000 Subject: perf record: Use OPT_BOOLEAN_SET for buildid cache related options 'perf record' knows whether buildid cache is enabled (via --no-no-buildid-cache) deliberately. Buildid cache can be turned off in some situations. Output switching support needs this feature to turn off buildid cache by default. 
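What the extra *_set flags make possible, as a hedged sketch of a consumer. The switch_output trigger and the chosen default are illustrative only; the fields are the ones added to struct record in the diff below:

/* Hypothetical policy code: when output switching is requested, default the
 * buildid cache to off, but never override an explicit user choice. */
static void example_apply_buildid_defaults(struct record *rec, bool switch_output)
{
	if (switch_output && !rec->no_buildid_cache_set)
		rec->no_buildid_cache = true;
	/* An explicit -N or --no-no-buildid-cache always wins, because
	 * parse-options sets .no_buildid_cache_set for us in that case. */
}

The same pattern applies to the no_buildid/no_buildid_set pair.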
Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: He Kuang Cc: Jiri Olsa Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Will Deacon Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1453715801-7732-33-git-send-email-wangnan0@huawei.com Signed-off-by: He Kuang Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 319712a..0ee0d5c 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -49,7 +49,9 @@ struct record { const char *progname; int realtime_prio; bool no_buildid; + bool no_buildid_set; bool no_buildid_cache; + bool no_buildid_cache_set; bool buildid_all; unsigned long long samples; }; @@ -1097,10 +1099,12 @@ struct option __record_options[] = { OPT_BOOLEAN('P', "period", &record.opts.period, "Record the sample period"), OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples, "don't sample"), - OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache, - "do not update the buildid cache"), - OPT_BOOLEAN('B', "no-buildid", &record.no_buildid, - "do not collect buildids in perf.data"), + OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache, + &record.no_buildid_cache_set, + "do not update the buildid cache"), + OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid, + &record.no_buildid_set, + "do not collect buildids in perf.data"), OPT_CALLBACK('G', "cgroup", &record.evlist, "name", "monitor event in cgroup name only", parse_cgroups), -- cgit v0.10.2 From 162607ea20fafb4a76234ebe4314cd733345482e Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Thu, 28 Jan 2016 12:03:04 +0530 Subject: perf kvm/{x86,s390}: Remove dependency on uapi/kvm_perf.h Its better to remove the dependency on uapi/kvm_perf.h to allow dynamic discovery of kvm events (if its needed). To do this, some extern variables have been introduced with which we can keep the generic functions generic. Signed-off-by: Hemant Kumar Acked-by: Alexander Yarygin Acked-by: David Ahern Cc: Michael Ellerman Cc: Naveen N. 
Rao Cc: Paul Mackerras Cc: Scott Wood Cc: Srikar Dronamraju Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1453962787-15376-1-git-send-email-hemant@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/arch/s390/util/kvm-stat.c b/tools/perf/arch/s390/util/kvm-stat.c index a5dbc07..b85a94b 100644 --- a/tools/perf/arch/s390/util/kvm-stat.c +++ b/tools/perf/arch/s390/util/kvm-stat.c @@ -10,7 +10,7 @@ */ #include "../../util/kvm-stat.h" -#include +#include define_exit_reasons_table(sie_exit_reasons, sie_intercept_code); define_exit_reasons_table(sie_icpt_insn_codes, icpt_insn_codes); @@ -18,6 +18,12 @@ define_exit_reasons_table(sie_sigp_order_codes, sigp_order_codes); define_exit_reasons_table(sie_diagnose_codes, diagnose_codes); define_exit_reasons_table(sie_icpt_prog_codes, icpt_prog_codes); +const char *vcpu_id_str = "id"; +const int decode_str_len = 40; +const char *kvm_exit_reason = "icptcode"; +const char *kvm_entry_trace = "kvm:kvm_s390_sie_enter"; +const char *kvm_exit_trace = "kvm:kvm_s390_sie_exit"; + static void event_icpt_insn_get_key(struct perf_evsel *evsel, struct perf_sample *sample, struct event_key *key) diff --git a/tools/perf/arch/x86/util/kvm-stat.c b/tools/perf/arch/x86/util/kvm-stat.c index 14e4e66..babefda 100644 --- a/tools/perf/arch/x86/util/kvm-stat.c +++ b/tools/perf/arch/x86/util/kvm-stat.c @@ -1,5 +1,7 @@ #include "../../util/kvm-stat.h" -#include +#include +#include +#include define_exit_reasons_table(vmx_exit_reasons, VMX_EXIT_REASONS); define_exit_reasons_table(svm_exit_reasons, SVM_EXIT_REASONS); @@ -11,6 +13,12 @@ static struct kvm_events_ops exit_events = { .name = "VM-EXIT" }; +const char *vcpu_id_str = "vcpu_id"; +const int decode_str_len = 20; +const char *kvm_exit_reason = "exit_reason"; +const char *kvm_entry_trace = "kvm:kvm_entry"; +const char *kvm_exit_trace = "kvm:kvm_exit"; + /* * For the mmio events, we treat: * the time of MMIO write: kvm_mmio(KVM_TRACE_MMIO_WRITE...) -> kvm_entry @@ -65,7 +73,7 @@ static void mmio_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused, struct event_key *key, char *decode) { - scnprintf(decode, DECODE_STR_LEN, "%#lx:%s", + scnprintf(decode, decode_str_len, "%#lx:%s", (unsigned long)key->key, key->info == KVM_TRACE_MMIO_WRITE ? "W" : "R"); } @@ -109,7 +117,7 @@ static void ioport_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused, struct event_key *key, char *decode) { - scnprintf(decode, DECODE_STR_LEN, "%#llx:%s", + scnprintf(decode, decode_str_len, "%#llx:%s", (unsigned long long)key->key, key->info ? 
"POUT" : "PIN"); } diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 4418d92..ab5645c 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -30,7 +30,6 @@ #include #ifdef HAVE_KVM_STAT_SUPPORT -#include #include "util/kvm-stat.h" void exit_event_get_key(struct perf_evsel *evsel, @@ -38,12 +37,12 @@ void exit_event_get_key(struct perf_evsel *evsel, struct event_key *key) { key->info = 0; - key->key = perf_evsel__intval(evsel, sample, KVM_EXIT_REASON); + key->key = perf_evsel__intval(evsel, sample, kvm_exit_reason); } bool kvm_exit_event(struct perf_evsel *evsel) { - return !strcmp(evsel->name, KVM_EXIT_TRACE); + return !strcmp(evsel->name, kvm_exit_trace); } bool exit_event_begin(struct perf_evsel *evsel, @@ -59,7 +58,7 @@ bool exit_event_begin(struct perf_evsel *evsel, bool kvm_entry_event(struct perf_evsel *evsel) { - return !strcmp(evsel->name, KVM_ENTRY_TRACE); + return !strcmp(evsel->name, kvm_entry_trace); } bool exit_event_end(struct perf_evsel *evsel, @@ -91,7 +90,7 @@ void exit_event_decode_key(struct perf_kvm_stat *kvm, const char *exit_reason = get_exit_reason(kvm, key->exit_reasons, key->key); - scnprintf(decode, DECODE_STR_LEN, "%s", exit_reason); + scnprintf(decode, decode_str_len, "%s", exit_reason); } static bool register_kvm_events_ops(struct perf_kvm_stat *kvm) @@ -357,7 +356,7 @@ static bool handle_end_event(struct perf_kvm_stat *kvm, time_diff = sample->time - time_begin; if (kvm->duration && time_diff > kvm->duration) { - char decode[DECODE_STR_LEN]; + char decode[decode_str_len]; kvm->events_ops->decode_key(kvm, &event->key, decode); if (!skip_event(decode)) { @@ -385,7 +384,8 @@ struct vcpu_event_record *per_vcpu_record(struct thread *thread, return NULL; } - vcpu_record->vcpu_id = perf_evsel__intval(evsel, sample, VCPU_ID); + vcpu_record->vcpu_id = perf_evsel__intval(evsel, sample, + vcpu_id_str); thread__set_priv(thread, vcpu_record); } @@ -574,7 +574,7 @@ static void show_timeofday(void) static void print_result(struct perf_kvm_stat *kvm) { - char decode[DECODE_STR_LEN]; + char decode[decode_str_len]; struct kvm_event *event; int vcpu = kvm->trace_vcpu; @@ -585,7 +585,7 @@ static void print_result(struct perf_kvm_stat *kvm) pr_info("\n\n"); print_vcpu_info(kvm); - pr_info("%*s ", DECODE_STR_LEN, kvm->events_ops->name); + pr_info("%*s ", decode_str_len, kvm->events_ops->name); pr_info("%10s ", "Samples"); pr_info("%9s ", "Samples%"); @@ -604,7 +604,7 @@ static void print_result(struct perf_kvm_stat *kvm) min = get_event_min(event, vcpu); kvm->events_ops->decode_key(kvm, &event->key, decode); - pr_info("%*s ", DECODE_STR_LEN, decode); + pr_info("%*s ", decode_str_len, decode); pr_info("%10llu ", (unsigned long long)ecount); pr_info("%8.2f%% ", (double)ecount / kvm->total_count * 100); pr_info("%8.2f%% ", (double)etime / kvm->total_time * 100); diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h index ae825d4..dd55548 100644 --- a/tools/perf/util/kvm-stat.h +++ b/tools/perf/util/kvm-stat.h @@ -136,5 +136,10 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid); extern const char * const kvm_events_tp[]; extern struct kvm_reg_events_ops kvm_reg_events_ops[]; extern const char * const kvm_skip_events[]; +extern const char *vcpu_id_str; +extern const int decode_str_len; +extern const char *kvm_exit_reason; +extern const char *kvm_entry_trace; +extern const char *kvm_exit_trace; #endif /* __PERF_KVM_STAT_H */ -- cgit v0.10.2 From 48deaa74fcdad516a94fe38a4af706747d9e4745 Mon Sep 17 00:00:00 2001 From: 
Hemant Kumar Date: Thu, 28 Jan 2016 12:03:05 +0530 Subject: perf kvm/{x86,s390}: Remove const from kvm_events_tp This patch removes the "const" qualifier from kvm_events_tp declaration to account for the fact that some architectures may need to update this variable dynamically. For instance, powerpc will need to update this variable dynamically depending on the machine type. Signed-off-by: Hemant Kumar Acked-by: David Ahern Cc: Alexander Yarygin Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Paul Mackerras Cc: Scott Wood Cc: Srikar Dronamraju Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1453962787-15376-2-git-send-email-hemant@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/arch/s390/util/kvm-stat.c b/tools/perf/arch/s390/util/kvm-stat.c index b85a94b..ed57df2 100644 --- a/tools/perf/arch/s390/util/kvm-stat.c +++ b/tools/perf/arch/s390/util/kvm-stat.c @@ -79,7 +79,7 @@ static struct kvm_events_ops exit_events = { .name = "VM-EXIT" }; -const char * const kvm_events_tp[] = { +const char *kvm_events_tp[] = { "kvm:kvm_s390_sie_enter", "kvm:kvm_s390_sie_exit", "kvm:kvm_s390_intercept_instruction", diff --git a/tools/perf/arch/x86/util/kvm-stat.c b/tools/perf/arch/x86/util/kvm-stat.c index babefda..b63d4be 100644 --- a/tools/perf/arch/x86/util/kvm-stat.c +++ b/tools/perf/arch/x86/util/kvm-stat.c @@ -129,7 +129,7 @@ static struct kvm_events_ops ioport_events = { .name = "IO Port Access" }; -const char * const kvm_events_tp[] = { +const char *kvm_events_tp[] = { "kvm:kvm_entry", "kvm:kvm_exit", "kvm:kvm_mmio", diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h index dd55548..c965dc8 100644 --- a/tools/perf/util/kvm-stat.h +++ b/tools/perf/util/kvm-stat.h @@ -133,7 +133,7 @@ bool kvm_entry_event(struct perf_evsel *evsel); */ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid); -extern const char * const kvm_events_tp[]; +extern const char *kvm_events_tp[]; extern struct kvm_reg_events_ops kvm_reg_events_ops[]; extern const char * const kvm_skip_events[]; extern const char *vcpu_id_str; -- cgit v0.10.2 From 066d3593e1b14690dc1131d50cacbb0b7eb3f160 Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Thu, 28 Jan 2016 12:03:06 +0530 Subject: perf kvm/powerpc: Port perf kvm stat to powerpc perf kvm can be used to analyze guest exit reasons. This support already exists in x86. Hence, porting it to powerpc. - To trace KVM events : perf kvm stat record If many guests are running, we can track for a specific guest by using --pid as in : perf kvm stat record --pid - To see the results : perf kvm stat report The result shows the number of exits (from the guest context to host/hypervisor context) grouped by their respective exit reasons with their frequency. Since, different powerpc machines have different KVM tracepoints, this patch discovers the available tracepoints dynamically and accordingly looks for them. If any single tracepoint is not present, this support won't be enabled for reporting. To record, this will fail if any of the events we are looking to record isn't available. Right now, its only supported on PowerPC Book3S_HV architectures. To analyze the different exits, group them and present them (in a slight descriptive way) to the user, we need a mapping between the "exit code" (dumped in the kvm_guest_exit tracepoint data) and to its related Interrupt vector description (exit reason). This patch adds this mapping in book3s_hv_exits.h. 
It records on two available KVM tracepoints for book3s_hv: "kvm_hv:kvm_guest_exit" and "kvm_hv:kvm_guest_enter". Here is a sample o/p: # pgrep qemu 19378 60515 2 Guests are running on the host. # perf kvm stat record -a ^C[ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 4.153 MB perf.data.guest (39624 samples) ] # perf kvm stat report -p 60515 Analyze events for pid(s) 60515, all VCPUs: VM-EXIT Samples Samples% Time% MinTime MaxTime Avg time SYSCALL 9141 63.67% 7.49% 1.26us 5782.39us 9.87us (+- 6.46%) H_DATA_STORAGE 4114 28.66% 5.07% 1.72us 4597.68us 14.84us (+-20.06%) HV_DECREMENTER 418 2.91% 4.26% 0.70us 30002.22us 122.58us (+-70.29%) EXTERNAL 392 2.73% 0.06% 0.64us 104.10us 1.94us (+-18.83%) RETURN_TO_HOST 287 2.00% 83.11% 1.53us 124240.15us 3486.52us (+-16.81%) H_INST_STORAGE 5 0.03% 0.00% 1.88us 3.73us 2.39us (+-14.20%) Total Samples:14357, Total events handled time:1203918.42us. Signed-off-by: Hemant Kumar Cc: Alexander Yarygin Cc: David Ahern Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Paul Mackerras Cc: Scott Wood Cc: Srikar Dronamraju Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1453962787-15376-3-git-send-email-hemant@linux.vnet.ibm.com Signed-off-by: Srikar Dronamraju Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile index 7fbca17..9f9cea3 100644 --- a/tools/perf/arch/powerpc/Makefile +++ b/tools/perf/arch/powerpc/Makefile @@ -1,3 +1,5 @@ ifndef NO_DWARF PERF_HAVE_DWARF_REGS := 1 endif + +HAVE_KVM_STAT_SUPPORT := 1 diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build index 7b8b0d1..c8fe207 100644 --- a/tools/perf/arch/powerpc/util/Build +++ b/tools/perf/arch/powerpc/util/Build @@ -1,5 +1,6 @@ libperf-y += header.o libperf-y += sym-handling.o +libperf-y += kvm-stat.o libperf-$(CONFIG_DWARF) += dwarf-regs.o libperf-$(CONFIG_DWARF) += skip-callchain-idx.o diff --git a/tools/perf/arch/powerpc/util/book3s_hv_exits.h b/tools/perf/arch/powerpc/util/book3s_hv_exits.h new file mode 100644 index 0000000..e68ba2d --- /dev/null +++ b/tools/perf/arch/powerpc/util/book3s_hv_exits.h @@ -0,0 +1,33 @@ +#ifndef ARCH_PERF_BOOK3S_HV_EXITS_H +#define ARCH_PERF_BOOK3S_HV_EXITS_H + +/* + * PowerPC Interrupt vectors : exit code to name mapping + */ + +#define kvm_trace_symbol_exit \ + {0x0, "RETURN_TO_HOST"}, \ + {0x100, "SYSTEM_RESET"}, \ + {0x200, "MACHINE_CHECK"}, \ + {0x300, "DATA_STORAGE"}, \ + {0x380, "DATA_SEGMENT"}, \ + {0x400, "INST_STORAGE"}, \ + {0x480, "INST_SEGMENT"}, \ + {0x500, "EXTERNAL"}, \ + {0x501, "EXTERNAL_LEVEL"}, \ + {0x502, "EXTERNAL_HV"}, \ + {0x600, "ALIGNMENT"}, \ + {0x700, "PROGRAM"}, \ + {0x800, "FP_UNAVAIL"}, \ + {0x900, "DECREMENTER"}, \ + {0x980, "HV_DECREMENTER"}, \ + {0xc00, "SYSCALL"}, \ + {0xd00, "TRACE"}, \ + {0xe00, "H_DATA_STORAGE"}, \ + {0xe20, "H_INST_STORAGE"}, \ + {0xe40, "H_EMUL_ASSIST"}, \ + {0xf00, "PERFMON"}, \ + {0xf20, "ALTIVEC"}, \ + {0xf40, "VSX"} + +#endif diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c new file mode 100644 index 0000000..27bc559b8 --- /dev/null +++ b/tools/perf/arch/powerpc/util/kvm-stat.c @@ -0,0 +1,107 @@ +#include "util/kvm-stat.h" +#include "util/parse-events.h" + +#include "book3s_hv_exits.h" + +#define NR_TPS 2 + +const char *vcpu_id_str = "vcpu_id"; +const int decode_str_len = 40; +const char *kvm_entry_trace = "kvm_hv:kvm_guest_enter"; +const char *kvm_exit_trace = "kvm_hv:kvm_guest_exit"; + 
+define_exit_reasons_table(hv_exit_reasons, kvm_trace_symbol_exit); + +/* Tracepoints specific to ppc_book3s_hv */ +const char *ppc_book3s_hv_kvm_tp[] = { + "kvm_hv:kvm_guest_enter", + "kvm_hv:kvm_guest_exit", +}; + +/* 1 extra placeholder for NULL */ +const char *kvm_events_tp[NR_TPS + 1]; +const char *kvm_exit_reason; + +static struct kvm_events_ops exit_events = { + .is_begin_event = exit_event_begin, + .is_end_event = exit_event_end, + .decode_key = exit_event_decode_key, + .name = "VM-EXIT" +}; + +struct kvm_reg_events_ops kvm_reg_events_ops[] = { + { .name = "vmexit", .ops = &exit_events }, + { NULL, NULL }, +}; + +const char * const kvm_skip_events[] = { + NULL, +}; + + +static int is_tracepoint_available(const char *str, struct perf_evlist *evlist) +{ + struct parse_events_error err; + int ret; + + err.str = NULL; + ret = parse_events(evlist, str, &err); + if (err.str) + pr_err("%s : %s\n", str, err.str); + return ret; +} + +static int ppc__setup_book3s_hv(struct perf_kvm_stat *kvm, + struct perf_evlist *evlist) +{ + const char **events_ptr; + int i, nr_tp = 0, err = -1; + + /* Check for book3s_hv tracepoints */ + for (events_ptr = ppc_book3s_hv_kvm_tp; *events_ptr; events_ptr++) { + err = is_tracepoint_available(*events_ptr, evlist); + if (err) + return -1; + nr_tp++; + } + + for (i = 0; i < nr_tp; i++) + kvm_events_tp[i] = ppc_book3s_hv_kvm_tp[i]; + + kvm_events_tp[i] = NULL; + kvm_exit_reason = "trap"; + kvm->exit_reasons = hv_exit_reasons; + kvm->exit_reasons_isa = "HV"; + + return 0; +} + +/* Wrapper to setup kvm tracepoints */ +static int ppc__setup_kvm_tp(struct perf_kvm_stat *kvm) +{ + struct perf_evlist *evlist = perf_evlist__new(); + + if (evlist == NULL) + return -ENOMEM; + + /* Right now, only supported on book3s_hv */ + return ppc__setup_book3s_hv(kvm, evlist); +} + +int setup_kvm_events_tp(struct perf_kvm_stat *kvm) +{ + return ppc__setup_kvm_tp(kvm); +} + +int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid __maybe_unused) +{ + int ret; + + ret = ppc__setup_kvm_tp(kvm); + if (ret) { + kvm->exit_reasons = NULL; + kvm->exit_reasons_isa = NULL; + } + + return ret; +} diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index ab5645c..bff6664 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1132,6 +1132,11 @@ exit: _p; \ }) +int __weak setup_kvm_events_tp(struct perf_kvm_stat *kvm __maybe_unused) +{ + return 0; +} + static int kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv) { @@ -1148,7 +1153,14 @@ kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv) NULL }; const char * const *events_tp; + int ret; + events_tp_size = 0; + ret = setup_kvm_events_tp(kvm); + if (ret < 0) { + pr_err("Unable to setup the kvm tracepoints\n"); + return ret; + } for (events_tp = kvm_events_tp; *events_tp; events_tp++) events_tp_size++; @@ -1377,6 +1389,12 @@ static int kvm_events_live(struct perf_kvm_stat *kvm, /* * generate the event list */ + err = setup_kvm_events_tp(kvm); + if (err < 0) { + pr_err("Unable to setup the kvm tracepoints\n"); + return err; + } + kvm->evlist = kvm_live_event_list(); if (kvm->evlist == NULL) { err = -1; diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h index c965dc8..d01e735 100644 --- a/tools/perf/util/kvm-stat.h +++ b/tools/perf/util/kvm-stat.h @@ -122,6 +122,7 @@ void exit_event_decode_key(struct perf_kvm_stat *kvm, bool kvm_exit_event(struct perf_evsel *evsel); bool kvm_entry_event(struct perf_evsel *evsel); +int setup_kvm_events_tp(struct 
perf_kvm_stat *kvm); #define define_exit_reasons_table(name, symbols) \ static struct exit_reasons_table name[] = { \ -- cgit v0.10.2 From 78e6c39b231a8e31e193534fdbe29291b7fd8f37 Mon Sep 17 00:00:00 2001 From: Hemant Kumar Date: Thu, 28 Jan 2016 12:03:07 +0530 Subject: perf kvm/powerpc: Add support for HCALL reasons Powerpc provides hcall events that also provides insights into guest behaviour. Enhance perf kvm stat to record and analyze hcall events. - To trace hcall events : perf kvm stat record - To show the results : perf kvm stat report --event=hcall The result shows the number of hypervisor calls from the guest grouped by their respective reasons displayed with the frequency. This patch makes use of two additional tracepoints "kvm_hv:kvm_hcall_enter" and "kvm_hv:kvm_hcall_exit". To map the hcall codes to their respective names, it needs a mapping. Such mapping is added in this patch in book3s_hcalls.h. # pgrep qemu A sample output : 19378 60515 2 VMs running. # perf kvm stat record -a ^C[ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 4.153 MB perf.data.guest (39624 samples) ] # perf kvm stat report -p 60515 --event=hcall Analyze events for all VMs, all VCPUs: HCALL-EVENT Samples Samples% Time% MinTime MaxTime AvgTime H_IPI 822 66.08% 88.10% 0.63us 11.38us 2.05us (+- 1.42%) H_SEND_CRQ 144 11.58% 3.77% 0.41us 0.88us 0.50us (+- 1.47%) H_VIO_SIGNAL 118 9.49% 2.86% 0.37us 0.83us 0.47us (+- 1.43%) H_PUT_TERM_CHAR 76 6.11% 2.07% 0.37us 0.90us 0.52us (+- 2.43%) H_GET_TERM_CHAR 74 5.95% 2.23% 0.37us 1.70us 0.58us (+- 4.77%) H_RTAS 6 0.48% 0.85% 1.10us 9.25us 2.70us (+-48.57%) H_PERFMON 4 0.32% 0.12% 0.41us 0.96us 0.59us (+-20.92%) Total Samples:1244, Total events handled time:1916.69us. Signed-off-by: Hemant Kumar Cc: Alexander Yarygin Cc: David Ahern Cc: Michael Ellerman Cc: Naveen N. 
Rao Cc: Paul Mackerras Cc: Scott Wood Cc: Srikar Dronamraju Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1453962787-15376-4-git-send-email-hemant@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/arch/powerpc/util/book3s_hcalls.h b/tools/perf/arch/powerpc/util/book3s_hcalls.h new file mode 100644 index 0000000..0dd6b7f --- /dev/null +++ b/tools/perf/arch/powerpc/util/book3s_hcalls.h @@ -0,0 +1,123 @@ +#ifndef ARCH_PERF_BOOK3S_HV_HCALLS_H +#define ARCH_PERF_BOOK3S_HV_HCALLS_H + +/* + * PowerPC HCALL codes : hcall code to name mapping + */ +#define kvm_trace_symbol_hcall \ + {0x4, "H_REMOVE"}, \ + {0x8, "H_ENTER"}, \ + {0xc, "H_READ"}, \ + {0x10, "H_CLEAR_MOD"}, \ + {0x14, "H_CLEAR_REF"}, \ + {0x18, "H_PROTECT"}, \ + {0x1c, "H_GET_TCE"}, \ + {0x20, "H_PUT_TCE"}, \ + {0x24, "H_SET_SPRG0"}, \ + {0x28, "H_SET_DABR"}, \ + {0x2c, "H_PAGE_INIT"}, \ + {0x30, "H_SET_ASR"}, \ + {0x34, "H_ASR_ON"}, \ + {0x38, "H_ASR_OFF"}, \ + {0x3c, "H_LOGICAL_CI_LOAD"}, \ + {0x40, "H_LOGICAL_CI_STORE"}, \ + {0x44, "H_LOGICAL_CACHE_LOAD"}, \ + {0x48, "H_LOGICAL_CACHE_STORE"}, \ + {0x4c, "H_LOGICAL_ICBI"}, \ + {0x50, "H_LOGICAL_DCBF"}, \ + {0x54, "H_GET_TERM_CHAR"}, \ + {0x58, "H_PUT_TERM_CHAR"}, \ + {0x5c, "H_REAL_TO_LOGICAL"}, \ + {0x60, "H_HYPERVISOR_DATA"}, \ + {0x64, "H_EOI"}, \ + {0x68, "H_CPPR"}, \ + {0x6c, "H_IPI"}, \ + {0x70, "H_IPOLL"}, \ + {0x74, "H_XIRR"}, \ + {0x78, "H_MIGRATE_DMA"}, \ + {0x7c, "H_PERFMON"}, \ + {0xdc, "H_REGISTER_VPA"}, \ + {0xe0, "H_CEDE"}, \ + {0xe4, "H_CONFER"}, \ + {0xe8, "H_PROD"}, \ + {0xec, "H_GET_PPP"}, \ + {0xf0, "H_SET_PPP"}, \ + {0xf4, "H_PURR"}, \ + {0xf8, "H_PIC"}, \ + {0xfc, "H_REG_CRQ"}, \ + {0x100, "H_FREE_CRQ"}, \ + {0x104, "H_VIO_SIGNAL"}, \ + {0x108, "H_SEND_CRQ"}, \ + {0x110, "H_COPY_RDMA"}, \ + {0x114, "H_REGISTER_LOGICAL_LAN"}, \ + {0x118, "H_FREE_LOGICAL_LAN"}, \ + {0x11c, "H_ADD_LOGICAL_LAN_BUFFER"}, \ + {0x120, "H_SEND_LOGICAL_LAN"}, \ + {0x124, "H_BULK_REMOVE"}, \ + {0x130, "H_MULTICAST_CTRL"}, \ + {0x134, "H_SET_XDABR"}, \ + {0x138, "H_STUFF_TCE"}, \ + {0x13c, "H_PUT_TCE_INDIRECT"}, \ + {0x14c, "H_CHANGE_LOGICAL_LAN_MAC"}, \ + {0x150, "H_VTERM_PARTNER_INFO"}, \ + {0x154, "H_REGISTER_VTERM"}, \ + {0x158, "H_FREE_VTERM"}, \ + {0x15c, "H_RESET_EVENTS"}, \ + {0x160, "H_ALLOC_RESOURCE"}, \ + {0x164, "H_FREE_RESOURCE"}, \ + {0x168, "H_MODIFY_QP"}, \ + {0x16c, "H_QUERY_QP"}, \ + {0x170, "H_REREGISTER_PMR"}, \ + {0x174, "H_REGISTER_SMR"}, \ + {0x178, "H_QUERY_MR"}, \ + {0x17c, "H_QUERY_MW"}, \ + {0x180, "H_QUERY_HCA"}, \ + {0x184, "H_QUERY_PORT"}, \ + {0x188, "H_MODIFY_PORT"}, \ + {0x18c, "H_DEFINE_AQP1"}, \ + {0x190, "H_GET_TRACE_BUFFER"}, \ + {0x194, "H_DEFINE_AQP0"}, \ + {0x198, "H_RESIZE_MR"}, \ + {0x19c, "H_ATTACH_MCQP"}, \ + {0x1a0, "H_DETACH_MCQP"}, \ + {0x1a4, "H_CREATE_RPT"}, \ + {0x1a8, "H_REMOVE_RPT"}, \ + {0x1ac, "H_REGISTER_RPAGES"}, \ + {0x1b0, "H_DISABLE_AND_GETC"}, \ + {0x1b4, "H_ERROR_DATA"}, \ + {0x1b8, "H_GET_HCA_INFO"}, \ + {0x1bc, "H_GET_PERF_COUNT"}, \ + {0x1c0, "H_MANAGE_TRACE"}, \ + {0x1d4, "H_FREE_LOGICAL_LAN_BUFFER"}, \ + {0x1d8, "H_POLL_PENDING"}, \ + {0x1e4, "H_QUERY_INT_STATE"}, \ + {0x244, "H_ILLAN_ATTRIBUTES"}, \ + {0x250, "H_MODIFY_HEA_QP"}, \ + {0x254, "H_QUERY_HEA_QP"}, \ + {0x258, "H_QUERY_HEA"}, \ + {0x25c, "H_QUERY_HEA_PORT"}, \ + {0x260, "H_MODIFY_HEA_PORT"}, \ + {0x264, "H_REG_BCMC"}, \ + {0x268, "H_DEREG_BCMC"}, \ + {0x26c, "H_REGISTER_HEA_RPAGES"}, \ + {0x270, "H_DISABLE_AND_GET_HEA"}, \ + {0x274, "H_GET_HEA_INFO"}, \ + {0x278, "H_ALLOC_HEA_RESOURCE"}, \ + {0x284, 
"H_ADD_CONN"}, \ + {0x288, "H_DEL_CONN"}, \ + {0x298, "H_JOIN"}, \ + {0x2a4, "H_VASI_STATE"}, \ + {0x2b0, "H_ENABLE_CRQ"}, \ + {0x2b8, "H_GET_EM_PARMS"}, \ + {0x2d0, "H_SET_MPP"}, \ + {0x2d4, "H_GET_MPP"}, \ + {0x2ec, "H_HOME_NODE_ASSOCIATIVITY"}, \ + {0x2f4, "H_BEST_ENERGY"}, \ + {0x2fc, "H_XIRR_X"}, \ + {0x300, "H_RANDOM"}, \ + {0x304, "H_COP"}, \ + {0x314, "H_GET_MPP_X"}, \ + {0x31c, "H_SET_MODE"}, \ + {0xf000, "H_RTAS"} \ + +#endif diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c index 27bc559b8..74eee30 100644 --- a/tools/perf/arch/powerpc/util/kvm-stat.c +++ b/tools/perf/arch/powerpc/util/kvm-stat.c @@ -1,9 +1,11 @@ #include "util/kvm-stat.h" #include "util/parse-events.h" +#include "util/debug.h" #include "book3s_hv_exits.h" +#include "book3s_hcalls.h" -#define NR_TPS 2 +#define NR_TPS 4 const char *vcpu_id_str = "vcpu_id"; const int decode_str_len = 40; @@ -11,17 +13,77 @@ const char *kvm_entry_trace = "kvm_hv:kvm_guest_enter"; const char *kvm_exit_trace = "kvm_hv:kvm_guest_exit"; define_exit_reasons_table(hv_exit_reasons, kvm_trace_symbol_exit); +define_exit_reasons_table(hcall_reasons, kvm_trace_symbol_hcall); /* Tracepoints specific to ppc_book3s_hv */ const char *ppc_book3s_hv_kvm_tp[] = { "kvm_hv:kvm_guest_enter", "kvm_hv:kvm_guest_exit", + "kvm_hv:kvm_hcall_enter", + "kvm_hv:kvm_hcall_exit", + NULL, }; /* 1 extra placeholder for NULL */ const char *kvm_events_tp[NR_TPS + 1]; const char *kvm_exit_reason; +static void hcall_event_get_key(struct perf_evsel *evsel, + struct perf_sample *sample, + struct event_key *key) +{ + key->info = 0; + key->key = perf_evsel__intval(evsel, sample, "req"); +} + +static const char *get_hcall_exit_reason(u64 exit_code) +{ + struct exit_reasons_table *tbl = hcall_reasons; + + while (tbl->reason != NULL) { + if (tbl->exit_code == exit_code) + return tbl->reason; + tbl++; + } + + pr_debug("Unknown hcall code: %lld\n", + (unsigned long long)exit_code); + return "UNKNOWN"; +} + +static bool hcall_event_end(struct perf_evsel *evsel, + struct perf_sample *sample __maybe_unused, + struct event_key *key __maybe_unused) +{ + return (!strcmp(evsel->name, kvm_events_tp[3])); +} + +static bool hcall_event_begin(struct perf_evsel *evsel, + struct perf_sample *sample, struct event_key *key) +{ + if (!strcmp(evsel->name, kvm_events_tp[2])) { + hcall_event_get_key(evsel, sample, key); + return true; + } + + return false; +} +static void hcall_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused, + struct event_key *key, + char *decode) +{ + const char *hcall_reason = get_hcall_exit_reason(key->key); + + scnprintf(decode, decode_str_len, "%s", hcall_reason); +} + +static struct kvm_events_ops hcall_events = { + .is_begin_event = hcall_event_begin, + .is_end_event = hcall_event_end, + .decode_key = hcall_event_decode_key, + .name = "HCALL-EVENT", +}; + static struct kvm_events_ops exit_events = { .is_begin_event = exit_event_begin, .is_end_event = exit_event_end, @@ -31,6 +93,7 @@ static struct kvm_events_ops exit_events = { struct kvm_reg_events_ops kvm_reg_events_ops[] = { { .name = "vmexit", .ops = &exit_events }, + { .name = "hcall", .ops = &hcall_events }, { NULL, NULL }, }; -- cgit v0.10.2 From 814568db641f6587c1e98a3a85f214cb6a30fe10 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 29 Jan 2016 17:51:04 -0300 Subject: perf build: Align the names of the build tests: $ make -C tools/perf build-test make[1]: Entering directory `/home/acme/git/linux/tools/perf' make_pure_O: cd . 
&& make -f Makefile O=/tmp/tmp.mPx0Cmik3f DESTDIR=/tmp/tmp.U0SUmVbtJm make_clean_all_O: cd . && make -f Makefile O=/tmp/tmp.Yl5UzhTU7T DESTDIR=/tmp/tmp.fop1E4jdER clean all make_debug_O: cd . && make -f Makefile O=/tmp/tmp.pMn2ozBoXC DESTDIR=/tmp/tmp.azxhDp5sEp DEBUG=1 make_no_libperl_O: cd . && make -f Makefile O=/tmp/tmp.qJPiINMtA7 DESTDIR=/tmp/tmp.KNMrLeGDxZ NO_LIBPERL=1 More needs to be done to make it more compact, i.e. elide the '-f Makefile', remove that 'cd . &&', move the DESTDIR= and O= to the end, as they don't convey that much information besides the fact that they are being set to some random directory just for this build, move the meat, i.e. the meaningful feature disabling bits to the start, etc. Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-wir3w3o4f1nmbgcxgnx8cj9c@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/tests/make b/tools/perf/tests/make index 7f663f4..cc72b67 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -261,6 +261,8 @@ run := $(shell shuf -e $(run)) run_O := $(shell shuf -e $(run_O)) endif +max_width := $(shell echo $(run_O) | sed 's/ /\n/g' | wc -L) + ifdef DEBUG d := $(info run $(run)) d := $(info run_O $(run_O)) @@ -274,7 +276,7 @@ $(run): $(call clean) @TMP_DEST=$$(mktemp -d); \ cmd="cd $(PERF) && make -f $(MK) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST $($@)"; \ - echo "- $@: $$cmd" && echo $$cmd > $@ && \ + printf "%*.*s: %s\n" $(max_width) $(max_width) "$@" "$$cmd" && echo $$cmd > $@ && \ ( eval $$cmd ) >> $@ 2>&1; \ echo " test: $(call test,$@)" >> $@ 2>&1; \ $(call test,$@) && \ @@ -285,7 +287,7 @@ $(run_O): @TMP_O=$$(mktemp -d); \ TMP_DEST=$$(mktemp -d); \ cmd="cd $(PERF) && make -f $(MK) $(PARALLEL_OPT) O=$$TMP_O DESTDIR=$$TMP_DEST $($(patsubst %_O,%,$@))"; \ - echo "- $@: $$cmd" && echo $$cmd > $@ && \ + printf "%*.*s: %s\n" $(max_width) $(max_width) "$@" "$$cmd" && echo $$cmd > $@ && \ ( eval $$cmd ) >> $@ 2>&1 && \ echo " test: $(call test_O,$@)" >> $@ 2>&1; \ $(call test_O,$@) && \ -- cgit v0.10.2 From 5e9ebbd87a99ecc6abb74325b0ac63c46891f6f3 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Sat, 30 Jan 2016 14:01:12 +0600 Subject: x86/boot: Micro-optimize reset_early_page_tables() Save 25 bytes of code and make the bootup a tiny bit faster: text data bss dec filename 9735144 4970776 15474688 30180608 vmlinux.old 9735119 4970776 15474688 30180583 vmlinux Signed-off-by: Alexander Kuleshov Cc: Alexander Popov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454140872-16926-1-git-send-email-kuleshovmail@gmail.com [ Fixed various small details. 
] Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index f129a9a..35843ca 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -40,13 +40,8 @@ pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) { - unsigned long i; - - for (i = 0; i < PTRS_PER_PGD-1; i++) - early_level4_pgt[i].pgd = 0; - + memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); next_early_pgt = 0; - write_cr3(__pa_nodebug(early_level4_pgt)); } @@ -54,7 +49,6 @@ static void __init reset_early_page_tables(void) int __init early_make_pgtable(unsigned long address) { unsigned long physaddr = address - __PAGE_OFFSET; - unsigned long i; pgdval_t pgd, *pgd_p; pudval_t pud, *pud_p; pmdval_t pmd, *pmd_p; @@ -81,8 +75,7 @@ again: } pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; - for (i = 0; i < PTRS_PER_PUD; i++) - pud_p[i] = 0; + memset(pud_p, 0, sizeof(pud_p) * PTRS_PER_PUD); *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pud_p += pud_index(address); @@ -97,8 +90,7 @@ again: } pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; - for (i = 0; i < PTRS_PER_PMD; i++) - pmd_p[i] = 0; + memset(pmd_p, 0, sizeof(pmd_p) * PTRS_PER_PMD); *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pmd = (physaddr & PMD_MASK) + early_pmd_flags; -- cgit v0.10.2 From 9babd5c8caa6e62c116efc3a64a09f65af4112b0 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:17 +0100 Subject: resource: Add System RAM resource type The IORESOURCE_MEM I/O resource type is used for all types of memory-mapped ranges, ex. System RAM, System ROM, Video RAM, Persistent Memory, PCI Bus, PCI MMCONFIG, ACPI Tables, IOAPIC, reserved, and so on. This requires walk_system_ram_range(), walk_system_ram_res(), and region_intersects() to use strcmp() against string "System RAM" to search for System RAM ranges in the iomem table, which is inefficient. __ioremap_caller() and reserve_memtype() on x86, for instance, call walk_system_ram_range() for every request to check if a given range is in System RAM ranges. However, adding a new I/O resource type for System RAM is not a viable option, see [1]. There are approx. 3800 references to IORESOURCE_MEM in the kernel/drivers, which makes it very difficult to distinguish their usages between new type and IORESOURCE_MEM. The I/O resource types are also used by the PNP subsystem. Therefore, introduce an extended I/O resource type, IORESOURCE_SYSTEM_RAM, which consists of IORESOURCE_MEM and a new modifier flag IORESOURCE_SYSRAM, see [2]. To keep the code 'if (resource_type(r) == IORESOURCE_MEM)' still working for System RAM, resource_ext_type() is added for extracting extended type bits. Link[1]: http://lkml.kernel.org/r/1449168859.9855.54.camel@hpe.com Link[2]: http://lkml.kernel.org/r/CA+55aFy4WQrWexC4u2LxX9Mw2NVoznw7p3Yh=iF4Xtf7zKWnRw@mail.gmail.com Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Hanjun Guo Cc: Jakub Sitnicki Cc: Jiang Liu Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Rafael J. 
Wysocki Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 24bea08..4b65d94 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -49,12 +49,19 @@ struct resource { #define IORESOURCE_WINDOW 0x00200000 /* forwarded by bridge */ #define IORESOURCE_MUXED 0x00400000 /* Resource is software muxed */ +#define IORESOURCE_EXT_TYPE_BITS 0x01000000 /* Resource extended types */ +#define IORESOURCE_SYSRAM 0x01000000 /* System RAM (modifier) */ + #define IORESOURCE_EXCLUSIVE 0x08000000 /* Userland may not map this resource */ + #define IORESOURCE_DISABLED 0x10000000 #define IORESOURCE_UNSET 0x20000000 /* No address assigned yet */ #define IORESOURCE_AUTO 0x40000000 #define IORESOURCE_BUSY 0x80000000 /* Driver has marked this resource busy */ +/* I/O resource extended types */ +#define IORESOURCE_SYSTEM_RAM (IORESOURCE_MEM|IORESOURCE_SYSRAM) + /* PnP IRQ specific bits (IORESOURCE_BITS) */ #define IORESOURCE_IRQ_HIGHEDGE (1<<0) #define IORESOURCE_IRQ_LOWEDGE (1<<1) @@ -170,6 +177,10 @@ static inline unsigned long resource_type(const struct resource *res) { return res->flags & IORESOURCE_TYPE_BITS; } +static inline unsigned long resource_ext_type(const struct resource *res) +{ + return res->flags & IORESOURCE_EXT_TYPE_BITS; +} /* True iff r1 completely contains r2 */ static inline bool resource_contains(struct resource *r1, struct resource *r2) { -- cgit v0.10.2 From a3650d53ba16ec412185abb98f231e9ba6bcdc65 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:18 +0100 Subject: resource: Handle resource flags properly I/O resource flags consist of I/O resource types and modifier bits. Therefore, checking an I/O resource type in 'flags' must be performed with a bitwise operation. Fix find_next_iomem_res() and region_intersects() that simply compare 'flags' against a given value. Also change __request_region() to set 'res->flags' from resource_type() and resource_ext_type() of the parent, so that children nodes will inherit the extended I/O resource type. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jakub Sitnicki Cc: Jiang Liu Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Toshi Kani Cc: Vinod Koul Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/kernel/resource.c b/kernel/resource.c index 09c0597..96afc80 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -358,7 +358,7 @@ static int find_next_iomem_res(struct resource *res, char *name, read_lock(&resource_lock); for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { - if (p->flags != res->flags) + if ((p->flags & res->flags) != res->flags) continue; if (name && strcmp(p->name, name)) continue; @@ -519,7 +519,8 @@ int region_intersects(resource_size_t start, size_t size, const char *name) read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { - bool is_type = strcmp(p->name, name) == 0 && p->flags == flags; + bool is_type = strcmp(p->name, name) == 0 && + ((p->flags & flags) == flags); if (start >= p->start && start <= p->end) is_type ? 
type++ : other++; @@ -1071,7 +1072,7 @@ struct resource * __request_region(struct resource *parent, res->name = name; res->start = start; res->end = start + n - 1; - res->flags = resource_type(parent); + res->flags = resource_type(parent) | resource_ext_type(parent); res->flags |= IORESOURCE_BUSY | flags; write_lock(&resource_lock); -- cgit v0.10.2 From 43ee493bde78da00deaf5737925365c691a036ad Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:19 +0100 Subject: resource: Add I/O resource descriptor walk_iomem_res() and region_intersects() still need to use strcmp() for searching a resource entry by @name in the iomem table. This patch introduces I/O resource descriptor 'desc' in struct resource for the iomem search interfaces. Drivers can assign their unique descriptor to a range when they support the search interfaces. Otherwise, 'desc' is set to IORES_DESC_NONE (0). This avoids changing most of the drivers as they typically allocate resource entries statically, or by calling alloc_resource(), kzalloc(), or alloc_bootmem_low(), which set the field to zero by default. A later patch will address some drivers that use kmalloc() without zero'ing the field. Also change release_mem_region_adjustable() to set 'desc' when its resource entry gets separated. Other resource interfaces are also changed to initialize 'desc' explicitly although alloc_resource() sets it to 0. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jakub Sitnicki Cc: Jiang Liu Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 4b65d94..983bea0 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -20,6 +20,7 @@ struct resource { resource_size_t end; const char *name; unsigned long flags; + unsigned long desc; struct resource *parent, *sibling, *child; }; @@ -112,6 +113,22 @@ struct resource { /* PCI control bits. Shares IORESOURCE_BITS with above PCI ROM. */ #define IORESOURCE_PCI_FIXED (1<<4) /* Do not move resource */ +/* + * I/O Resource Descriptors + * + * Descriptors are used by walk_iomem_res_desc() and region_intersects() + * for searching a specific resource range in the iomem table. Assign + * a new descriptor when a resource range supports the search interfaces. + * Otherwise, resource.desc must be set to IORES_DESC_NONE (0). 
+ */ +enum { + IORES_DESC_NONE = 0, + IORES_DESC_CRASH_KERNEL = 1, + IORES_DESC_ACPI_TABLES = 2, + IORES_DESC_ACPI_NV_STORAGE = 3, + IORES_DESC_PERSISTENT_MEMORY = 4, + IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5, +}; /* helpers to define resources */ #define DEFINE_RES_NAMED(_start, _size, _name, _flags) \ @@ -120,6 +137,7 @@ struct resource { .end = (_start) + (_size) - 1, \ .name = (_name), \ .flags = (_flags), \ + .desc = IORES_DESC_NONE, \ } #define DEFINE_RES_IO_NAMED(_start, _size, _name) \ diff --git a/kernel/resource.c b/kernel/resource.c index 96afc80..61512e9 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -949,6 +949,7 @@ static void __init __reserve_region_with_split(struct resource *root, res->start = start; res->end = end; res->flags = IORESOURCE_BUSY; + res->desc = IORES_DESC_NONE; while (1) { @@ -983,6 +984,7 @@ static void __init __reserve_region_with_split(struct resource *root, next_res->start = conflict->end + 1; next_res->end = end; next_res->flags = IORESOURCE_BUSY; + next_res->desc = IORES_DESC_NONE; } } else { res->start = conflict->end + 1; @@ -1074,6 +1076,7 @@ struct resource * __request_region(struct resource *parent, res->end = start + n - 1; res->flags = resource_type(parent) | resource_ext_type(parent); res->flags |= IORESOURCE_BUSY | flags; + res->desc = IORES_DESC_NONE; write_lock(&resource_lock); @@ -1238,6 +1241,7 @@ int release_mem_region_adjustable(struct resource *parent, new_res->start = end + 1; new_res->end = res->end; new_res->flags = res->flags; + new_res->desc = res->desc; new_res->parent = res->parent; new_res->sibling = res->sibling; new_res->child = NULL; @@ -1413,6 +1417,7 @@ static int __init reserve_setup(char *str) res->start = io_start; res->end = io_start + io_num - 1; res->flags = IORESOURCE_BUSY; + res->desc = IORES_DESC_NONE; res->child = NULL; if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0) reserved = x+1; -- cgit v0.10.2 From f33b14a4b96b185634848046f54fb0d5028566a9 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:20 +0100 Subject: x86/e820: Set System RAM type and descriptor Change e820_reserve_resources() to set 'flags' and 'desc' from e820 types. Set E820_RESERVED_KERN and E820_RAM's (System RAM) io resource type to IORESOURCE_SYSTEM_RAM. Do the same for "Kernel data", "Kernel code", and "Kernel bss", which are child nodes of System RAM. I/O resource descriptor is set to 'desc' for entries that are (and will be) target ranges of walk_iomem_res() and region_intersects(). Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jiri Kosina Cc: Joerg Roedel Cc: Juergen Gross Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Mark Salter Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: Toshi Kani Cc: WANG Chao Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 569c1e4..837365f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -925,6 +925,41 @@ static const char *e820_type_to_string(int e820_type) } } +static unsigned long e820_type_to_iomem_type(int e820_type) +{ + switch (e820_type) { + case E820_RESERVED_KERN: + case E820_RAM: + return IORESOURCE_SYSTEM_RAM; + case E820_ACPI: + case E820_NVS: + case E820_UNUSABLE: + case E820_PRAM: + case E820_PMEM: + default: + return IORESOURCE_MEM; + } +} + +static unsigned long e820_type_to_iores_desc(int e820_type) +{ + switch (e820_type) { + case E820_ACPI: + return IORES_DESC_ACPI_TABLES; + case E820_NVS: + return IORES_DESC_ACPI_NV_STORAGE; + case E820_PMEM: + return IORES_DESC_PERSISTENT_MEMORY; + case E820_PRAM: + return IORES_DESC_PERSISTENT_MEMORY_LEGACY; + case E820_RESERVED_KERN: + case E820_RAM: + case E820_UNUSABLE: + default: + return IORES_DESC_NONE; + } +} + static bool do_mark_busy(u32 type, struct resource *res) { /* this is the legacy bios/dos rom-shadow + mmio region */ @@ -967,7 +1002,8 @@ void __init e820_reserve_resources(void) res->start = e820.map[i].addr; res->end = end; - res->flags = IORESOURCE_MEM; + res->flags = e820_type_to_iomem_type(e820.map[i].type); + res->desc = e820_type_to_iores_desc(e820.map[i].type); /* * don't register the region that could be conflicted with diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d3d80e6..aa52c10 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -152,21 +152,21 @@ static struct resource data_resource = { .name = "Kernel data", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; static struct resource code_resource = { .name = "Kernel code", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; static struct resource bss_resource = { .name = "Kernel bss", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; -- cgit v0.10.2 From 03cb525eb25018cf5f3da01d0f1391fc8b37805a Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:21 +0100 Subject: ia64: Set System RAM type and descriptor Change efi_initialize_iomem_resources() to set 'flags' and 'desc' for EFI memory types. IORESOURCE_SYSRAM, a modifier bit, is set for System RAM as IORESOURCE_MEM is already set. IORESOURCE_SYSTEM_RAM is defined as (IORESOURCE_MEM|IORESOURCE_SYSRAM). I/O resource descriptor is set for "ACPI Non-volatile Storage" and "Persistent Memory". Also set IORESOURCE_SYSTEM_RAM for "Kernel code", "Kernel data", and "Kernel bss". Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Acked-by: Tony Luck Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Matt Fleming Cc: Peter Zijlstra Cc: Rusty Russell Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-arch@vger.kernel.org Cc: linux-efi Cc: linux-ia64@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index caae3f4..300dac3 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -1178,7 +1178,7 @@ efi_initialize_iomem_resources(struct resource *code_resource, efi_memory_desc_t *md; u64 efi_desc_size; char *name; - unsigned long flags; + unsigned long flags, desc; efi_map_start = __va(ia64_boot_param->efi_memmap); efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; @@ -1193,6 +1193,8 @@ efi_initialize_iomem_resources(struct resource *code_resource, continue; flags = IORESOURCE_MEM | IORESOURCE_BUSY; + desc = IORES_DESC_NONE; + switch (md->type) { case EFI_MEMORY_MAPPED_IO: @@ -1207,14 +1209,17 @@ efi_initialize_iomem_resources(struct resource *code_resource, if (md->attribute & EFI_MEMORY_WP) { name = "System ROM"; flags |= IORESOURCE_READONLY; - } else if (md->attribute == EFI_MEMORY_UC) + } else if (md->attribute == EFI_MEMORY_UC) { name = "Uncached RAM"; - else + } else { name = "System RAM"; + flags |= IORESOURCE_SYSRAM; + } break; case EFI_ACPI_MEMORY_NVS: name = "ACPI Non-volatile Storage"; + desc = IORES_DESC_ACPI_NV_STORAGE; break; case EFI_UNUSABLE_MEMORY: @@ -1224,6 +1229,7 @@ efi_initialize_iomem_resources(struct resource *code_resource, case EFI_PERSISTENT_MEMORY: name = "Persistent Memory"; + desc = IORES_DESC_PERSISTENT_MEMORY; break; case EFI_RESERVED_TYPE: @@ -1246,6 +1252,7 @@ efi_initialize_iomem_resources(struct resource *code_resource, res->start = md->phys_addr; res->end = md->phys_addr + efi_md_size(md) - 1; res->flags = flags; + res->desc = desc; if (insert_resource(&iomem_resource, res) < 0) kfree(res); diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 4f118b0..2029a38 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -80,17 +80,17 @@ unsigned long vga_console_membase; static struct resource data_resource = { .name = "Kernel data", - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; static struct resource code_resource = { .name = "Kernel code", - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; static struct resource bss_resource = { .name = "Kernel bss", - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; unsigned long ia64_max_cacheline_size; -- cgit v0.10.2 From 35d98e93fe6a7ab612f6b389ce42c1dc135d6eef Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:22 +0100 Subject: arch: Set IORESOURCE_SYSTEM_RAM flag for System RAM Set IORESOURCE_SYSTEM_RAM in flags of resource ranges with "System RAM", "Kernel code", "Kernel data", and "Kernel bss". Note that: - IORESOURCE_SYSRAM (i.e. modifier bit) is set in flags when IORESOURCE_MEM is already set. IORESOURCE_SYSTEM_RAM is defined as (IORESOURCE_MEM|IORESOURCE_SYSRAM). - Some archs do not set 'flags' for children nodes, such as "Kernel code". This patch does not change 'flags' in this case. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-arch@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-mips@linux-mips.org Cc: linux-mm Cc: linux-parisc@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-sh@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: sparclinux@vger.kernel.org Link: http://lkml.kernel.org/r/1453841853-11383-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 7d0cba6f..139791e 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -176,13 +176,13 @@ static struct resource mem_res[] = { .name = "Kernel code", .start = 0, .end = 0, - .flags = IORESOURCE_MEM + .flags = IORESOURCE_SYSTEM_RAM }, { .name = "Kernel data", .start = 0, .end = 0, - .flags = IORESOURCE_MEM + .flags = IORESOURCE_SYSTEM_RAM } }; @@ -851,7 +851,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) res->name = "System RAM"; res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region)); res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; request_resource(&iomem_resource, res); diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 8119479..450987d 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -73,13 +73,13 @@ static struct resource mem_res[] = { .name = "Kernel code", .start = 0, .end = 0, - .flags = IORESOURCE_MEM + .flags = IORESOURCE_SYSTEM_RAM }, { .name = "Kernel data", .start = 0, .end = 0, - .flags = IORESOURCE_MEM + .flags = IORESOURCE_SYSTEM_RAM } }; @@ -210,7 +210,7 @@ static void __init request_standard_resources(void) res->name = "System RAM"; res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region)); res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; request_resource(&iomem_resource, res); diff --git a/arch/avr32/kernel/setup.c b/arch/avr32/kernel/setup.c index 209ae5a..e692889 100644 --- a/arch/avr32/kernel/setup.c +++ b/arch/avr32/kernel/setup.c @@ -49,13 +49,13 @@ static struct resource __initdata kernel_data = { .name = "Kernel data", .start = 0, .end = 0, - .flags = IORESOURCE_MEM, + .flags = IORESOURCE_SYSTEM_RAM, }; static struct resource __initdata kernel_code = { .name = "Kernel code", .start = 0, .end = 0, - .flags = IORESOURCE_MEM, + .flags = IORESOURCE_SYSTEM_RAM, .sibling = &kernel_data, }; @@ -134,7 +134,7 @@ add_physical_memory(resource_size_t start, resource_size_t end) new->start = start; new->end = end; new->name = "System RAM"; - new->flags = IORESOURCE_MEM; + new->flags = IORESOURCE_SYSTEM_RAM; *pprev = new; } diff --git a/arch/m32r/kernel/setup.c b/arch/m32r/kernel/setup.c index a5ecef7..136c69f 100644 --- a/arch/m32r/kernel/setup.c +++ b/arch/m32r/kernel/setup.c @@ -70,14 +70,14 @@ static struct resource data_resource = { .name = "Kernel data", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; static struct resource code_resource = { .name = "Kernel code", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; unsigned long memory_start; diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 569a7d5..c745f0e 100644 --- a/arch/mips/kernel/setup.c 
+++ b/arch/mips/kernel/setup.c @@ -732,21 +732,23 @@ static void __init resource_init(void) end = HIGHMEM_START - 1; res = alloc_bootmem(sizeof(struct resource)); + + res->start = start; + res->end = end; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + switch (boot_mem_map.map[i].type) { case BOOT_MEM_RAM: case BOOT_MEM_INIT_RAM: case BOOT_MEM_ROM_DATA: res->name = "System RAM"; + res->flags |= IORESOURCE_SYSRAM; break; case BOOT_MEM_RESERVED: default: res->name = "reserved"; } - res->start = start; - res->end = end; - - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; request_resource(&iomem_resource, res); /* diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 1b366c4..3c07d6b 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -55,12 +55,12 @@ signed char pfnnid_map[PFNNID_MAP_MAX] __read_mostly; static struct resource data_resource = { .name = "Kernel data", - .flags = IORESOURCE_BUSY | IORESOURCE_MEM, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, }; static struct resource code_resource = { .name = "Kernel code", - .flags = IORESOURCE_BUSY | IORESOURCE_MEM, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, }; static struct resource pdcdata_resource = { @@ -201,7 +201,7 @@ static void __init setup_bootmem(void) res->name = "System RAM"; res->start = pmem_ranges[i].start_pfn << PAGE_SHIFT; res->end = res->start + (pmem_ranges[i].pages << PAGE_SHIFT)-1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; request_resource(&iomem_resource, res); } diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index d0f0a51..f078a1f 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -541,7 +541,7 @@ static int __init add_system_ram_resources(void) res->name = "System RAM"; res->start = base; res->end = base + size - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; WARN_ON(request_resource(&iomem_resource, res) < 0); } } diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 9220db5..cedb019 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -374,17 +374,17 @@ static void __init setup_lowcore(void) static struct resource code_resource = { .name = "Kernel code", - .flags = IORESOURCE_BUSY | IORESOURCE_MEM, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, }; static struct resource data_resource = { .name = "Kernel data", - .flags = IORESOURCE_BUSY | IORESOURCE_MEM, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, }; static struct resource bss_resource = { .name = "Kernel bss", - .flags = IORESOURCE_BUSY | IORESOURCE_MEM, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, }; static struct resource __initdata *standard_resources[] = { @@ -408,7 +408,7 @@ static void __init setup_resources(void) for_each_memblock(memory, reg) { res = alloc_bootmem_low(sizeof(*res)); - res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; res->name = "System RAM"; res->start = reg->base; diff --git a/arch/score/kernel/setup.c b/arch/score/kernel/setup.c index b48459a..f3a0649 100644 --- a/arch/score/kernel/setup.c +++ b/arch/score/kernel/setup.c @@ -101,7 +101,7 @@ static void __init resource_init(void) res->name = "System RAM"; res->start = MEMORY_START; res->end = MEMORY_START + MEMORY_SIZE - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; request_resource(&iomem_resource, res); request_resource(res, 
&code_resource); diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c index de19cfa..3f1c18b 100644 --- a/arch/sh/kernel/setup.c +++ b/arch/sh/kernel/setup.c @@ -78,17 +78,17 @@ static char __initdata command_line[COMMAND_LINE_SIZE] = { 0, }; static struct resource code_resource = { .name = "Kernel code", - .flags = IORESOURCE_BUSY | IORESOURCE_MEM, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, }; static struct resource data_resource = { .name = "Kernel data", - .flags = IORESOURCE_BUSY | IORESOURCE_MEM, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, }; static struct resource bss_resource = { .name = "Kernel bss", - .flags = IORESOURCE_BUSY | IORESOURCE_MEM, + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, }; unsigned long memory_start; @@ -202,7 +202,7 @@ void __init __add_active_range(unsigned int nid, unsigned long start_pfn, res->name = "System RAM"; res->start = start; res->end = end - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; if (request_resource(&iomem_resource, res)) { pr_err("unable to request memory_resource 0x%lx 0x%lx\n", diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 6f21685..1cfe6aa 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2863,17 +2863,17 @@ void hugetlb_setup(struct pt_regs *regs) static struct resource code_resource = { .name = "Kernel code", - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; static struct resource data_resource = { .name = "Kernel data", - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; static struct resource bss_resource = { .name = "Kernel bss", - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; static inline resource_size_t compute_kern_paddr(void *addr) @@ -2909,7 +2909,7 @@ static int __init report_memory(void) res->name = "System RAM"; res->start = pavail[i].phys_addr; res->end = pavail[i].phys_addr + pavail[i].reg_size - 1; - res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; if (insert_resource(&iomem_resource, res) < 0) { pr_warn("Resource insertion failed.\n"); diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c index bbb855d..a992238 100644 --- a/arch/tile/kernel/setup.c +++ b/arch/tile/kernel/setup.c @@ -1632,14 +1632,14 @@ static struct resource data_resource = { .name = "Kernel data", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; static struct resource code_resource = { .name = "Kernel code", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM }; /* @@ -1673,10 +1673,15 @@ insert_ram_resource(u64 start_pfn, u64 end_pfn, bool reserved) kzalloc(sizeof(struct resource), GFP_ATOMIC); if (!res) return NULL; - res->name = reserved ? 
"Reserved" : "System RAM"; res->start = start_pfn << PAGE_SHIFT; res->end = (end_pfn << PAGE_SHIFT) - 1; res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + if (reserved) { + res->name = "Reserved"; + } else { + res->name = "System RAM"; + res->flags |= IORESOURCE_SYSRAM; + } if (insert_resource(&iomem_resource, res)) { kfree(res); return NULL; diff --git a/arch/unicore32/kernel/setup.c b/arch/unicore32/kernel/setup.c index 3fa317f..c2bffa5 100644 --- a/arch/unicore32/kernel/setup.c +++ b/arch/unicore32/kernel/setup.c @@ -72,13 +72,13 @@ static struct resource mem_res[] = { .name = "Kernel code", .start = 0, .end = 0, - .flags = IORESOURCE_MEM + .flags = IORESOURCE_SYSTEM_RAM }, { .name = "Kernel data", .start = 0, .end = 0, - .flags = IORESOURCE_MEM + .flags = IORESOURCE_SYSTEM_RAM } }; @@ -211,7 +211,7 @@ request_standard_resources(struct meminfo *mi) res->name = "System RAM"; res->start = mi->bank[i].start; res->end = mi->bank[i].start + mi->bank[i].size - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; request_resource(&iomem_resource, res); -- cgit v0.10.2 From 1a085d0727afaedb9506f04798516298b1676e11 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:23 +0100 Subject: kexec: Set IORESOURCE_SYSTEM_RAM for System RAM Set proper ioresource flags and types for crash kernel reservation areas. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Reviewed-by: Dave Young Cc: Andrew Morton Cc: Andy Lutomirski Cc: Baoquan He Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: HATAYAMA Daisuke Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Minfei Huang Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: Vivek Goyal Cc: kexec@lists.infradead.org Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 8dc6591..8d34308 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -66,13 +66,15 @@ struct resource crashk_res = { .name = "Crash kernel", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL }; struct resource crashk_low_res = { .name = "Crash kernel", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL }; int kexec_should_crash(struct task_struct *p) @@ -959,7 +961,7 @@ int crash_shrink_memory(unsigned long new_size) ram_res->start = end; ram_res->end = crashk_res.end; - ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; ram_res->name = "System RAM"; crashk_res.end = end - 1; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 007b791..2bfcdc0 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -525,7 +525,7 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, /* Walk the RAM ranges and allocate a suitable range for the buffer */ if (image->type == KEXEC_TYPE_CRASH) ret = walk_iomem_res("Crash kernel", - IORESOURCE_MEM | IORESOURCE_BUSY, + IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, crashk_res.start, crashk_res.end, kbuf, locate_mem_hole_callback); else -- cgit v0.10.2 From 782b86641e5d471e9eb1cf0072c012d2f758e568 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:24 +0100 Subject: xen, mm: Set 
IORESOURCE_SYSTEM_RAM to System RAM Set IORESOURCE_SYSTEM_RAM in struct resource.flags of "System RAM" entries. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Acked-by: David Vrabel # xen Cc: Andrew Banman Cc: Andrew Morton Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: David Rientjes Cc: Denys Vlasenko Cc: Gu Zheng Cc: H. Peter Anvin Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Mel Gorman Cc: Naoya Horiguchi Cc: Peter Zijlstra Cc: Tang Chen Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-arch@vger.kernel.org Cc: linux-mm Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1453841853-11383-9-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 12eab50..dc4305b 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -257,7 +257,7 @@ static struct resource *additional_memory_resource(phys_addr_t size) return NULL; res->name = "System RAM"; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; ret = allocate_resource(&iomem_resource, res, size, 0, -1, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4af58a3..979b18c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -138,7 +138,7 @@ static struct resource *register_memory_resource(u64 start, u64 size) res->name = "System RAM"; res->start = start; res->end = start + size - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; if (request_resource(&iomem_resource, res) < 0) { pr_debug("System RAM resource %pR cannot be added\n", res); kfree(res); -- cgit v0.10.2 From 9a975bee4b3945b271bcff18a520d4863c210f8b Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:25 +0100 Subject: drivers: Initialize resource entry to zero I/O resource descriptor, 'desc' in struct resource, needs to be initialized to zero by default. Some drivers call kmalloc() to allocate a resource entry, but do not initialize it to zero by memset(). Change these drivers to call kzalloc(), instead. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Acked-by: Alexandre Bounine Acked-by: Helge Deller Acked-by: Rafael J. Wysocki Acked-by: Simon Horman Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-acpi@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-mm Cc: linux-parisc@vger.kernel.org Cc: linux-renesas-soc@vger.kernel.org Cc: linux-sh@vger.kernel.org Link: http://lkml.kernel.org/r/1453841853-11383-10-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c index 296b7a1..b6f7fa3 100644 --- a/drivers/acpi/acpi_platform.c +++ b/drivers/acpi/acpi_platform.c @@ -62,7 +62,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev) if (count < 0) { return NULL; } else if (count > 0) { - resources = kmalloc(count * sizeof(struct resource), + resources = kzalloc(count * sizeof(struct resource), GFP_KERNEL); if (!resources) { dev_err(&adev->dev, "No memory for resources\n"); diff --git a/drivers/parisc/eisa_enumerator.c b/drivers/parisc/eisa_enumerator.c index a656d9e..21905fe 100644 --- a/drivers/parisc/eisa_enumerator.c +++ b/drivers/parisc/eisa_enumerator.c @@ -91,7 +91,7 @@ static int configure_memory(const unsigned char *buf, for (i=0;iname = name; @@ -183,7 +183,7 @@ static int configure_port(const unsigned char *buf, struct resource *io_parent, for (i=0;iname = board; res->start = get_16(buf+len+1); res->end = get_16(buf+len+1)+(c&HPEE_PORT_SIZE_MASK)+1; diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index d7b87c6..e220edc 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -117,7 +117,7 @@ int rio_request_inb_mbox(struct rio_mport *mport, if (mport->ops->open_inb_mbox == NULL) goto out; - res = kmalloc(sizeof(struct resource), GFP_KERNEL); + res = kzalloc(sizeof(struct resource), GFP_KERNEL); if (res) { rio_init_mbox_res(res, mbox, mbox); @@ -185,7 +185,7 @@ int rio_request_outb_mbox(struct rio_mport *mport, if (mport->ops->open_outb_mbox == NULL) goto out; - res = kmalloc(sizeof(struct resource), GFP_KERNEL); + res = kzalloc(sizeof(struct resource), GFP_KERNEL); if (res) { rio_init_mbox_res(res, mbox, mbox); @@ -285,7 +285,7 @@ int rio_request_inb_dbell(struct rio_mport *mport, { int rc = 0; - struct resource *res = kmalloc(sizeof(struct resource), GFP_KERNEL); + struct resource *res = kzalloc(sizeof(struct resource), GFP_KERNEL); if (res) { rio_init_dbell_res(res, start, end); @@ -360,7 +360,7 @@ int rio_release_inb_dbell(struct rio_mport *mport, u16 start, u16 end) struct resource *rio_request_outb_dbell(struct rio_dev *rdev, u16 start, u16 end) { - struct resource *res = kmalloc(sizeof(struct resource), GFP_KERNEL); + struct resource *res = kzalloc(sizeof(struct resource), GFP_KERNEL); if (res) { rio_init_dbell_res(res, start, end); diff --git a/drivers/sh/superhyway/superhyway.c b/drivers/sh/superhyway/superhyway.c index 2d9e7f3..bb1fb771 100644 --- a/drivers/sh/superhyway/superhyway.c +++ b/drivers/sh/superhyway/superhyway.c @@ -66,7 +66,7 @@ int superhyway_add_device(unsigned long base, struct superhyway_device *sdev, superhyway_read_vcr(dev, base, &dev->vcr); if (!dev->resource) { - dev->resource = kmalloc(sizeof(struct resource), GFP_KERNEL); + dev->resource = kzalloc(sizeof(struct resource), GFP_KERNEL); if (!dev->resource) { kfree(dev); return -ENOMEM; -- cgit v0.10.2 From bd7e6cb30ced147292d854a54d4a1f5c5a05d927 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:26 +0100 Subject: resource: Change walk_system_ram() to use System RAM type Now that all System RAM resource entries have been initialized to IORESOURCE_SYSTEM_RAM type, change 
walk_system_ram_res() and walk_system_ram_range() to call find_next_iomem_res() by setting @res.flags to IORESOURCE_SYSTEM_RAM and @name to NULL. With this change, they walk through the iomem table to find System RAM ranges without the need to do strcmp() on the resource names. No functional change is made to the interfaces. Signed-off-by: Toshi Kani [ Boris: fixup comments. ] Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jakub Sitnicki Cc: Jiang Liu Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-11-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/kernel/resource.c b/kernel/resource.c index 61512e9..994f1e41 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -415,11 +415,11 @@ int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, } /* - * This function calls callback against all memory range of "System RAM" - * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. - * Now, this function is only for "System RAM". This function deals with - * full ranges and not pfn. If resources are not pfn aligned, dealing - * with pfn can truncate ranges. + * This function calls the @func callback against all memory ranges of type + * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. + * Now, this function is only for System RAM, it deals with full ranges and + * not PFNs. If resources are not PFN-aligned, dealing with PFNs can truncate + * ranges. */ int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)) @@ -430,10 +430,10 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, res.start = start; res.end = end; - res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (!find_next_iomem_res(&res, "System RAM", true))) { + (!find_next_iomem_res(&res, NULL, true))) { ret = (*func)(res.start, res.end, arg); if (ret) break; @@ -446,9 +446,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* - * This function calls callback against all memory range of "System RAM" - * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. - * Now, this function is only for "System RAM". + * This function calls the @func callback against all memory ranges of type + * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. + * It is to be used only for System RAM. 
*/ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)) @@ -460,10 +460,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, res.start = (u64) start_pfn << PAGE_SHIFT; res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; - res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (find_next_iomem_res(&res, "System RAM", true) >= 0)) { + (find_next_iomem_res(&res, NULL, true) >= 0)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) @@ -484,7 +484,7 @@ static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) } /* * This generic page_is_ram() returns true if specified address is - * registered as "System RAM" in iomem_resource list. + * registered as System RAM in iomem_resource list. */ int __weak page_is_ram(unsigned long pfn) { -- cgit v0.10.2 From 05fee7cfab7fa9d57e71f00bdd8fcff0cf5044a0 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:27 +0100 Subject: arm/samsung: Change s3c_pm_run_res() to use System RAM type Change s3c_pm_run_res() to check with IORESOURCE_SYSTEM_RAM, instead of strcmp() with "System RAM", to walk through System RAM ranges in the iomem table. No functional change is made to the interface. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Reviewed-by: Krzysztof Kozlowski Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kukjin Kim Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-arch@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-mm Cc: linux-samsung-soc@vger.kernel.org Link: http://lkml.kernel.org/r/1453841853-11383-12-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/arm/plat-samsung/pm-check.c b/arch/arm/plat-samsung/pm-check.c index 04aff2c..70f2f69 100644 --- a/arch/arm/plat-samsung/pm-check.c +++ b/arch/arm/plat-samsung/pm-check.c @@ -53,8 +53,8 @@ static void s3c_pm_run_res(struct resource *ptr, run_fn_t fn, u32 *arg) if (ptr->child != NULL) s3c_pm_run_res(ptr->child, fn, arg); - if ((ptr->flags & IORESOURCE_MEM) && - strcmp(ptr->name, "System RAM") == 0) { + if ((ptr->flags & IORESOURCE_SYSTEM_RAM) + == IORESOURCE_SYSTEM_RAM) { S3C_PMDBG("Found system RAM at %08lx..%08lx\n", (unsigned long)ptr->start, (unsigned long)ptr->end); -- cgit v0.10.2 From 1c29f25bf5d6c557017f619b638c619cbbf798c4 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:28 +0100 Subject: memremap: Change region_intersects() to take @flags and @desc Change region_intersects() to identify a target with @flags and @desc, instead of @name with strcmp(). Change the callers of region_intersects(), memremap() and devm_memremap(), to set IORESOURCE_SYSTEM_RAM in @flags and IORES_DESC_NONE in @desc when searching System RAM. Also, export region_intersects() so that the ACPI EINJ error injection driver can call this function in a later patch. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Acked-by: Dan Williams Cc: Andrew Morton Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jakub Sitnicki Cc: Jan Kara Cc: Jiang Liu Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Cc: Toshi Kani Cc: Vlastimil Babka Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-13-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/include/linux/mm.h b/include/linux/mm.h index f1cd22f..cd5a300 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -385,7 +385,8 @@ enum { REGION_MIXED, }; -int region_intersects(resource_size_t offset, size_t size, const char *type); +int region_intersects(resource_size_t offset, size_t size, unsigned long flags, + unsigned long desc); /* Support for virtually mapped pages */ struct page *vmalloc_to_page(const void *addr); diff --git a/kernel/memremap.c b/kernel/memremap.c index e517a16..293309c 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -47,7 +47,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size) * being mapped does not have i/o side effects and the __iomem * annotation is not applicable. * - * MEMREMAP_WB - matches the default mapping for "System RAM" on + * MEMREMAP_WB - matches the default mapping for System RAM on * the architecture. This is usually a read-allocate write-back cache. * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM * memremap() will bypass establishing a new mapping and instead return @@ -56,11 +56,12 @@ static void *try_ram_remap(resource_size_t offset, size_t size) * MEMREMAP_WT - establish a mapping whereby writes either bypass the * cache or are written through to memory and never exist in a * cache-dirty state with respect to program visibility. Attempts to - * map "System RAM" with this mapping type will fail. + * map System RAM with this mapping type will fail. */ void *memremap(resource_size_t offset, size_t size, unsigned long flags) { - int is_ram = region_intersects(offset, size, "System RAM"); + int is_ram = region_intersects(offset, size, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); void *addr = NULL; if (is_ram == REGION_MIXED) { @@ -76,7 +77,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) * MEMREMAP_WB is special in that it can be satisifed * from the direct map. Some archs depend on the * capability of memremap() to autodetect cases where - * the requested range is potentially in "System RAM" + * the requested range is potentially in System RAM. */ if (is_ram == REGION_INTERSECTS) addr = try_ram_remap(offset, size); @@ -88,7 +89,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) * If we don't have a mapping yet and more request flags are * pending then we will be attempting to establish a new virtual * address mapping. Enforce that this mapping is not aliasing - * "System RAM" + * System RAM. 
*/ if (!addr && is_ram == REGION_INTERSECTS && flags) { WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", @@ -266,7 +267,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { int is_ram = region_intersects(res->start, resource_size(res), - "System RAM"); + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); resource_size_t key, align_start, align_size; struct dev_pagemap *pgmap; struct page_map *page_map; diff --git a/kernel/resource.c b/kernel/resource.c index 994f1e41..0041ced 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -496,31 +496,34 @@ EXPORT_SYMBOL_GPL(page_is_ram); * region_intersects() - determine intersection of region with known resources * @start: region start address * @size: size of region - * @name: name of resource (in iomem_resource) + * @flags: flags of resource (in iomem_resource) + * @desc: descriptor of resource (in iomem_resource) or IORES_DESC_NONE * * Check if the specified region partially overlaps or fully eclipses a - * resource identified by @name. Return REGION_DISJOINT if the region - * does not overlap @name, return REGION_MIXED if the region overlaps - * @type and another resource, and return REGION_INTERSECTS if the - * region overlaps @type and no other defined resource. Note, that - * REGION_INTERSECTS is also returned in the case when the specified - * region overlaps RAM and undefined memory holes. + * resource identified by @flags and @desc (optional with IORES_DESC_NONE). + * Return REGION_DISJOINT if the region does not overlap @flags/@desc, + * return REGION_MIXED if the region overlaps @flags/@desc and another + * resource, and return REGION_INTERSECTS if the region overlaps @flags/@desc + * and no other defined resource. Note that REGION_INTERSECTS is also + * returned in the case when the specified region overlaps RAM and undefined + * memory holes. * * region_intersect() is used by memory remapping functions to ensure * the user is not remapping RAM and is a vast speed up over walking * through the resource table page by page. */ -int region_intersects(resource_size_t start, size_t size, const char *name) +int region_intersects(resource_size_t start, size_t size, unsigned long flags, + unsigned long desc) { - unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; resource_size_t end = start + size - 1; int type = 0; int other = 0; struct resource *p; read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { - bool is_type = strcmp(p->name, name) == 0 && - ((p->flags & flags) == flags); + bool is_type = (((p->flags & flags) == flags) && + ((desc == IORES_DESC_NONE) || + (desc == p->desc))); if (start >= p->start && start <= p->end) is_type ? type++ : other++; @@ -539,6 +542,7 @@ int region_intersects(resource_size_t start, size_t size, const char *name) return REGION_DISJOINT; } +EXPORT_SYMBOL_GPL(region_intersects); void __weak arch_remove_reservations(struct resource *avail) { -- cgit v0.10.2 From 3f33647c41962401272bb60dce67e6094d14dbf2 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:29 +0100 Subject: resource: Add walk_iomem_res_desc() Add a new interface, walk_iomem_res_desc(), which walks through the iomem table by identifying a target with @flags and @desc. This interface provides the same functionality as walk_iomem_res(), but does not use strcmp() to @name for better efficiency. walk_iomem_res() is deprecated and will be removed in a later patch. 
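As an illustrative sketch only (not part of this patch; the callback and helper names below are made up), a caller that wants to visit every busy System RAM range without matching on a descriptor could use the new interface roughly like this:

	/* Hypothetical example: count busy System RAM ranges in the iomem table. */
	static int count_range(u64 start, u64 end, void *arg)
	{
		(*(unsigned int *)arg)++;
		return 0;	/* a non-zero return would stop the walk */
	}

	static unsigned int count_system_ram_ranges(void)
	{
		unsigned int nr = 0;

		/* IORES_DESC_NONE skips the descriptor check; only the flags must match. */
		walk_iomem_res_desc(IORES_DESC_NONE,
				    IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
				    0, -1, &nr, count_range);
		return nr;
	}

A name-based search such as walk_iomem_res("Crash kernel", ...) then maps to walk_iomem_res_desc(crashk_res.desc, ...) once the resource entry carries the appropriate IORES_DESC value, as the later kexec conversion in this series shows.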
Requested-by: Borislav Petkov Signed-off-by: Toshi Kani [ Fixup comments. ] Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Hanjun Guo Cc: Jakub Sitnicki Cc: Jiang Liu Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-14-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 983bea0..2a4a5e8 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -268,6 +268,9 @@ extern int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)); extern int +walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, + void *arg, int (*func)(u64, u64, void *)); +extern int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)); diff --git a/kernel/resource.c b/kernel/resource.c index 0041ced..37ed2fc 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -333,14 +333,15 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); /* - * Finds the lowest iomem reosurce exists with-in [res->start.res->end) - * the caller must specify res->start, res->end, res->flags and "name". - * If found, returns 0, res is overwritten, if not found, returns -1. - * This walks through whole tree and not just first level children - * until and unless first_level_children_only is true. + * Finds the lowest iomem resource existing within [res->start.res->end). + * The caller must specify res->start, res->end, res->flags, and optionally + * desc and "name". If found, returns 0, res is overwritten, if not found, + * returns -1. + * This function walks the whole tree and not just first level children until + * and unless first_level_children_only is true. */ -static int find_next_iomem_res(struct resource *res, char *name, - bool first_level_children_only) +static int find_next_iomem_res(struct resource *res, unsigned long desc, + char *name, bool first_level_children_only) { resource_size_t start, end; struct resource *p; @@ -360,6 +361,8 @@ static int find_next_iomem_res(struct resource *res, char *name, for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { if ((p->flags & res->flags) != res->flags) continue; + if ((desc != IORES_DESC_NONE) && (desc != p->desc)) + continue; if (name && strcmp(p->name, name)) continue; if (p->start > end) { @@ -385,12 +388,55 @@ static int find_next_iomem_res(struct resource *res, char *name, * Walks through iomem resources and calls func() with matching resource * ranges. This walks through whole tree and not just first level children. * All the memory ranges which overlap start,end and also match flags and + * desc are valid candidates. + * + * @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check. + * @flags: I/O resource flags + * @start: start addr + * @end: end addr + * + * NOTE: For a new descriptor search, define a new IORES_DESC in + * and set it in 'desc' of a target resource entry. 
+ */ +int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, + u64 end, void *arg, int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = flags; + orig_end = res.end; + + while ((res.start < res.end) && + (!find_next_iomem_res(&res, desc, NULL, false))) { + + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + + res.start = res.end + 1; + res.end = orig_end; + } + + return ret; +} + +/* + * Walks through iomem resources and calls @func with matching resource + * ranges. This walks the whole tree and not just first level children. + * All the memory ranges which overlap start,end and also match flags and * name are valid candidates. * * @name: name of resource * @flags: resource flags * @start: start addr * @end: end addr + * + * NOTE: This function is deprecated and should not be used in new code. + * Use walk_iomem_res_desc(), instead. */ int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)) @@ -404,7 +450,7 @@ int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, res.flags = flags; orig_end = res.end; while ((res.start < res.end) && - (!find_next_iomem_res(&res, name, false))) { + (!find_next_iomem_res(&res, IORES_DESC_NONE, name, false))) { ret = (*func)(res.start, res.end, arg); if (ret) break; @@ -433,7 +479,7 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (!find_next_iomem_res(&res, NULL, true))) { + (!find_next_iomem_res(&res, IORES_DESC_NONE, NULL, true))) { ret = (*func)(res.start, res.end, arg); if (ret) break; @@ -463,7 +509,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (find_next_iomem_res(&res, NULL, true) >= 0)) { + (find_next_iomem_res(&res, IORES_DESC_NONE, NULL, true) >= 0)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) -- cgit v0.10.2 From f0f4711aa16b82016c0b6e59871934bbd71258da Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:30 +0100 Subject: x86, kexec, nvdimm: Use walk_iomem_res_desc() for iomem search Change the callers of walk_iomem_res() scanning for the following resources by name to use walk_iomem_res_desc() instead. "ACPI Tables" "ACPI Non-volatile Storage" "Persistent Memory (legacy)" "Crash kernel" Note, the caller of walk_iomem_res() with "GART" will be removed in a later patch. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Reviewed-by: Dave Young Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Chun-Yi Cc: Dan Williams Cc: Denys Vlasenko Cc: Don Zickus Cc: H. Peter Anvin Cc: Lee, Chun-Yi Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Minfei Huang Cc: Peter Zijlstra (Intel) Cc: Ross Zwisler Cc: Stephen Rothwell Cc: Takao Indoh Cc: Thomas Gleixner Cc: Toshi Kani Cc: kexec@lists.infradead.org Cc: linux-arch@vger.kernel.org Cc: linux-mm Cc: linux-nvdimm@lists.01.org Link: http://lkml.kernel.org/r/1453841853-11383-15-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 58f3431..35e152e 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -599,12 +599,12 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) /* Add ACPI tables */ cmd.type = E820_ACPI; flags = IORESOURCE_MEM | IORESOURCE_BUSY; - walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd, + walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd, memmap_entry_callback); /* Add ACPI Non-volatile Storage */ cmd.type = E820_NVS; - walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd, + walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd, memmap_entry_callback); /* Add crashk_low_res region */ diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 14415af..92f7014 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -13,11 +13,11 @@ static int found(u64 start, u64 end, void *data) static __init int register_e820_pmem(void) { - char *pmem = "Persistent Memory (legacy)"; struct platform_device *pdev; int rc; - rc = walk_iomem_res(pmem, IORESOURCE_MEM, 0, -1, NULL, found); + rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY, + IORESOURCE_MEM, 0, -1, NULL, found); if (rc <= 0) return 0; diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c index b0045a5..95825b3 100644 --- a/drivers/nvdimm/e820.c +++ b/drivers/nvdimm/e820.c @@ -55,7 +55,7 @@ static int e820_pmem_probe(struct platform_device *pdev) for (p = iomem_resource.child; p ; p = p->sibling) { struct nd_region_desc ndr_desc; - if (strncmp(p->name, "Persistent Memory (legacy)", 26) != 0) + if (p->desc != IORES_DESC_PERSISTENT_MEMORY_LEGACY) continue; memset(&ndr_desc, 0, sizeof(ndr_desc)); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 2bfcdc0..56b18eb 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -524,10 +524,10 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, /* Walk the RAM ranges and allocate a suitable range for the buffer */ if (image->type == KEXEC_TYPE_CRASH) - ret = walk_iomem_res("Crash kernel", - IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, - crashk_res.start, crashk_res.end, kbuf, - locate_mem_hole_callback); + ret = walk_iomem_res_desc(crashk_res.desc, + IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, kbuf, + locate_mem_hole_callback); else ret = walk_system_ram_res(0, -1, kbuf, locate_mem_hole_callback); -- cgit v0.10.2 From f296f2634920d205b93d878b48d87bb7e0a4c256 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:31 +0100 Subject: x86/kexec: Remove walk_iomem_res() call with GART type There is no longer any driver inserting a "GART" region in the kernel since 707d4eefbdb3 ("Revert "[PATCH] Insert GART region into resource map""). Remove the call to walk_iomem_res() with "GART" type, its callback function, and GART-specific variables set by the callback. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Reviewed-by: Dave Young Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Chun-Yi Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Lee, Chun-Yi Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Minfei Huang Cc: Peter Zijlstra (Intel) Cc: Stephen Rothwell Cc: Takao Indoh Cc: Thomas Gleixner Cc: Toshi Kani Cc: Viresh Kumar Cc: kexec@lists.infradead.org Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-16-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 35e152e..9ef978d 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -57,10 +57,9 @@ struct crash_elf_data { struct kimage *image; /* * Total number of ram ranges we have after various adjustments for - * GART, crash reserved region etc. + * crash reserved region, etc. */ unsigned int max_nr_ranges; - unsigned long gart_start, gart_end; /* Pointer to elf header */ void *ehdr; @@ -201,17 +200,6 @@ static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg) return 0; } -static int get_gart_ranges_callback(u64 start, u64 end, void *arg) -{ - struct crash_elf_data *ced = arg; - - ced->gart_start = start; - ced->gart_end = end; - - /* Not expecting more than 1 gart aperture */ - return 1; -} - /* Gather all the required information to prepare elf headers for ram regions */ static void fill_up_crash_elf_data(struct crash_elf_data *ced, @@ -226,22 +214,6 @@ static void fill_up_crash_elf_data(struct crash_elf_data *ced, ced->max_nr_ranges = nr_ranges; - /* - * We don't create ELF headers for GART aperture as an attempt - * to dump this memory in second kernel leads to hang/crash. - * If gart aperture is present, one needs to exclude that region - * and that could lead to need of extra phdr. - */ - walk_iomem_res("GART", IORESOURCE_MEM, 0, -1, - ced, get_gart_ranges_callback); - - /* - * If we have gart region, excluding that could potentially split - * a memory range, resulting in extra header. Account for that. - */ - if (ced->gart_end) - ced->max_nr_ranges++; - /* Exclusion of crash region could split memory ranges */ ced->max_nr_ranges++; @@ -350,13 +322,6 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced, return ret; } - /* Exclude GART region */ - if (ced->gart_end) { - ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end); - if (ret) - return ret; - } - return ret; } -- cgit v0.10.2 From a8fc42530ddd19d7580fe8c9f2ea86220a97e94c Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:32 +0100 Subject: resource: Kill walk_iomem_res() walk_iomem_res_desc() replaced walk_iomem_res() and there is no caller to walk_iomem_res() any more. Kill it. Also remove @name from find_next_iomem_res() as it is no longer used. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Acked-by: Dave Young Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dan Williams Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Hanjun Guo Cc: Jakub Sitnicki Cc: Jiang Liu Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: Vinod Koul Cc: linux-arch@vger.kernel.org Cc: linux-mm Link: http://lkml.kernel.org/r/1453841853-11383-17-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 2a4a5e8..afb4559 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -270,9 +270,6 @@ walk_system_ram_res(u64 start, u64 end, void *arg, extern int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)); -extern int -walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, void *arg, - int (*func)(u64, u64, void *)); /* True if any part of r1 overlaps r2 */ static inline bool resource_overlaps(struct resource *r1, struct resource *r2) diff --git a/kernel/resource.c b/kernel/resource.c index 37ed2fc..4983430 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -335,13 +335,12 @@ EXPORT_SYMBOL(release_resource); /* * Finds the lowest iomem resource existing within [res->start.res->end). * The caller must specify res->start, res->end, res->flags, and optionally - * desc and "name". If found, returns 0, res is overwritten, if not found, - * returns -1. + * desc. If found, returns 0, res is overwritten, if not found, returns -1. * This function walks the whole tree and not just first level children until * and unless first_level_children_only is true. */ static int find_next_iomem_res(struct resource *res, unsigned long desc, - char *name, bool first_level_children_only) + bool first_level_children_only) { resource_size_t start, end; struct resource *p; @@ -363,8 +362,6 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc, continue; if ((desc != IORES_DESC_NONE) && (desc != p->desc)) continue; - if (name && strcmp(p->name, name)) - continue; if (p->start > end) { p = NULL; break; @@ -411,7 +408,7 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, orig_end = res.end; while ((res.start < res.end) && - (!find_next_iomem_res(&res, desc, NULL, false))) { + (!find_next_iomem_res(&res, desc, false))) { ret = (*func)(res.start, res.end, arg); if (ret) @@ -425,42 +422,6 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, } /* - * Walks through iomem resources and calls @func with matching resource - * ranges. This walks the whole tree and not just first level children. - * All the memory ranges which overlap start,end and also match flags and - * name are valid candidates. - * - * @name: name of resource - * @flags: resource flags - * @start: start addr - * @end: end addr - * - * NOTE: This function is deprecated and should not be used in new code. - * Use walk_iomem_res_desc(), instead. - */ -int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, - void *arg, int (*func)(u64, u64, void *)) -{ - struct resource res; - u64 orig_end; - int ret = -1; - - res.start = start; - res.end = end; - res.flags = flags; - orig_end = res.end; - while ((res.start < res.end) && - (!find_next_iomem_res(&res, IORES_DESC_NONE, name, false))) { - ret = (*func)(res.start, res.end, arg); - if (ret) - break; - res.start = res.end + 1; - res.end = orig_end; - } - return ret; -} - -/* * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. 
* Now, this function is only for System RAM, it deals with full ranges and @@ -479,7 +440,7 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (!find_next_iomem_res(&res, IORES_DESC_NONE, NULL, true))) { + (!find_next_iomem_res(&res, IORES_DESC_NONE, true))) { ret = (*func)(res.start, res.end, arg); if (ret) break; @@ -509,7 +470,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (find_next_iomem_res(&res, IORES_DESC_NONE, NULL, true) >= 0)) { + (find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) -- cgit v0.10.2 From 4650bac1fc45d64aef62ab99aa4db93d41dedbd9 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 26 Jan 2016 21:57:33 +0100 Subject: ACPI/EINJ: Allow memory error injection to NVDIMM In the case of memory error injection, einj_error_inject() checks if a target address is System RAM. Change this check to allow injecting a memory error into NVDIMM memory by calling region_intersects() with IORES_DESC_PERSISTENT_MEMORY. This enables memory error testing on both System RAM and NVDIMM. In addition, page_is_ram() is replaced with region_intersects() with IORESOURCE_SYSTEM_RAM, so that it can verify a target address range with the requested size. Signed-off-by: Toshi Kani Signed-off-by: Borislav Petkov Reviewed-by: Dan Williams Acked-by: Tony Luck Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jarkko Nikula Cc: Len Brown Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Toshi Kani Cc: Vishal Verma Cc: linux-acpi@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-mm Cc: linux-nvdimm@lists.01.org Link: http://lkml.kernel.org/r/1453841853-11383-18-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c index 0431883..559c117 100644 --- a/drivers/acpi/apei/einj.c +++ b/drivers/acpi/apei/einj.c @@ -519,7 +519,7 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, u64 param3, u64 param4) { int rc; - unsigned long pfn; + u64 base_addr, size; /* If user manually set "flags", make sure it is legal */ if (flags && (flags & @@ -545,10 +545,17 @@ static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, /* * Disallow crazy address masks that give BIOS leeway to pick * injection address almost anywhere. Insist on page or - * better granularity and that target address is normal RAM. + * better granularity and that target address is normal RAM or + * NVDIMM. 
*/ - pfn = PFN_DOWN(param1 & param2); - if (!page_is_ram(pfn) || ((param2 & PAGE_MASK) != PAGE_MASK)) + base_addr = param1 & param2; + size = ~param2 + 1; + + if (((param2 & PAGE_MASK) != PAGE_MASK) || + ((region_intersects(base_addr, size, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE) + != REGION_INTERSECTS) && + (region_intersects(base_addr, size, IORESOURCE_MEM, IORES_DESC_PERSISTENT_MEMORY) + != REGION_INTERSECTS))) return -EINVAL; inject: -- cgit v0.10.2 From cd4d09ec6f6c12a2cc3db5b7d8876a325a53545b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jan 2016 22:12:04 +0100 Subject: x86/cpufeature: Carve out X86_FEATURE_* Move them to a separate header and have the following dependency: x86/cpufeatures.h <- x86/processor.h <- x86/cpufeature.h This makes it easier to use the header in asm code and not include the whole cpufeature.h and add guards for asm. Suggested-by: H. Peter Anvin Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453842730-28463-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 87d40a7..c0c6253 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -666,7 +666,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. clearcpuid=BITNUM [X86] Disable CPUID feature X for the kernel. See - arch/x86/include/asm/cpufeature.h for the valid bit + arch/x86/include/asm/cpufeatures.h for the valid bit numbers. Note the Linux specific bits are not necessarily stable over kernel options, but the vendor specific ones should be. diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h index ea97697..4cb404f 100644 --- a/arch/x86/boot/cpuflags.h +++ b/arch/x86/boot/cpuflags.h @@ -1,7 +1,7 @@ #ifndef BOOT_CPUFLAGS_H #define BOOT_CPUFLAGS_H -#include +#include #include struct cpu_features { diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c index 637097e..f72498d 100644 --- a/arch/x86/boot/mkcpustr.c +++ b/arch/x86/boot/mkcpustr.c @@ -17,7 +17,7 @@ #include "../include/asm/required-features.h" #include "../include/asm/disabled-features.h" -#include "../include/asm/cpufeature.h" +#include "../include/asm/cpufeatures.h" #include "../kernel/cpu/capflags.c" int main(void) diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index 07d2c6c..27226df 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -33,7 +33,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 0e98716..0857b1a 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -30,7 +30,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c index a3fcfc9..cd4df93 100644 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf, diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 75175f9..c6ab2eb 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -26,6 +26,7 @@ #include #include #include 
+#include #define CREATE_TRACE_POINTS #include diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 77d8c51..4c52283 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index 08a317a..7853b53 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -11,7 +11,6 @@ #include #include -#include #include #include diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index 3a1d929..0109ac6 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -3,7 +3,7 @@ */ #include -#include +#include #include /* diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 7c912fe..429d54d 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -20,6 +20,7 @@ #include #include #include +#include #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 7bfc85b..99afb66 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -152,12 +152,6 @@ static inline int alternatives_text_reserved(void *start, void *end) ".popsection" /* - * This must be included *after* the definition of ALTERNATIVE due to - * - */ -#include - -/* * Alternative instructions for different CPU types or capabilities. * * This allows to use optimized instructions even on generic binary diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index c80f6b6..0899cfc 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -6,7 +6,6 @@ #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index 259a7c1..02e799f 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_HWEIGHT_H #define _ASM_X86_HWEIGHT_H +#include + #ifdef CONFIG_64BIT /* popcnt %edi, %eax -- redundant REX prefix for alignment */ #define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7" diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index ad19841..9733361 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -2,6 +2,7 @@ #define ASM_X86_CMPXCHG_H #include +#include #include /* Provides LOCK_PREFIX */ /* diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index bbf166e..3cce9f3 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -1,289 +1,7 @@ -/* - * Defines x86 CPU feature bits - */ #ifndef _ASM_X86_CPUFEATURE_H #define _ASM_X86_CPUFEATURE_H -#ifndef _ASM_X86_REQUIRED_FEATURES_H -#include -#endif - -#ifndef _ASM_X86_DISABLED_FEATURES_H -#include -#endif - -#define NCAPINTS 16 /* N 32-bit words worth of info */ -#define NBUGINTS 1 /* N 32-bit bug flags */ - -/* - * Note: If the comment begins with a quoted string, that string is used - * in /proc/cpuinfo instead of the macro name. If the string is "", - * this feature bit is not displayed in /proc/cpuinfo at all. 
- */ - -/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ -#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ -#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ -#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ -#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ -#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ -#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ -#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ -#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ -#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ -#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ -#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ -#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ -#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ -#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ - /* (plus FCMOVcc, FCOMI with FPU) */ -#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ -#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ -#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ -#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ -#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ -#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ -#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ -#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ -#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ -#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ -#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ -#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ -#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ -#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ -#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ - -/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ -/* Don't duplicate feature flags which are redundant with Intel! */ -#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ -#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ -#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ -#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ -#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ -#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ -#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ -#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ -#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ -#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! 
*/ - -/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ -#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ -#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ -#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ - -/* Other features, Linux-defined mapping, word 3 */ -/* This range is used for feature bits which conflict or are synthesized */ -#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ -#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ -#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ -#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ -/* cpu types for specific tunings: */ -#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ -#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ -#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ -#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ -#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ -/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */ -#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ -#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ -#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ -#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ -#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ -#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ -#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ -/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */ -#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ -#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ -#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ -#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ -#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ -/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */ -#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ -#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ -#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ -#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ -#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ - -/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ -#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ -#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ -#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ -#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ -#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. 
Debug Store */ -#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ -#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ -#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ -#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ -#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ -#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ -#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ -#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ -#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ -#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ -#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ -#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ -#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ -#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ -#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ -#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ -#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ -#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ -#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ -#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ -#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ -#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ -#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ -#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ -#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ -#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ - -/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ -#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ -#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ -#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ -#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ -#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ -#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ -#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ -#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ -#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ -#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ - -/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ -#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ -#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ -#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ -#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ -#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ -#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ -#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ -#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ -#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ -#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ -#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ -#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ -#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ -#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 
operands MAC instructions */ -#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ -#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ -#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ -#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ -#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ -#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ -#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ -#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ -#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ - -/* - * Auxiliary flags: Linux defined - For features scattered in various - * CPUID levels like 0x6, 0xA etc, word 7. - * - * Reuse free bits when adding new feature flags! - */ - -#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ -#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ - -#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ -#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ - -#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ - -/* Virtualization flags: Linux defined, word 8 */ -#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ -#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ -#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ -#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ -#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ - -#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ -#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ - - -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ -#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ -#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ -#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ -#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ -#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ -#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ -#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ -#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ -#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ -#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ -#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ -#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ -#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ -#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ -#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ -#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ -#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ -#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ -#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ -#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ -#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ -#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ -#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ - -/* Extended state features, CPUID level 
0x0000000d:1 (eax), word 10 */ -#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ -#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ -#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ -#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ - -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ -#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ - -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ -#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ - -/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ -#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ - -/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ -#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ -#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ -#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ -#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ -#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ -#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ - -/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ -#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ -#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ -#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ -#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ -#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ -#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ -#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ -#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ -#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ -#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ -#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ - -/* - * BUG word(s) - */ -#define X86_BUG(x) (NCAPINTS*32 + (x)) - -#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ -#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ -#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ -#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ -#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ -#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ -#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ -#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ -#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#include #if defined(__KERNEL__) && !defined(__ASSEMBLY__) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h new file mode 100644 index 0000000..0ceb6ad --- /dev/null +++ b/arch/x86/include/asm/cpufeatures.h @@ -0,0 +1,288 @@ +#ifndef _ASM_X86_CPUFEATURES_H +#define _ASM_X86_CPUFEATURES_H + +#ifndef _ASM_X86_REQUIRED_FEATURES_H +#include +#endif + +#ifndef _ASM_X86_DISABLED_FEATURES_H +#include +#endif + +/* + * Defines x86 CPU feature bits + */ 
+#define NCAPINTS 16 /* N 32-bit words worth of info */ +#define NBUGINTS 1 /* N 32-bit bug flags */ + +/* + * Note: If the comment begins with a quoted string, that string is used + * in /proc/cpuinfo instead of the macro name. If the string is "", + * this feature bit is not displayed in /proc/cpuinfo at all. + */ + +/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ +#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ +#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ +#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ +#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ +#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ +#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ +#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ +#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ +#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ +#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ +#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ +#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ +#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ +#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ +#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ + /* (plus FCMOVcc, FCOMI with FPU) */ +#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ +#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ +#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ +#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ +#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ +#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ +#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ +#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ +#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ +#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ +#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ +#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ +#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ +#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ + +/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ +/* Don't duplicate feature flags which are redundant with Intel! */ +#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ +#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ +#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ +#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ +#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ +#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ +#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ +#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ +#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ +#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! 
*/ + +/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ +#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ +#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ +#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ + +/* Other features, Linux-defined mapping, word 3 */ +/* This range is used for feature bits which conflict or are synthesized */ +#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ +#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ +#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ +#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ +/* cpu types for specific tunings: */ +#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ +#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ +#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ +#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ +#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ +#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ +/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */ +#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ +#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ +#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ +#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ +#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ +#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ +#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ +/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */ +#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ +#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ +#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ +/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */ +#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ +#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ +#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ +#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ + +/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ +#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ +#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ +#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ +#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. 
Debug Store */ +#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ +#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ +#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ +#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ +#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ +#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ +#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ +#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ +#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ +#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ +#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ +#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ +#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ +#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ +#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ +#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ +#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ +#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ +#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ +#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ +#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ +#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ +#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ + +/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ +#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ +#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ +#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ +#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ +#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ +#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ +#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ +#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ + +/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ +#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ +#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ +#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ +#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ +#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ +#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ +#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ +#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ +#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ +#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ +#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ +#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 
operands MAC instructions */ +#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ +#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ +#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ +#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ +#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ +#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ +#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ + +/* + * Auxiliary flags: Linux defined - For features scattered in various + * CPUID levels like 0x6, 0xA etc, word 7. + * + * Reuse free bits when adding new feature flags! + */ + +#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ +#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ + +#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ +#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ + +#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ + +/* Virtualization flags: Linux defined, word 8 */ +#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ +#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ +#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ +#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ +#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ + +#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ +#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ + + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ +#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ +#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ +#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ +#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ +#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ +#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ +#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ +#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ +#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ +#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ +#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ +#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ +#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ + +/* Extended state features, CPUID level 
0x0000000d:1 (eax), word 10 */ +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ +#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ +#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ + +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ +#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ + +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ +#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ + +/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ +#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ + +/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ +#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ +#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ +#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ +#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. Preference */ +#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */ + +/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */ +#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */ +#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */ +#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */ +#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ +#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ + +/* + * BUG word(s) + */ +#define X86_BUG(x) (NCAPINTS*32 + (x)) + +#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */ +#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */ +#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */ +#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */ +#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */ +#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ +#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ +#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ + +#endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 0fd440d..d01199d 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -17,6 +17,7 @@ #include #include #include +#include /* * High level FPU state handling functions: diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h index 78162f8..d0afb05 100644 --- a/arch/x86/include/asm/irq_work.h +++ 
b/arch/x86/include/asm/irq_work.h @@ -1,7 +1,7 @@ #ifndef _ASM_IRQ_WORK_H #define _ASM_IRQ_WORK_H -#include +#include static inline bool arch_irq_work_has_interrupt(void) { diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index c70689b..0deeb2d 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -3,6 +3,8 @@ #include +#include + #define MWAIT_SUBSTATE_MASK 0xf #define MWAIT_CSTATE_MASK 0xf #define MWAIT_SUBSTATE_SIZE 4 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2d5a50c..491a3d9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -13,7 +13,7 @@ struct vm86; #include #include #include -#include +#include #include #include #include @@ -24,7 +24,6 @@ struct vm86; #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index ba665eb..db33330 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -15,7 +15,7 @@ #include #include -#include +#include /* "Raw" instruction opcodes */ #define __ASM_CLAC .byte 0x0f,0x01,0xca diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index dfcf072..20a3de5 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -16,7 +16,6 @@ #endif #include #include -#include extern int smp_num_siblings; extern unsigned int num_processors; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index c7b5510..c0778fc 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -49,7 +49,7 @@ */ #ifndef __ASSEMBLY__ struct task_struct; -#include +#include #include struct thread_info { diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6df2029..0bb31cb 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -5,6 +5,7 @@ #include #include +#include #include #ifdef CONFIG_PARAVIRT diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index b89c34c..3076986 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include /* diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 5803130..faa7b52 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -64,7 +64,7 @@ ifdef CONFIG_X86_FEATURE_NAMES quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@ -cpufeature = $(src)/../../include/asm/cpufeature.h +cpufeature = $(src)/../../include/asm/cpufeatures.h targets += capflags.c $(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index ae20be6..6608c03 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,7 +1,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index aaf152e..15e47c1 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "cpu.h" diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 565648b..9299e3b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include diff --git 
a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 0b6c523..341449c 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index afa9f0d..fbb5e90 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index 3f20710..6988c74 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -1,6 +1,6 @@ #!/bin/sh # -# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeature.h +# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h # IN=$1 @@ -49,8 +49,8 @@ dump_array() trap 'rm "$OUT"' EXIT ( - echo "#ifndef _ASM_X86_CPUFEATURE_H" - echo "#include " + echo "#ifndef _ASM_X86_CPUFEATURES_H" + echo "#include " echo "#endif" echo "" diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 5c3d149..74f1d90 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -47,7 +47,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 252da7a..a19a663 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include "cpu.h" diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 569c1e4..b3c2a69 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -24,6 +24,7 @@ #include #include #include +#include /* * The e820 map is the map that gets modified e.g. with command line parameters diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 6bc9ae2..af11129 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index b8e6ff5..be0ebbb 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 64f9616..7f3550a 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -40,7 +40,7 @@ #include #include -#include +#include #include static struct class *msr_class; diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 07efb35..014ea59 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -30,7 +30,7 @@ * appropriately. Either display a message or halt. 
*/ -#include +#include #include verify_cpu: diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index a2fe51b..65be7cf 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,5 +1,5 @@ #include -#include +#include #include /* diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 009f982..24ef1c2 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -1,7 +1,7 @@ /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ #include -#include +#include #include /* diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 982ce34..fba3430 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 16698bb..a0de849 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -1,7 +1,7 @@ /* Copyright 2002 Andi Kleen */ #include -#include +#include #include /* diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index ca2afdd..90ce01b 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -6,7 +6,7 @@ * - Copyright 2011 Fenghua Yu */ #include -#include +#include #include #undef memmove diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 2661fad..c9c8122 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -1,7 +1,7 @@ /* Copyright 2002 Andi Kleen, SuSE Labs */ #include -#include +#include #include .weak memset diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 92e2eac..f65a33f 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -4,6 +4,7 @@ #include #include +#include static int disable_nx; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 50d86c0..660a83c 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -24,7 +24,6 @@ #include #include #include -#include #include "op_x86_model.h" #include "op_counter.h" diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 174781a..00c3190 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index d62de8b..1234818 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -17,7 +17,7 @@ #include #ifdef CONFIG_X86 -#include /* for boot_cpu_has below */ +#include /* for boot_cpu_has below */ #endif #define TEST(bit, op, c_op, val) \ -- cgit v0.10.2 From bc696ca05f5a8927329ec276a892341e006b00ba Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jan 2016 22:12:05 +0100 Subject: x86/cpufeature: Replace the old static_cpu_has() with safe variant So the old one didn't work properly before alternatives had run. And it was supposed to provide an optimized JMP because the assumption was that the offset it is jumping to is within a signed byte and thus a two-byte JMP. So I did an x86_64 allyesconfig build and dumped all possible sites where static_cpu_has() was used. The optimization amounted to all in all 12(!) places where static_cpu_has() had generated a 2-byte JMP. Which has saved us a whopping 36 bytes! This clearly is not worth the trouble so we can remove it. The only place where the optimization might count - in __switch_to() - we will handle differently. But that's not subject of this patch. 
Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453842730-28463-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 9b18ed9..68a2d1f 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -350,16 +350,6 @@ config DEBUG_IMR_SELFTEST If unsure say N here. -config X86_DEBUG_STATIC_CPU_HAS - bool "Debug alternatives" - depends on DEBUG_KERNEL - ---help--- - This option causes additional code to be generated which - fails if static_cpu_has() is used before alternatives have - run. - - If unsure, say N. - config X86_DEBUG_FPU bool "Debug the x86 FPU code" depends on DEBUG_KERNEL diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 3cce9f3..a261cf2 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -125,103 +125,19 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; #define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) /* - * Do not add any more of those clumsy macros - use static_cpu_has_safe() for + * Do not add any more of those clumsy macros - use static_cpu_has() for * fast paths and boot_cpu_has() otherwise! */ #if __GNUC__ >= 4 && defined(CONFIG_X86_FAST_FEATURE_TESTS) -extern void warn_pre_alternatives(void); -extern bool __static_cpu_has_safe(u16 bit); +extern bool __static_cpu_has(u16 bit); /* * Static testing of CPU features. Used the same as boot_cpu_has(). * These are only valid after alternatives have run, but will statically * patch the target code for additional performance. */ -static __always_inline __pure bool __static_cpu_has(u16 bit) -{ -#ifdef CC_HAVE_ASM_GOTO - -#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS - - /* - * Catch too early usage of this before alternatives - * have run. 
- */ - asm_volatile_goto("1: jmp %l[t_warn]\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" - " .long 0\n" /* no replacement */ - " .word %P0\n" /* 1: do replace */ - " .byte 2b - 1b\n" /* source len */ - " .byte 0\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - /* skipping size check since replacement size = 0 */ - : : "i" (X86_FEATURE_ALWAYS) : : t_warn); - -#endif - - asm_volatile_goto("1: jmp %l[t_no]\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" - " .long 0\n" /* no replacement */ - " .word %P0\n" /* feature bit */ - " .byte 2b - 1b\n" /* source len */ - " .byte 0\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - /* skipping size check since replacement size = 0 */ - : : "i" (bit) : : t_no); - return true; - t_no: - return false; - -#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS - t_warn: - warn_pre_alternatives(); - return false; -#endif - -#else /* CC_HAVE_ASM_GOTO */ - - u8 flag; - /* Open-coded due to __stringify() in ALTERNATIVE() */ - asm volatile("1: movb $0,%0\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" - " .long 3f - .\n" - " .word %P1\n" /* feature bit */ - " .byte 2b - 1b\n" /* source len */ - " .byte 4f - 3f\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .discard,\"aw\",@progbits\n" - " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "3: movb $1,%0\n" - "4:\n" - ".previous\n" - : "=qm" (flag) : "i" (bit)); - return flag; - -#endif /* CC_HAVE_ASM_GOTO */ -} - -#define static_cpu_has(bit) \ -( \ - __builtin_constant_p(boot_cpu_has(bit)) ? \ - boot_cpu_has(bit) : \ - __builtin_constant_p(bit) ? \ - __static_cpu_has(bit) : \ - boot_cpu_has(bit) \ -) - -static __always_inline __pure bool _static_cpu_has_safe(u16 bit) +static __always_inline __pure bool _static_cpu_has(u16 bit) { #ifdef CC_HAVE_ASM_GOTO asm_volatile_goto("1: jmp %l[t_dynamic]\n" @@ -255,7 +171,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) t_no: return false; t_dynamic: - return __static_cpu_has_safe(bit); + return __static_cpu_has(bit); #else u8 flag; /* Open-coded due to __stringify() in ALTERNATIVE() */ @@ -293,22 +209,21 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) ".previous\n" : "=qm" (flag) : "i" (bit), "i" (X86_FEATURE_ALWAYS)); - return (flag == 2 ? __static_cpu_has_safe(bit) : flag); + return (flag == 2 ? __static_cpu_has(bit) : flag); #endif /* CC_HAVE_ASM_GOTO */ } -#define static_cpu_has_safe(bit) \ +#define static_cpu_has(bit) \ ( \ __builtin_constant_p(boot_cpu_has(bit)) ? \ boot_cpu_has(bit) : \ - _static_cpu_has_safe(bit) \ + _static_cpu_has(bit) \ ) #else /* * gcc 3.x is too stupid to do the static test; fall back to dynamic. 
*/ #define static_cpu_has(bit) boot_cpu_has(bit) -#define static_cpu_has_safe(bit) boot_cpu_has(bit) #endif #define cpu_has_bug(c, bit) cpu_has(c, (bit)) @@ -316,7 +231,6 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) #define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit)) #define static_cpu_has_bug(bit) static_cpu_has((bit)) -#define static_cpu_has_bug_safe(bit) static_cpu_has_safe((bit)) #define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) #define MAX_CPU_FEATURES (NCAPINTS * 32) diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index d01199d..c2e46eb 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -59,22 +59,22 @@ extern u64 fpu__get_supported_xfeatures_mask(void); */ static __always_inline __pure bool use_eager_fpu(void) { - return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); + return static_cpu_has(X86_FEATURE_EAGER_FPU); } static __always_inline __pure bool use_xsaveopt(void) { - return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); + return static_cpu_has(X86_FEATURE_XSAVEOPT); } static __always_inline __pure bool use_xsave(void) { - return static_cpu_has_safe(X86_FEATURE_XSAVE); + return static_cpu_has(X86_FEATURE_XSAVE); } static __always_inline __pure bool use_fxsr(void) { - return static_cpu_has_safe(X86_FEATURE_FXSR); + return static_cpu_has(X86_FEATURE_FXSR); } /* @@ -301,7 +301,7 @@ static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) WARN_ON(system_state != SYSTEM_BOOTING); - if (static_cpu_has_safe(X86_FEATURE_XSAVES)) + if (static_cpu_has(X86_FEATURE_XSAVES)) XSTATE_OP(XSAVES, xstate, lmask, hmask, err); else XSTATE_OP(XSAVE, xstate, lmask, hmask, err); @@ -323,7 +323,7 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) WARN_ON(system_state != SYSTEM_BOOTING); - if (static_cpu_has_safe(X86_FEATURE_XSAVES)) + if (static_cpu_has(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); @@ -461,7 +461,7 @@ static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate) * pending. Clear the x87 state here by setting it to fixed values. * "m" is a random variable that should be in L1. 
*/ - if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) { + if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" "emms\n\t" diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index c80c02c..ab5c2c6 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -30,7 +30,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x) unsigned long value; unsigned int id = (x >> 24) & 0xff; - if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) { + if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { rdmsrl(MSR_FAM10H_NODE_ID, value); id |= (value << 2) & 0xff00; } @@ -178,7 +178,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) this_cpu_write(cpu_llc_id, node); /* Account for nodes per socket in multi-core-module processors */ - if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) { + if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { rdmsrl(MSR_FAM10H_NODE_ID, val); nodes = ((val >> 3) & 7) + 1; } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 37830de..ee49981 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1475,19 +1475,11 @@ void cpu_init(void) } #endif -#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS -void warn_pre_alternatives(void) -{ - WARN(1, "You're using static_cpu_has before alternatives have run!\n"); -} -EXPORT_SYMBOL_GPL(warn_pre_alternatives); -#endif - -inline bool __static_cpu_has_safe(u16 bit) +inline bool __static_cpu_has(u16 bit) { return boot_cpu_has(bit); } -EXPORT_SYMBOL_GPL(__static_cpu_has_safe); +EXPORT_SYMBOL_GPL(__static_cpu_has); static void bsp_resume(void) { diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e574b85..3dce1ca 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -362,7 +362,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) /* make room for real-mode segments */ tsk->thread.sp0 += 16; - if (static_cpu_has_safe(X86_FEATURE_SEP)) + if (static_cpu_has(X86_FEATURE_SEP)) tsk->thread.sysenter_cs = 0; load_sp0(tss, &tsk->thread); diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index cd83d47..3a4b39a 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1431,7 +1431,7 @@ static int __init intel_pstate_init(void) if (!all_cpu_data) return -ENOMEM; - if (static_cpu_has_safe(X86_FEATURE_HWP) && !no_hwp) { + if (static_cpu_has(X86_FEATURE_HWP) && !no_hwp) { pr_info("intel_pstate: HWP enabled\n"); hwp_active++; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dd08e29..d928649 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -930,7 +930,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags) if (bio_flags & EXTENT_BIO_TREE_LOG) return 0; #ifdef CONFIG_X86 - if (static_cpu_has_safe(X86_FEATURE_XMM4_2)) + if (static_cpu_has(X86_FEATURE_XMM4_2)) return 0; #endif return 1; -- cgit v0.10.2 From a362bf9f5e7dd659b96d01382da7b855f4e5a7a1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 27 Jan 2016 09:43:25 +0100 Subject: x86/cpufeature: Get rid of the non-asm goto variant I can simply quote hpa from the mail: "Get rid of the non-asm goto variant and just fall back to dynamic if asm goto is unavailable. It doesn't make any sense, really, if it is supposed to be safe, and by now the asm goto-capable gcc is in more wide use. (Originally the gcc 3.x fallback to pure dynamic didn't exist, either.)" Booy, am I lazy. 
Cleanup the whole CC_HAVE_ASM_GOTO ifdeffery too, while at it. Suggested-by: H. Peter Anvin Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160127084325.GB30712@pd.tnic Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index a261cf2..9048c1b 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -129,17 +129,16 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; * fast paths and boot_cpu_has() otherwise! */ -#if __GNUC__ >= 4 && defined(CONFIG_X86_FAST_FEATURE_TESTS) +#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) extern bool __static_cpu_has(u16 bit); /* * Static testing of CPU features. Used the same as boot_cpu_has(). - * These are only valid after alternatives have run, but will statically - * patch the target code for additional performance. + * These will statically patch the target code for additional + * performance. */ static __always_inline __pure bool _static_cpu_has(u16 bit) { -#ifdef CC_HAVE_ASM_GOTO asm_volatile_goto("1: jmp %l[t_dynamic]\n" "2:\n" ".skip -(((5f-4f) - (2b-1b)) > 0) * " @@ -172,45 +171,6 @@ static __always_inline __pure bool _static_cpu_has(u16 bit) return false; t_dynamic: return __static_cpu_has(bit); -#else - u8 flag; - /* Open-coded due to __stringify() in ALTERNATIVE() */ - asm volatile("1: movb $2,%0\n" - "2:\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 3f - .\n" /* repl offset */ - " .word %P2\n" /* always replace */ - " .byte 2b - 1b\n" /* source len */ - " .byte 4f - 3f\n" /* replacement len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .discard,\"aw\",@progbits\n" - " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "3: movb $0,%0\n" - "4:\n" - ".previous\n" - ".section .altinstructions,\"a\"\n" - " .long 1b - .\n" /* src offset */ - " .long 5f - .\n" /* repl offset */ - " .word %P1\n" /* feature bit */ - " .byte 4b - 3b\n" /* src len */ - " .byte 6f - 5f\n" /* repl len */ - " .byte 0\n" /* pad len */ - ".previous\n" - ".section .discard,\"aw\",@progbits\n" - " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ - ".previous\n" - ".section .altinstr_replacement,\"ax\"\n" - "5: movb $1,%0\n" - "6:\n" - ".previous\n" - : "=qm" (flag) - : "i" (bit), "i" (X86_FEATURE_ALWAYS)); - return (flag == 2 ? __static_cpu_has(bit) : flag); -#endif /* CC_HAVE_ASM_GOTO */ } #define static_cpu_has(bit) \ @@ -221,7 +181,8 @@ static __always_inline __pure bool _static_cpu_has(u16 bit) ) #else /* - * gcc 3.x is too stupid to do the static test; fall back to dynamic. + * Fall back to dynamic for gcc versions which don't support asm goto. Should be + * a minority now anyway. */ #define static_cpu_has(bit) boot_cpu_has(bit) #endif -- cgit v0.10.2 From 337e4cc84021212a87b04b77b65cccc49304909e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jan 2016 22:12:07 +0100 Subject: x86/alternatives: Add an auxilary section Add .altinstr_aux for additional instructions which will be used before and/or during patching. All stuff which needs more sophisticated patching should go there. See next patch. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453842730-28463-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 74e4bf1..92dc211 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -195,6 +195,17 @@ SECTIONS :init #endif + /* + * Section for code used exclusively before alternatives are run. All + * references to such code must be patched out by alternatives, normally + * by using X86_FEATURE_ALWAYS CPU feature bit. + * + * See static_cpu_has() for an example. + */ + .altinstr_aux : AT(ADDR(.altinstr_aux) - LOAD_OFFSET) { + *(.altinstr_aux) + } + INIT_DATA_SECTION(16) .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { -- cgit v0.10.2 From 2476f2fa20568bd5d9e09cd35bcd73e99a6f4cc6 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 27 Jan 2016 09:45:25 +0100 Subject: x86/alternatives: Discard dynamic check after init Move the code to do the dynamic check to the altinstr_aux section so that it is discarded after alternatives have run and a static branch has been chosen. This way we're changing the dynamic branch from C code to assembly, which makes it *substantially* smaller while avoiding a completely unnecessary call to an out of line function. Signed-off-by: Brian Gerst [ Changed it to do TESTB, as hpa suggested. ] Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Dave Young Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kristen Carlson Accardi Cc: Laura Abbott Cc: Linus Torvalds Cc: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Ross Zwisler Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1452972124-7380-1-git-send-email-brgerst@gmail.com Link: http://lkml.kernel.org/r/20160127084525.GC30712@pd.tnic Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 9048c1b..9fba7a5 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -130,8 +130,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; */ #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) -extern bool __static_cpu_has(u16 bit); - /* * Static testing of CPU features. Used the same as boot_cpu_has(). 
* These will statically patch the target code for additional @@ -139,7 +137,7 @@ extern bool __static_cpu_has(u16 bit); */ static __always_inline __pure bool _static_cpu_has(u16 bit) { - asm_volatile_goto("1: jmp %l[t_dynamic]\n" + asm_volatile_goto("1: jmp 6f\n" "2:\n" ".skip -(((5f-4f) - (2b-1b)) > 0) * " "((5f-4f) - (2b-1b)),0x90\n" @@ -164,13 +162,20 @@ static __always_inline __pure bool _static_cpu_has(u16 bit) " .byte 0\n" /* repl len */ " .byte 0\n" /* pad len */ ".previous\n" - : : "i" (bit), "i" (X86_FEATURE_ALWAYS) - : : t_dynamic, t_no); + ".section .altinstr_aux,\"ax\"\n" + "6:\n" + " testb %[bitnum],%[cap_byte]\n" + " jnz %l[t_yes]\n" + " jmp %l[t_no]\n" + ".previous\n" + : : "i" (bit), "i" (X86_FEATURE_ALWAYS), + [bitnum] "i" (1 << (bit & 7)), + [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3]) + : : t_yes, t_no); + t_yes: return true; t_no: return false; - t_dynamic: - return __static_cpu_has(bit); } #define static_cpu_has(bit) \ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ee49981..079d83f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1475,12 +1475,6 @@ void cpu_init(void) } #endif -inline bool __static_cpu_has(u16 bit) -{ - return boot_cpu_has(bit); -} -EXPORT_SYMBOL_GPL(__static_cpu_has); - static void bsp_resume(void) { if (this_cpu->c_bsp_resume) -- cgit v0.10.2 From 8c725306993198f845038dc9e45a1267099867a6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 26 Jan 2016 22:12:09 +0100 Subject: x86/vdso: Use static_cpu_has() ... and simplify and speed up a tad. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453842730-28463-10-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 429d54d..10f7045 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -285,7 +285,7 @@ static void vgetcpu_cpu_init(void *arg) #ifdef CONFIG_NUMA node = cpu_to_node(cpu); #endif - if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) + if (static_cpu_has(X86_FEATURE_RDTSCP)) write_rdtscp_aux((node << 12) | cpu); /* -- cgit v0.10.2 From a4733143085d6c782ac1e6c85778655b6bac1d4e Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Tue, 26 Jan 2016 22:12:10 +0100 Subject: x86/boot: Simplify kernel load address alignment check We are using %rax as temporary register to check the kernel address alignment. We don't really have to since the TEST instruction does not clobber the destination operand. Suggested-by: Brian Gerst Signed-off-by: Alexander Kuleshov Signed-off-by: Borislav Petkov Cc: Alexander Popov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453531828-19291-1-git-send-email-kuleshovmail@gmail.com Link: http://lkml.kernel.org/r/1453842730-28463-11-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ffdc0e8..7c21029 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -76,9 +76,7 @@ startup_64: subq $_text - __START_KERNEL_map, %rbp /* Is the address not 2M aligned? 
*/ - movq %rbp, %rax - andl $~PMD_PAGE_MASK, %eax - testl %eax, %eax + testl $~PMD_PAGE_MASK, %ebp jnz bad_address /* -- cgit v0.10.2 From ca0bb0798022732773752fee97bb633c6f3623d2 Mon Sep 17 00:00:00 2001 From: "wim.coekaerts@oracle.com" Date: Fri, 29 Jan 2016 09:39:38 -0800 Subject: Add sun4v_wdt watchdog driver This driver adds sparc hypervisor watchdog support. The default timeout is 60 seconds and the range is between 1 and 31536000 seconds. Both watchdog-resolution and watchdog-max-timeout MD properties settings are supported. Signed-off-by: Wim Coekaerts Reviewed-by: Julian Calaby Reviewed-by: Guenter Roeck Signed-off-by: David S. Miller diff --git a/Documentation/watchdog/watchdog-parameters.txt b/Documentation/watchdog/watchdog-parameters.txt index 9f9ec9f..4e4b6f1 100644 --- a/Documentation/watchdog/watchdog-parameters.txt +++ b/Documentation/watchdog/watchdog-parameters.txt @@ -400,3 +400,7 @@ wm8350_wdt: nowayout: Watchdog cannot be stopped once started (default=kernel config parameter) ------------------------------------------------- +sun4v_wdt: +timeout_ms: Watchdog timeout in milliseconds 1..180000, default=60000) +nowayout: Watchdog cannot be stopped once started +------------------------------------------------- diff --git a/arch/sparc/kernel/hvcalls.S b/arch/sparc/kernel/hvcalls.S index afbaba5..d127130 100644 --- a/arch/sparc/kernel/hvcalls.S +++ b/arch/sparc/kernel/hvcalls.S @@ -338,8 +338,9 @@ ENTRY(sun4v_mach_set_watchdog) mov %o1, %o4 mov HV_FAST_MACH_SET_WATCHDOG, %o5 ta HV_FAST_TRAP + brnz,a,pn %o4, 0f stx %o1, [%o4] - retl +0: retl nop ENDPROC(sun4v_mach_set_watchdog) diff --git a/arch/sparc/kernel/sparc_ksyms_64.c b/arch/sparc/kernel/sparc_ksyms_64.c index a92d5d2..9e034f2 100644 --- a/arch/sparc/kernel/sparc_ksyms_64.c +++ b/arch/sparc/kernel/sparc_ksyms_64.c @@ -37,6 +37,7 @@ EXPORT_SYMBOL(sun4v_niagara_getperf); EXPORT_SYMBOL(sun4v_niagara_setperf); EXPORT_SYMBOL(sun4v_niagara2_getperf); EXPORT_SYMBOL(sun4v_niagara2_setperf); +EXPORT_SYMBOL(sun4v_mach_set_watchdog); /* from hweight.S */ EXPORT_SYMBOL(__arch_hweight8); diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 4f0e7be..30d38ae 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -1565,6 +1565,17 @@ config WATCHDOG_RIO machines. The watchdog timeout period is normally one minute but can be changed with a boot-time parameter. +config WATCHDOG_SUN4V + tristate "Sun4v Watchdog support" + select WATCHDOG_CORE + depends on SPARC64 + help + Say Y here to support the hypervisor watchdog capability embedded + in the SPARC sun4v architecture. + + To compile this driver as a module, choose M here. The module will + be called sun4v_wdt. + # XTENSA Architecture # Xen Architecture diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index f566753..f6a6a38 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -179,6 +179,7 @@ obj-$(CONFIG_SH_WDT) += shwdt.o obj-$(CONFIG_WATCHDOG_RIO) += riowd.o obj-$(CONFIG_WATCHDOG_CP1XXX) += cpwd.o +obj-$(CONFIG_WATCHDOG_SUN4V) += sun4v_wdt.o # XTENSA Architecture diff --git a/drivers/watchdog/sun4v_wdt.c b/drivers/watchdog/sun4v_wdt.c new file mode 100644 index 0000000..1467fe5 --- /dev/null +++ b/drivers/watchdog/sun4v_wdt.c @@ -0,0 +1,191 @@ +/* + * sun4v watchdog timer + * (c) Copyright 2016 Oracle Corporation + * + * Implement a simple watchdog driver using the built-in sun4v hypervisor + * watchdog support. If time expires, the hypervisor stops or bounces + * the guest domain. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#define WDT_TIMEOUT 60 +#define WDT_MAX_TIMEOUT 31536000 +#define WDT_MIN_TIMEOUT 1 +#define WDT_DEFAULT_RESOLUTION_MS 1000 /* 1 second */ + +static unsigned int timeout; +module_param(timeout, uint, 0); +MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds (default=" + __MODULE_STRING(WDT_TIMEOUT) ")"); + +static bool nowayout = WATCHDOG_NOWAYOUT; +module_param(nowayout, bool, S_IRUGO); +MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" + __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); + +static int sun4v_wdt_stop(struct watchdog_device *wdd) +{ + sun4v_mach_set_watchdog(0, NULL); + + return 0; +} + +static int sun4v_wdt_ping(struct watchdog_device *wdd) +{ + int hverr; + + /* + * HV watchdog timer will round up the timeout + * passed in to the nearest multiple of the + * watchdog resolution in milliseconds. + */ + hverr = sun4v_mach_set_watchdog(wdd->timeout * 1000, NULL); + if (hverr == HV_EINVAL) + return -EINVAL; + + return 0; +} + +static int sun4v_wdt_set_timeout(struct watchdog_device *wdd, + unsigned int timeout) +{ + wdd->timeout = timeout; + + return 0; +} + +static const struct watchdog_info sun4v_wdt_ident = { + .options = WDIOF_SETTIMEOUT | + WDIOF_MAGICCLOSE | + WDIOF_KEEPALIVEPING, + .identity = "sun4v hypervisor watchdog", + .firmware_version = 0, +}; + +static struct watchdog_ops sun4v_wdt_ops = { + .owner = THIS_MODULE, + .start = sun4v_wdt_ping, + .stop = sun4v_wdt_stop, + .ping = sun4v_wdt_ping, + .set_timeout = sun4v_wdt_set_timeout, +}; + +static struct watchdog_device wdd = { + .info = &sun4v_wdt_ident, + .ops = &sun4v_wdt_ops, + .min_timeout = WDT_MIN_TIMEOUT, + .max_timeout = WDT_MAX_TIMEOUT, + .timeout = WDT_TIMEOUT, +}; + +static int __init sun4v_wdt_init(void) +{ + struct mdesc_handle *handle; + u64 node; + const u64 *value; + int err = 0; + unsigned long major = 1, minor = 1; + + /* + * There are 2 properties that can be set from the control + * domain for the watchdog. + * watchdog-resolution + * watchdog-max-timeout + * + * We can expect a handle to be returned otherwise something + * serious is wrong. Correct to return -ENODEV here. + */ + + handle = mdesc_grab(); + if (!handle) + return -ENODEV; + + node = mdesc_node_by_name(handle, MDESC_NODE_NULL, "platform"); + err = -ENODEV; + if (node == MDESC_NODE_NULL) + goto out_release; + + /* + * This is a safe way to validate if we are on the right + * platform. + */ + if (sun4v_hvapi_register(HV_GRP_CORE, major, &minor)) + goto out_hv_unreg; + + /* Allow value of watchdog-resolution up to 1s (default) */ + value = mdesc_get_property(handle, node, "watchdog-resolution", NULL); + err = -EINVAL; + if (value) { + if (*value == 0 || + *value > WDT_DEFAULT_RESOLUTION_MS) + goto out_hv_unreg; + } + + value = mdesc_get_property(handle, node, "watchdog-max-timeout", NULL); + if (value) { + /* + * If the property value (in ms) is smaller than + * min_timeout, return -EINVAL. + */ + if (*value < wdd.min_timeout * 1000) + goto out_hv_unreg; + + /* + * If the property value is smaller than + * default max_timeout then set watchdog max_timeout to + * the value of the property in seconds. 
+ */ + if (*value < wdd.max_timeout * 1000) + wdd.max_timeout = *value / 1000; + } + + watchdog_init_timeout(&wdd, timeout, NULL); + + watchdog_set_nowayout(&wdd, nowayout); + + err = watchdog_register_device(&wdd); + if (err) + goto out_hv_unreg; + + pr_info("initialized (timeout=%ds, nowayout=%d)\n", + wdd.timeout, nowayout); + + mdesc_release(handle); + + return 0; + +out_hv_unreg: + sun4v_hvapi_unregister(HV_GRP_CORE); + +out_release: + mdesc_release(handle); + return err; +} + +static void __exit sun4v_wdt_exit(void) +{ + sun4v_hvapi_unregister(HV_GRP_CORE); + watchdog_unregister_device(&wdd); +} + +module_init(sun4v_wdt_init); +module_exit(sun4v_wdt_exit); + +MODULE_AUTHOR("Wim Coekaerts "); +MODULE_DESCRIPTION("sun4v watchdog driver"); +MODULE_LICENSE("GPL"); -- cgit v0.10.2 From b7765086b7c5a5be029a739c2caa161da51c2076 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 31 Jan 2016 09:33:26 -0800 Subject: x86/entry/64: Fix an IRQ state error on ptregs-using syscalls I messed up the IRQ state when jumping off the fast path due to invocation of a ptregs-using syscall. This bug shouldn't have had any impact yet, but it would have caused problems with subsequent context tracking cleanups. Reported-and-tested-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 1e423bff959e x86/entry/64: ("Migrate the 64-bit syscall slow path to C") Link: http://lkml.kernel.org/r/ab92cd365fb7b0a56869e920017790d96610fdca.1454261517.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 567aa52..9f7bb80 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -191,8 +191,8 @@ entry_SYSCALL_64_fastpath: /* * This call instruction is handled specially in stub_ptregs_64. - * It might end up jumping to the slow path. If it jumps, RAX is - * clobbered. + * It might end up jumping to the slow path. If it jumps, RAX + * and all argument registers are clobbered. */ call *sys_call_table(, %rax, 8) .Lentry_SYSCALL_64_after_fastpath_call: @@ -315,17 +315,24 @@ END(entry_SYSCALL_64) ENTRY(stub_ptregs_64) /* * Syscalls marked as needing ptregs land here. - * If we are on the fast path, we need to save the extra regs. - * If we are on the slow path, the extra regs are already saved. + * If we are on the fast path, we need to save the extra regs, + * which we achieve by trying again on the slow path. If we are on + * the slow path, the extra regs are already saved. * * RAX stores a pointer to the C function implementing the syscall. + * IRQs are on. */ cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp) jne 1f - /* Called from fast path -- pop return address and jump to slow path */ + /* + * Called from fast path -- disable IRQs again, pop return address + * and jump to slow path + */ + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF popq %rax - jmp entry_SYSCALL64_slow_path /* called from fast path */ + jmp entry_SYSCALL64_slow_path 1: /* Called from C */ -- cgit v0.10.2 From eb2a54c3271cb6443ae93ec44a91687b60c559a3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 31 Jan 2016 09:33:27 -0800 Subject: x86/entry/64: Fix fast-path syscall return register state I was fishing RIP (i.e. RCX) out of pt_regs->cx and RFLAGS (i.e. R11) out of pt_regs->r11. While it usually worked (pt_regs started out with CX == IP and R11 == FLAGS), it was very fragile. 
In particular, it broke sys_iopl() because sys_iopl() forgot to mark itself as using ptregs. Undo that part of the syscall rework. There was no compelling reason to do it this way. While I'm at it, load RCX and R11 before the other regs to be a little friendlier to the CPU, as they will be the first of the reloaded registers to be used. Reported-and-tested-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 1e423bff959e x86/entry/64: ("Migrate the 64-bit syscall slow path to C") Link: http://lkml.kernel.org/r/a85f8360c397e48186a9bc3e565ad74307a7b011.1454261517.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9f7bb80..70eadb0 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -212,7 +212,9 @@ entry_SYSCALL_64_fastpath: LOCKDEP_SYS_EXIT TRACE_IRQS_ON /* user mode is traced as IRQs on */ - RESTORE_C_REGS + movq RIP(%rsp), %rcx + movq EFLAGS(%rsp), %r11 + RESTORE_C_REGS_EXCEPT_RCX_R11 movq RSP(%rsp), %rsp USERGS_SYSRET64 -- cgit v0.10.2 From bb56968a37a44070de92d5690c4b08dd98a5d3f1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 31 Jan 2016 09:33:28 -0800 Subject: x86/syscalls/64: Mark sys_iopl() as using ptregs sys_iopl() both reads and writes pt_regs->flags. Mark it as using ptregs. This isn't strictly necessary, as pt_regs->flags is available even in the fast path, but this is very lightweight now that we have syscall qualifiers and it could avoid some pain down the road. Reported-and-tested-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3de0ca692fa8bf414c5e3d7afe3e6195d1a10e1f.1454261517.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index dcf107c..2e5b565 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -178,7 +178,7 @@ 169 common reboot sys_reboot 170 common sethostname sys_sethostname 171 common setdomainname sys_setdomainname -172 common iopl sys_iopl +172 common iopl sys_iopl/ptregs 173 common ioperm sys_ioperm 174 64 create_module 175 common init_module sys_init_module -- cgit v0.10.2 From d99e1bd175f4291ddb6e62b22bb5bdbe3976389a Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Mon, 25 Jan 2016 20:41:46 +0100 Subject: x86/entry/traps: Refactor preemption and interrupt flag handling Make the preemption and interrupt flag handling more readable by removing preempt_conditional_sti() and preempt_conditional_cli() helpers and using preempt_disable() and preempt_enable_no_resched() instead. Rename contitional_sti() and conditional_cli() to the more understandable cond_local_irq_enable() and cond_local_irq_disable() respectively, while at it. Suggested-by: Borislav Petkov Signed-off-by: Alexander Kuleshov [ Boris: massage text. 
] Signed-off-by: Borislav Petkov Acked-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Tony Luck Cc: Wang Nan Link: http://lkml.kernel.org/r/1453750913-4781-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ade185a..410e8e2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -83,30 +83,16 @@ gate_desc idt_table[NR_VECTORS] __page_aligned_bss; DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); -static inline void conditional_sti(struct pt_regs *regs) +static inline void cond_local_irq_enable(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } -static inline void preempt_conditional_sti(struct pt_regs *regs) -{ - preempt_count_inc(); - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void conditional_cli(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_disable(); -} - -static inline void preempt_conditional_cli(struct pt_regs *regs) +static inline void cond_local_irq_disable(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); - preempt_count_dec(); } void ist_enter(struct pt_regs *regs) @@ -286,7 +272,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { - conditional_sti(regs); + cond_local_irq_enable(regs); do_trap(trapnr, signr, str, regs, error_code, fill_trap_info(regs, signr, trapnr, &info)); } @@ -368,7 +354,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) if (notify_die(DIE_TRAP, "bounds", regs, error_code, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) return; - conditional_sti(regs); + cond_local_irq_enable(regs); if (!user_mode(regs)) die("bounds", regs, error_code); @@ -443,7 +429,7 @@ do_general_protection(struct pt_regs *regs, long error_code) struct task_struct *tsk; RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - conditional_sti(regs); + cond_local_irq_enable(regs); if (v8086_mode(regs)) { local_irq_enable(); @@ -517,9 +503,11 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) * as we may switch to the interrupt stack. 
*/ debug_stack_usage_inc(); - preempt_conditional_sti(regs); + preempt_disable(); + cond_local_irq_enable(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); + cond_local_irq_disable(regs); + preempt_enable_no_resched(); debug_stack_usage_dec(); exit: ist_exit(regs); @@ -648,12 +636,14 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_inc(); /* It's safe to allow irq's after DR6 has been saved */ - preempt_conditional_sti(regs); + preempt_disable(); + cond_local_irq_enable(regs); if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, X86_TRAP_DB); - preempt_conditional_cli(regs); + cond_local_irq_disable(regs); + preempt_enable_no_resched(); debug_stack_usage_dec(); goto exit; } @@ -673,7 +663,8 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); - preempt_conditional_cli(regs); + cond_local_irq_disable(regs); + preempt_enable_no_resched(); debug_stack_usage_dec(); exit: @@ -696,7 +687,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) return; - conditional_sti(regs); + cond_local_irq_enable(regs); if (!user_mode(regs)) { if (!fixup_exception(regs)) { @@ -743,7 +734,7 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code) dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { - conditional_sti(regs); + cond_local_irq_enable(regs); } dotraplinkage void @@ -756,7 +747,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) if (read_cr0() & X86_CR0_EM) { struct math_emu_info info = { }; - conditional_sti(regs); + cond_local_irq_enable(regs); info.regs = regs; math_emulate(&info); @@ -765,7 +756,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) #endif fpu__restore(¤t->thread.fpu); /* interrupts still off */ #ifdef CONFIG_X86_32 - conditional_sti(regs); + cond_local_irq_enable(regs); #endif } NOKPROBE_SYMBOL(do_device_not_available); -- cgit v0.10.2 From 16aaa53756501914a863ae7a15fcb070dc27c3d7 Mon Sep 17 00:00:00 2001 From: Huaitong Han Date: Mon, 25 Jan 2016 20:41:47 +0100 Subject: x86/cpufeature: Use enum cpuid_leafs instead of magic numbers Most of the magic numbers in x86_capability[] have been converted to 'enum cpuid_leafs', and this patch updates the remaining part. Signed-off-by: Huaitong Han Signed-off-by: Borislav Petkov Cc: Alexander Kuleshov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: David Vrabel Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Hector Marco-Gisbert Cc: Jiang Liu Cc: Kees Cook Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rusty Russell Cc: Thomas Gleixner Cc: Tony Luck Cc: Viresh Kumar Cc: lguest@lists.ozlabs.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1453750913-4781-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 1514753..15340e3 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -256,7 +256,7 @@ extern int force_personality32; instruction set this CPU supports. This could be done in user space, but it's not easy, and we've already done it here. 
*/ -#define ELF_HWCAP (boot_cpu_data.x86_capability[0]) +#define ELF_HWCAP (boot_cpu_data.x86_capability[CPUID_1_EDX]) /* This yields a string that ld.so will use to load implementation specific libraries for optimization. This is more specific in diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 30ca760..97340f2 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -408,7 +408,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) processor.cpuflag = CPU_ENABLED; processor.cpufeature = (boot_cpu_data.x86 << 8) | (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; - processor.featureflag = boot_cpu_data.x86_capability[0]; + processor.featureflag = boot_cpu_data.x86_capability[CPUID_1_EDX]; processor.reserved[0] = 0; processor.reserved[1] = 0; for (i = 0; i < 2; i++) { diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 4ba229a..a9033ae 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1535,7 +1535,7 @@ __init void lguest_init(void) */ cpu_detect(&new_cpu_data); /* head.S usually sets up the first capability word, so do it here. */ - new_cpu_data.x86_capability[0] = cpuid_edx(1); + new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); /* Math is always hard! */ set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d09e4c9..2c26108 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1654,7 +1654,7 @@ asmlinkage __visible void __init xen_start_kernel(void) cpu_detect(&new_cpu_data); set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); new_cpu_data.wp_works_ok = 1; - new_cpu_data.x86_capability[0] = cpuid_edx(1); + new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); #endif if (xen_start_info->mod_start) { -- cgit v0.10.2 From bfbe0eeb769e2aff2cb1fc6845c4e4b7eac40bb3 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 25 Jan 2016 20:41:48 +0100 Subject: x86/mce: Fix order of AMD MCE init function call In mce_amd_feature_init() we take decisions based on mce_flags being set or not. So the feature detection using CPUID should naturally be ordered before we call mce_amd_feature_init(). Fix that here. Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1453750913-4781-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a006f4c..b718080 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1617,10 +1617,10 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) case X86_VENDOR_AMD: { u32 ebx = cpuid_ebx(0x80000007); - mce_amd_feature_init(c); mce_flags.overflow_recov = !!(ebx & BIT(0)); mce_flags.succor = !!(ebx & BIT(1)); mce_flags.smca = !!(ebx & BIT(3)); + mce_amd_feature_init(c); break; } -- cgit v0.10.2 From 284b965c146f482b4a411133f62288d52b7e3a72 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 25 Jan 2016 20:41:49 +0100 Subject: x86/mce/AMD: Do not perform shared bank check for future processors Fam17h and above should not require a check to see if a bank is shared or not. For shared banks, there will always be only one core that has visibility over the MSRs and only that particular core will be allowed to write to the MSRs. Fix the code to return early if we have Scalable MCA support. 
No change in functionality for earlier processors. Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Fengguang Wu [ Massaged the changelog text, fixed kbuild test robot build warning. ] Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1453750913-4781-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index e99b150..3068ce2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -84,6 +84,13 @@ struct thresh_restart { static inline bool is_shared_bank(int bank) { + /* + * Scalable MCA provides for only one core to have access to the MSRs of + * a shared bank. + */ + if (mce_flags.smca) + return false; + /* Bank 4 is for northbridge reporting and is thus shared */ return (bank == 4); } -- cgit v0.10.2 From 60f116fca162d9488f783f5014779463243ab7a2 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 25 Jan 2016 20:41:50 +0100 Subject: x86/mce/AMD: Reduce number of blocks scanned per bank From Fam17h onwards, the number of extended MCx_MISC register blocks is reduced to 4. It is an architectural change from what we had on earlier processors. Although theoritically the total number of extended MCx_MISC registers was 8 in earlier processor families, in practice we only had to use the extra registers for MC4. And only 2 of those were used. So this change does not affect older processors. Tested on Fam10h and Fam15h systems. Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1453750913-4781-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 3068ce2..59822279 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -28,7 +28,7 @@ #include #include -#define NR_BLOCKS 9 +#define NR_BLOCKS 5 #define THRESHOLD_MAX 0xFFF #define INT_TYPE_APIC 0x00020000 #define MASK_VALID_HI 0x80000000 -- cgit v0.10.2 From f57a1f3c14b9182f1fea667f5a38a1094699db7c Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 25 Jan 2016 20:41:51 +0100 Subject: x86/mce/AMD: Fix LVT offset configuration for thresholding For processor families with the Scalable MCA feature, the LVT offset for threshold interrupts is configured only in MSR 0xC0000410 and not in each per bank MISC register as was done in earlier families. Obtain the LVT offset from the correct MSR for those families. 
Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1453750913-4781-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 59822279..a77a452 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -49,6 +49,11 @@ #define DEF_LVT_OFF 0x2 #define DEF_INT_TYPE_APIC 0x2 +/* Scalable MCA: */ + +/* Threshold LVT offset is at MSR0xC0000410[15:12] */ +#define SMCA_THR_LVT_OFF 0xF000 + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -142,6 +147,14 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) } if (apic != msr) { + /* + * On SMCA CPUs, LVT offset is programmed at a different MSR, and + * the BIOS provides the value. The original field where LVT offset + * was set is reserved. Return early here: + */ + if (mce_flags.smca) + return 0; + pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, apic, b->bank, b->block, b->address, hi, lo); @@ -300,7 +313,19 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) goto init; b.interrupt_enable = 1; - new = (high & MASK_LVTOFF_HI) >> 20; + + if (mce_flags.smca) { + u32 smca_low, smca_high; + + /* Gather LVT offset for thresholding: */ + if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) + break; + + new = (smca_low & SMCA_THR_LVT_OFF) >> 12; + } else { + new = (high & MASK_LVTOFF_HI) >> 20; + } + offset = setup_APIC_mce_threshold(offset, new); if ((offset == new) && -- cgit v0.10.2 From 429893b16d35d309ed6b35136aad5f908a08d9b9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 25 Jan 2016 20:41:52 +0100 Subject: x86/mce/AMD: Carve out threshold block preparation mce_amd_feature_init() was getting pretty fat, carve out the threshold_block setup into a separate function in order to simplify flow and make it more understandable. No functionality change. 
Signed-off-by: Borislav Petkov Cc: Aravind Gopalakrishnan Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1453750913-4781-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index a77a452..f2860a1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -267,14 +267,59 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } +static int +prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, + int offset, u32 misc_high) +{ + unsigned int cpu = smp_processor_id(); + struct threshold_block b; + int new; + + if (!block) + per_cpu(bank_map, cpu) |= (1 << bank); + + memset(&b, 0, sizeof(b)); + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = addr; + b.interrupt_capable = lvt_interrupt_supported(bank, misc_high); + + if (!b.interrupt_capable) + goto done; + + b.interrupt_enable = 1; + + if (mce_flags.smca) { + u32 smca_low, smca_high; + + /* Gather LVT offset for thresholding: */ + if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) + goto out; + + new = (smca_low & SMCA_THR_LVT_OFF) >> 12; + } else { + new = (misc_high & MASK_LVTOFF_HI) >> 20; + } + + offset = setup_APIC_mce_threshold(offset, new); + + if ((offset == new) && (mce_threshold_vector != amd_threshold_interrupt)) + mce_threshold_vector = amd_threshold_interrupt; + +done: + mce_threshold_block_init(&b, offset); + +out: + return offset; +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { - struct threshold_block b; - unsigned int cpu = smp_processor_id(); u32 low = 0, high = 0, address = 0; unsigned int bank, block; - int offset = -1, new; + int offset = -1; for (bank = 0; bank < mca_cfg.banks; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@ -299,41 +344,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) (high & MASK_LOCKED_HI)) continue; - if (!block) - per_cpu(bank_map, cpu) |= (1 << bank); - - memset(&b, 0, sizeof(b)); - b.cpu = cpu; - b.bank = bank; - b.block = block; - b.address = address; - b.interrupt_capable = lvt_interrupt_supported(bank, high); - - if (!b.interrupt_capable) - goto init; - - b.interrupt_enable = 1; - - if (mce_flags.smca) { - u32 smca_low, smca_high; - - /* Gather LVT offset for thresholding: */ - if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) - break; - - new = (smca_low & SMCA_THR_LVT_OFF) >> 12; - } else { - new = (high & MASK_LVTOFF_HI) >> 20; - } - - offset = setup_APIC_mce_threshold(offset, new); - - if ((offset == new) && - (mce_threshold_vector != amd_threshold_interrupt)) - mce_threshold_vector = amd_threshold_interrupt; - -init: - mce_threshold_block_init(&b, offset); + offset = prepare_threshold_block(bank, block, address, offset, high); } } -- cgit v0.10.2 From e6c8f1873be8a14c7e44202df1f7e6ea61bf3352 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 25 Jan 2016 20:41:53 +0100 Subject: x86/mce/AMD: Set MCAX Enable bit It is required for the OS to acknowledge that it is using the MCAX register set and its associated fields by setting the 'McaXEnable' bit in each bank's MCi_CONFIG register. If it is not set, then all UC errors will cause a system panic. 
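In practice the acknowledgement is one read-modify-write of each bank's MCi_CONFIG MSR; a minimal sketch using the MSR_AMD64_SMCA_MCx_CONFIG and SMCA_MCAX_EN_OFF names introduced by the patch below (illustrative only):

    u32 low, high;
    u32 smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank);  /* 0xc0002004 + 0x10 * bank */

    if (!rdmsr_safe(smca_addr, &low, &high)) {
            high |= SMCA_MCAX_EN_OFF;  /* McaXEnable */
            wrmsr(smca_addr, low, high);
    }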
Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1453750913-4781-9-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b05402e..5523465 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -264,6 +264,10 @@ #define MSR_IA32_MC0_CTL2 0x00000280 #define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) +/* 'SMCA': AMD64 Scalable MCA */ +#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 +#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) + #define MSR_P6_PERFCTR0 0x000000c1 #define MSR_P6_PERFCTR1 0x000000c2 #define MSR_P6_EVNTSEL0 0x00000186 diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f2860a1..88de27b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -54,6 +54,14 @@ /* Threshold LVT offset is at MSR0xC0000410[15:12] */ #define SMCA_THR_LVT_OFF 0xF000 +/* + * OS is required to set the MCAX bit to acknowledge that it is now using the + * new MSR ranges and new registers under each bank. It also means that the OS + * will configure deferred errors in the new MCx_CONFIG register. If the bit is + * not set, uncorrectable errors will cause a system panic. + */ +#define SMCA_MCAX_EN_OFF 0x1 + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -292,6 +300,12 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, if (mce_flags.smca) { u32 smca_low, smca_high; + u32 smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank); + + if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) { + smca_high |= SMCA_MCAX_EN_OFF; + wrmsr(smca_addr, smca_low, smca_high); + } /* Gather LVT offset for thresholding: */ if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high)) -- cgit v0.10.2 From 744070e0e4ac691bb43608f7bf46a9641a9cf342 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 28 Jan 2016 00:40:48 +0900 Subject: perf hists: Fix min callchain hits calculation The total period should be obtained using hists__total_period() since it takes filtered entries into account. In addition, if callchain mode is 'fractal', the total period should be the entry's period.
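Put differently, the rule implemented by the patch below is roughly this (simplified sketch, not a verbatim excerpt):

    u64 total = hists__total_period(hists);  /* takes filtered entries into account */

    /* In 'fractal' (graph-relative) mode the limit is relative to the entry itself: */
    if (callchain_param.mode == CHAIN_GRAPH_REL)
            total = symbol_conf.cumulate_callchain ? he->stat_acc->period
                                                   : he->stat.period;

    min_callchain_hits = total * (callchain_param.min_percent / 100);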
Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1453909257-26015-2-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 81ce0af..b961946 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1163,9 +1163,18 @@ static void __hists__insert_output_entry(struct rb_root *entries, struct rb_node *parent = NULL; struct hist_entry *iter; - if (use_callchain) + if (use_callchain) { + if (callchain_param.mode == CHAIN_GRAPH_REL) { + u64 total = he->stat.period; + + if (symbol_conf.cumulate_callchain) + total = he->stat_acc->period; + + min_callchain_hits = total * (callchain_param.min_percent / 100); + } callchain_param.sort(&he->sorted_chain, he->callchain, min_callchain_hits, &callchain_param); + } while (*p != NULL) { parent = *p; @@ -1195,7 +1204,7 @@ void hists__output_resort(struct hists *hists, struct ui_progress *prog) else use_callchain = symbol_conf.use_callchain; - min_callchain_hits = hists->stats.total_period * (callchain_param.min_percent / 100); + min_callchain_hits = hists__total_period(hists) * (callchain_param.min_percent / 100); if (sort__need_collapse) root = &hists->entries_collapsed; -- cgit v0.10.2 From 0f58474ec835f6fc80af2cde2c7ed5495cd212ba Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 28 Jan 2016 00:40:49 +0900 Subject: perf hists: Update hists' total period when adding entries Currently the hist entry addition path doesn't update the total_period of hists; it's calculated during the 'resort' path. But the resort path needs to know the total period before doing its job because it's used for calculating the percent limit of callchains in hist entries. So this patch updates the total period during the addition path. It makes the percent limit of callchains work (again). Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1453909257-26015-3-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index b961946..098310b 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -432,8 +432,12 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, cmp = hist_entry__cmp(he, entry); if (!cmp) { - if (sample_self) + if (sample_self) { he_stat__add_period(&he->stat, period, weight); + hists->stats.total_period += period; + if (!he->filtered) + hists->stats.total_non_filtered_period += period; + } if (symbol_conf.cumulate_callchain) he_stat__add_period(he->stat_acc, period, weight); @@ -466,7 +470,10 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, if (!he) return NULL; - hists->nr_entries++; + if (sample_self) + hists__inc_stats(hists, he); + else + hists->nr_entries++; rb_link_node(&he->rb_node_in, parent, p); rb_insert_color(&he->rb_node_in, hists->entries_in); -- cgit v0.10.2 From 2665b4528d0522ef073c2bde33cf9a7bd7391164 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 28 Jan 2016 00:40:50 +0900 Subject: perf report: Apply --percent-limit to callchains also Currently --percent-limit option only works for hist entries.
However, it'd be better to have the same effect on callchains as well. Requested-by: Andi Kleen Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1453909257-26015-4-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 2bf537f..72ed0b4 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -75,7 +75,10 @@ static int report__config(const char *var, const char *value, void *cb) return 0; } if (!strcmp(var, "report.percent-limit")) { - rep->min_percent = strtof(value, NULL); + double pcnt = strtof(value, NULL); + + rep->min_percent = pcnt; + callchain_param.min_percent = pcnt; return 0; } if (!strcmp(var, "report.children")) { @@ -633,8 +636,10 @@ parse_percent_limit(const struct option *opt, const char *str, int unset __maybe_unused) { struct report *rep = opt->value; + double pcnt = strtof(str, NULL); - rep->min_percent = strtof(str, NULL); + rep->min_percent = pcnt; + callchain_param.min_percent = pcnt; return 0; } -- cgit v0.10.2 From 7e597d327eca3d92a759542ff707cba61af3a718 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 28 Jan 2016 00:40:51 +0900 Subject: perf report: Get rid of hist_entry__callchain_fprintf() It's just a wrapper function to align the start position of callchains to 'comm' of each thread if it's the first sort key. But it doesn't work with tracepoint events or with the upcoming hierarchy view. Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1453909257-26015-5-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 387110d..8e25f7d 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -349,30 +349,6 @@ static size_t hist_entry_callchain__fprintf(struct hist_entry *he, return 0; } -static size_t hist_entry__callchain_fprintf(struct hist_entry *he, - struct hists *hists, - FILE *fp) -{ - int left_margin = 0; - u64 total_period = hists->stats.total_period; - - if (field_order == NULL && (sort_order == NULL || - !prefixcmp(sort_order, "comm"))) { - struct perf_hpp_fmt *fmt; - - perf_hpp__for_each_format(fmt) { - if (!perf_hpp__is_sort_entry(fmt)) - continue; - - /* must be 'comm' sort entry */ - left_margin = fmt->width(fmt, NULL, hists_to_evsel(hists)); - left_margin -= thread__comm_len(he->thread); - break; - } - } - return hist_entry_callchain__fprintf(he, total_period, left_margin, fp); -} - static int hist_entry__snprintf(struct hist_entry *he, struct perf_hpp *hpp) { const char *sep = symbol_conf.field_sep; @@ -418,6 +394,7 @@ static int hist_entry__fprintf(struct hist_entry *he, size_t size, .buf = bf, .size = size, }; + u64 total_period = hists->stats.total_period; if (size == 0 || size > bfsz) size = hpp.size = bfsz; @@ -427,7 +404,7 @@ static int hist_entry__fprintf(struct hist_entry *he, size_t size, ret = fprintf(fp, "%s\n", bf); if (symbol_conf.use_callchain) - ret += hist_entry__callchain_fprintf(he, hists, fp); + ret += hist_entry_callchain__fprintf(he, total_period, 0, fp); return ret; } -- cgit v0.10.2 From 54d27b3119e2eecbb3dfbf821db90fab25f6c523 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 28 Jan 2016 00:40:52 +0900 Subject: perf callchain: Pass parent_samples to __callchain__fprintf_graph()
Pass hist entry's period to graph callchain print function. This info is needed by later patch to determine whether it can omit percentage of top-level node or not. No functional change intended. Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1453909257-26015-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 8e25f7d..96188ea 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -166,7 +166,8 @@ static size_t __callchain__fprintf_graph(FILE *fp, struct rb_root *root, } static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root, - u64 total_samples, int left_margin) + u64 total_samples, u64 parent_samples, + int left_margin) { struct callchain_node *cnode; struct callchain_list *chain; @@ -213,6 +214,9 @@ static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root, root = &cnode->rb_root; } + if (callchain_param.mode == CHAIN_GRAPH_REL) + total_samples = parent_samples; + ret += __callchain__fprintf_graph(fp, root, total_samples, 1, 1, left_margin); ret += fprintf(fp, "\n"); @@ -323,16 +327,19 @@ static size_t hist_entry_callchain__fprintf(struct hist_entry *he, u64 total_samples, int left_margin, FILE *fp) { + u64 parent_samples = he->stat.period; + + if (symbol_conf.cumulate_callchain) + parent_samples = he->stat_acc->period; + switch (callchain_param.mode) { case CHAIN_GRAPH_REL: - return callchain__fprintf_graph(fp, &he->sorted_chain, - symbol_conf.cumulate_callchain ? - he->stat_acc->period : he->stat.period, - left_margin); + return callchain__fprintf_graph(fp, &he->sorted_chain, total_samples, + parent_samples, left_margin); break; case CHAIN_GRAPH_ABS: return callchain__fprintf_graph(fp, &he->sorted_chain, total_samples, - left_margin); + parent_samples, left_margin); break; case CHAIN_FLAT: return callchain__fprintf_flat(fp, &he->sorted_chain, total_samples); -- cgit v0.10.2 From 7ed5d6e28a0a1a54f554b0ab9c38a6061e7cac9e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 28 Jan 2016 00:40:53 +0900 Subject: perf report: Fix percent display in callchains on --stdio When there's only a single callchain, perf doesn't print its percentage in front of the symbols. This is because it assumes that the percentage is same as parents. But if a percent limit is applied, it's possible that there are actually a couple of child nodes but only one of them is shown. In this case it should display the percent to prevent misunderstanding of its percentage is same as the parent's. For example, let's see the following callchain. $ perf report -s comm --percent-limit 0.01 --stdio ... 9.95% swapper | |--7.57%--intel_idle | cpuidle_enter_state | cpuidle_enter | call_cpuidle | cpu_startup_entry | | | |--4.89%--start_secondary | | | --2.68%--rest_init | start_kernel | x86_64_start_reservations | x86_64_start_kernel | |--0.15%--__schedule | | | |--0.13%--schedule | | schedule_preempt_disable | | cpu_startup_entry | | | | | |--0.09%--start_secondary | | | | | --0.04%--rest_init | | start_kernel | | x86_64_start_reservations | | x86_64_start_kernel | | | --0.01%--schedule_preempt_disabled | cpu_startup_entry ... Current code omits the percent if 'intel_idle' becomes the only node when percent limit is set to 0.5%, its percent is not 9.95% but users will assume it incorrectly. Before: $ perf report --percent-limit 0.5 --stdio ... 
9.95% swapper | ---intel_idle cpuidle_enter_state cpuidle_enter call_cpuidle cpu_startup_entry | |--4.89%--start_secondary | --2.68%--rest_init start_kernel x86_64_start_reservations x86_64_start_kernel After: $ perf report --percent-limit 0.5 --stdio ... 9.95% swapper | --7.57%--intel_idle cpuidle_enter_state cpuidle_enter call_cpuidle cpu_startup_entry | |--4.89%--start_secondary | --2.68%--rest_init start_kernel x86_64_start_reservations x86_64_start_kernel Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1453909257-26015-7-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 96188ea..76ff46b 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -165,6 +165,25 @@ static size_t __callchain__fprintf_graph(FILE *fp, struct rb_root *root, return ret; } +/* + * If have one single callchain root, don't bother printing + * its percentage (100 % in fractal mode and the same percentage + * than the hist in graph mode). This also avoid one level of column. + * + * However when percent-limit applied, it's possible that single callchain + * node have different (non-100% in fractal mode) percentage. + */ +static bool need_percent_display(struct rb_node *node, u64 parent_samples) +{ + struct callchain_node *cnode; + + if (rb_next(node)) + return true; + + cnode = rb_entry(node, struct callchain_node, rb_node); + return callchain_cumul_hits(cnode) != parent_samples; +} + static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root, u64 total_samples, u64 parent_samples, int left_margin) @@ -178,13 +197,8 @@ static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root, int ret = 0; char bf[1024]; - /* - * If have one single callchain root, don't bother printing - * its percentage (100 % in fractal mode and the same percentage - * than the hist in graph mode). This also avoid one level of column. - */ node = rb_first(root); - if (node && !rb_next(node)) { + if (node && !need_percent_display(node, parent_samples)) { cnode = rb_entry(node, struct callchain_node, rb_node); list_for_each_entry(chain, &cnode->val, list) { /* -- cgit v0.10.2 From 0c841c6c16f320704f75970bbe6a9800c53e6cf5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 28 Jan 2016 00:40:54 +0900 Subject: perf hists browser: Fix dump to show correct callchain style The commit 8c430a348699 ("perf hists browser: Support folded callchains") missed to update hist_browser__dump() so it always shows graph-style callchains regardless of current setting. To fix that, factor out callchain printing code and rename the existing function which prints graph-style callchain. 
Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Wang Nan Fixes: 8c430a348699 ("perf hists browser: Support folded callchains") Link: http://lkml.kernel.org/r/1453909257-26015-8-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 1da30f8..6b22baf 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -844,7 +844,7 @@ next: return row - first_row; } -static int hist_browser__show_callchain(struct hist_browser *browser, +static int hist_browser__show_callchain_graph(struct hist_browser *browser, struct rb_root *root, int level, unsigned short row, u64 total, print_callchain_entry_fn print, @@ -898,7 +898,7 @@ static int hist_browser__show_callchain(struct hist_browser *browser, else new_total = total; - row += hist_browser__show_callchain(browser, &child->rb_root, + row += hist_browser__show_callchain_graph(browser, &child->rb_root, new_level, row, new_total, print, arg, is_output_full); } @@ -910,6 +910,43 @@ out: return row - first_row; } +static int hist_browser__show_callchain(struct hist_browser *browser, + struct hist_entry *entry, int level, + unsigned short row, + print_callchain_entry_fn print, + struct callchain_print_arg *arg, + check_output_full_fn is_output_full) +{ + u64 total = hists__total_period(entry->hists); + int printed; + + if (callchain_param.mode == CHAIN_GRAPH_REL) { + if (symbol_conf.cumulate_callchain) + total = entry->stat_acc->period; + else + total = entry->stat.period; + } + + if (callchain_param.mode == CHAIN_FLAT) { + printed = hist_browser__show_callchain_flat(browser, + &entry->sorted_chain, row, total, + print, arg, is_output_full); + } else if (callchain_param.mode == CHAIN_FOLDED) { + printed = hist_browser__show_callchain_folded(browser, + &entry->sorted_chain, row, total, + print, arg, is_output_full); + } else { + printed = hist_browser__show_callchain_graph(browser, + &entry->sorted_chain, level, row, total, + print, arg, is_output_full); + } + + if (arg->is_current_entry) + browser->he_selection = entry; + + return printed; +} + struct hpp_arg { struct ui_browser *b; char folded_sign; @@ -1084,38 +1121,14 @@ static int hist_browser__show_entry(struct hist_browser *browser, --row_offset; if (folded_sign == '-' && row != browser->b.rows) { - u64 total = hists__total_period(entry->hists); struct callchain_print_arg arg = { .row_offset = row_offset, .is_current_entry = current_entry, }; - if (callchain_param.mode == CHAIN_GRAPH_REL) { - if (symbol_conf.cumulate_callchain) - total = entry->stat_acc->period; - else - total = entry->stat.period; - } - - if (callchain_param.mode == CHAIN_FLAT) { - printed += hist_browser__show_callchain_flat(browser, - &entry->sorted_chain, row, total, + printed += hist_browser__show_callchain(browser, entry, 1, row, hist_browser__show_callchain_entry, &arg, hist_browser__check_output_full); - } else if (callchain_param.mode == CHAIN_FOLDED) { - printed += hist_browser__show_callchain_folded(browser, - &entry->sorted_chain, row, total, - hist_browser__show_callchain_entry, &arg, - hist_browser__check_output_full); - } else { - printed += hist_browser__show_callchain(browser, - &entry->sorted_chain, 1, row, total, - hist_browser__show_callchain_entry, &arg, - hist_browser__check_output_full); - } - - if (arg.is_current_entry) - browser->he_selection = entry; } return printed; @@ -1380,15 +1393,11 @@ do_offset: static int 
hist_browser__fprintf_callchain(struct hist_browser *browser, struct hist_entry *he, FILE *fp) { - u64 total = hists__total_period(he->hists); struct callchain_print_arg arg = { .fp = fp, }; - if (symbol_conf.cumulate_callchain) - total = he->stat_acc->period; - - hist_browser__show_callchain(browser, &he->sorted_chain, 1, 0, total, + hist_browser__show_callchain(browser, he, 1, 0, hist_browser__fprintf_callchain_entry, &arg, hist_browser__check_dump_full); return arg.printed; -- cgit v0.10.2 From 5eca104eee7edfe7155523849750ced539b16e94 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 28 Jan 2016 00:40:55 +0900 Subject: perf hists browser: Pass parent_total to callchain print functions Pass parent node's total period to callchain print functions. This info is needed by later patch to determine whether it can omit percent or not correctly. No functional change intended. Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1453909257-26015-9-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 6b22baf..41dbb79 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -660,6 +660,7 @@ static int hist_browser__show_callchain_list(struct hist_browser *browser, static int hist_browser__show_callchain_flat(struct hist_browser *browser, struct rb_root *root, unsigned short row, u64 total, + u64 parent_total __maybe_unused, print_callchain_entry_fn print, struct callchain_print_arg *arg, check_output_full_fn is_output_full) @@ -763,6 +764,7 @@ static char *hist_browser__folded_callchain_str(struct hist_browser *browser, static int hist_browser__show_callchain_folded(struct hist_browser *browser, struct rb_root *root, unsigned short row, u64 total, + u64 parent_total __maybe_unused, print_callchain_entry_fn print, struct callchain_print_arg *arg, check_output_full_fn is_output_full) @@ -847,14 +849,18 @@ next: static int hist_browser__show_callchain_graph(struct hist_browser *browser, struct rb_root *root, int level, unsigned short row, u64 total, + u64 parent_total, print_callchain_entry_fn print, struct callchain_print_arg *arg, check_output_full_fn is_output_full) { struct rb_node *node; int first_row = row, offset = level * LEVEL_OFFSET_STEP; - u64 new_total; bool need_percent; + u64 percent_total = total; + + if (callchain_param.mode == CHAIN_GRAPH_REL) + percent_total = parent_total; node = rb_first(root); need_percent = node && rb_next(node); @@ -878,7 +884,7 @@ static int hist_browser__show_callchain_graph(struct hist_browser *browser, folded_sign = callchain_list__folded(chain); row += hist_browser__show_callchain_list(browser, child, - chain, row, total, + chain, row, percent_total, was_first && need_percent, offset + extra_offset, print, arg); @@ -893,13 +899,9 @@ static int hist_browser__show_callchain_graph(struct hist_browser *browser, if (folded_sign == '-') { const int new_level = level + (extra_offset ? 
2 : 1); - if (callchain_param.mode == CHAIN_GRAPH_REL) - new_total = child->children_hit; - else - new_total = total; - row += hist_browser__show_callchain_graph(browser, &child->rb_root, - new_level, row, new_total, + new_level, row, total, + child->children_hit, print, arg, is_output_full); } if (is_output_full(browser, row)) @@ -918,27 +920,29 @@ static int hist_browser__show_callchain(struct hist_browser *browser, check_output_full_fn is_output_full) { u64 total = hists__total_period(entry->hists); + u64 parent_total; int printed; - if (callchain_param.mode == CHAIN_GRAPH_REL) { - if (symbol_conf.cumulate_callchain) - total = entry->stat_acc->period; - else - total = entry->stat.period; - } + if (symbol_conf.cumulate_callchain) + parent_total = entry->stat_acc->period; + else + parent_total = entry->stat.period; if (callchain_param.mode == CHAIN_FLAT) { printed = hist_browser__show_callchain_flat(browser, - &entry->sorted_chain, row, total, - print, arg, is_output_full); + &entry->sorted_chain, row, + total, parent_total, print, arg, + is_output_full); } else if (callchain_param.mode == CHAIN_FOLDED) { printed = hist_browser__show_callchain_folded(browser, - &entry->sorted_chain, row, total, - print, arg, is_output_full); + &entry->sorted_chain, row, + total, parent_total, print, arg, + is_output_full); } else { printed = hist_browser__show_callchain_graph(browser, - &entry->sorted_chain, level, row, total, - print, arg, is_output_full); + &entry->sorted_chain, level, row, + total, parent_total, print, arg, + is_output_full); } if (arg->is_current_entry) -- cgit v0.10.2 From 59c624e2391080fa6315a376a4ee74d0eb393d1d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 28 Jan 2016 00:40:56 +0900 Subject: perf hists browser: Fix percent display in callchains When there's only a single callchain, perf doesn't print its percentage in front of the symbols. This is because it assumes that the percentage is same as parents. But if a percent limit is applied, it's possible that there are actually a couple of child nodes but only one of them is shown. In this case it should display the percent to prevent misunderstanding of its percentage is same as the parent's. For example, let's see the following callchain. $ perf report --no-children --percent-limit 0.01 --tui ... - 0.06% sleep [kernel.vmlinux] [k] kmem_cache_alloc_trace kmem_cache_alloc_trace - perf_event_mmap - 0.04% mmap_region do_mmap_pgoff - vm_mmap_pgoff + 0.02% sys_mmap_pgoff + 0.02% vm_mmap + 0.02% mprotect_fixup Current code omits the percent if 'mmap_region' becomes the only node when percent limit is set to 0.03%, its percent is not 0.06% but users will assume it incorrectly. Before: $ perf report --no-children --percent-limit 0.03 --tui ... 0.06% sleep [kernel.vmlinux] [k] kmem_cache_alloc_trace kmem_cache_alloc_trace - perf_event_mmap - mmap_region do_mmap_pgoff vm_mmap_pgoff After: $ perf report --no-children --percent-limit 0.03 --tui ... 
0.06% sleep [kernel.vmlinux] [k] kmem_cache_alloc_trace kmem_cache_alloc_trace - perf_event_mmap - 0.04% mmap_region do_mmap_pgoff vm_mmap_pgoff Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1453909257-26015-10-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 41dbb79..61d578b 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -657,10 +657,24 @@ static int hist_browser__show_callchain_list(struct hist_browser *browser, return 1; } +static bool check_percent_display(struct rb_node *node, u64 parent_total) +{ + struct callchain_node *child; + + if (node == NULL) + return false; + + if (rb_next(node)) + return true; + + child = rb_entry(node, struct callchain_node, rb_node); + return callchain_cumul_hits(child) != parent_total; +} + static int hist_browser__show_callchain_flat(struct hist_browser *browser, struct rb_root *root, unsigned short row, u64 total, - u64 parent_total __maybe_unused, + u64 parent_total, print_callchain_entry_fn print, struct callchain_print_arg *arg, check_output_full_fn is_output_full) @@ -670,7 +684,7 @@ static int hist_browser__show_callchain_flat(struct hist_browser *browser, bool need_percent; node = rb_first(root); - need_percent = node && rb_next(node); + need_percent = check_percent_display(node, parent_total); while (node) { struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node); @@ -764,7 +778,7 @@ static char *hist_browser__folded_callchain_str(struct hist_browser *browser, static int hist_browser__show_callchain_folded(struct hist_browser *browser, struct rb_root *root, unsigned short row, u64 total, - u64 parent_total __maybe_unused, + u64 parent_total, print_callchain_entry_fn print, struct callchain_print_arg *arg, check_output_full_fn is_output_full) @@ -774,7 +788,7 @@ static int hist_browser__show_callchain_folded(struct hist_browser *browser, bool need_percent; node = rb_first(root); - need_percent = node && rb_next(node); + need_percent = check_percent_display(node, parent_total); while (node) { struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node); @@ -863,7 +877,7 @@ static int hist_browser__show_callchain_graph(struct hist_browser *browser, percent_total = parent_total; node = rb_first(root); - need_percent = node && rb_next(node); + need_percent = check_percent_display(node, parent_total); while (node) { struct callchain_node *child = rb_entry(node, struct callchain_node, rb_node); -- cgit v0.10.2 From 3848c23b19e07188bfa15e3d9a2ac27692f2ff3c Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 28 Jan 2016 21:24:54 +0900 Subject: perf report: Don't show blank lines if entry has no callchain When all callchains of a hist entry is percent-limited, do not add a blank line at the end. It makes the entry look like it doesn't have callchains. 
Reported-and-Tested-by: Jiri Olsa Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/20160128122454.GA27446@danjae.kornet Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 76ff46b..691e52c 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -233,7 +233,10 @@ static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root, ret += __callchain__fprintf_graph(fp, root, total_samples, 1, 1, left_margin); - ret += fprintf(fp, "\n"); + if (ret) { + /* do not add a blank line if it printed nothing */ + ret += fprintf(fp, "\n"); + } return ret; } -- cgit v0.10.2 From f84a93726e0c0c53b5b6dbee96623e7e53e06dd8 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 29 Jan 2016 11:35:12 +0300 Subject: mac80211: minstrel_ht: fix out-of-bound in minstrel_ht_set_best_prob_rate Patch fixes this splat BUG: KASAN: slab-out-of-bounds in minstrel_ht_update_stats.isra.7+0x6e1/0x9e0 [mac80211] at addr ffff8800cee640f4 Read of size 4 by task swapper/3/0 Signed-off-by: Konstantin Khlebnikov Link: http://lkml.kernel.org/r/CALYGNiNyJhSaVnE35qS6UCGaSb2Dx1_i5HcRavuOX14oTz2P+w@mail.gmail.com Acked-by: Felix Fietkau Signed-off-by: Johannes Berg diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 3928dbd..93bf2b7 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -414,15 +414,16 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) (max_tp_group != MINSTREL_CCK_GROUP)) return; + max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES; + max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; + max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma; + if (mrs->prob_ewma > MINSTREL_FRAC(75, 100)) { cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, mrs->prob_ewma); if (cur_tp_avg > tmp_tp_avg) mi->max_prob_rate = index; - max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES; - max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES; - max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma; max_gpr_tp_avg = minstrel_ht_get_tp_avg(mi, max_gpr_group, max_gpr_idx, max_gpr_prob); @@ -431,7 +432,7 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index) } else { if (mrs->prob_ewma > tmp_prob) mi->max_prob_rate = index; - if (mrs->prob_ewma > mg->rates[mg->max_group_prob_rate].prob_ewma) + if (mrs->prob_ewma > max_gpr_prob) mg->max_group_prob_rate = index; } } -- cgit v0.10.2 From 212c5a5e6ba61678be6b5fee576e38bccb50b613 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Tue, 2 Feb 2016 08:12:26 +0100 Subject: mac80211: minstrel: Change expected throughput unit back to Kbps The change from cur_tp to the function minstrel_get_tp_avg/minstrel_ht_get_tp_avg changed the unit used for the current throughput. For example in minstrel_ht the correct conversion between them would be: mrs->cur_tp / 10 == minstrel_ht_get_tp_avg(..). This factor 10 must also be included in the calculation of minstrel_get_expected_throughput and minstrel_ht_get_expected_throughput to return values with the unit [Kbps] instead of [10Kbps]. Otherwise routing algorithms like B.A.T.M.A.N. V will make incorrect decision based on these values. Its kernel based implementation expects expected_throughput always to have the unit [Kbps] and not sometimes [10Kbps] and sometimes [Kbps]. 
The same requirement has iw or olsrdv2's nl80211 based statistics module which retrieve the same data via NL80211_STA_INFO_TX_BITRATE. Cc: stable@vger.kernel.org Fixes: 6a27b2c40b48 ("mac80211: restructure per-rate throughput calculation into function") Signed-off-by: Sven Eckelmann Signed-off-by: Johannes Berg diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index 3ece7d1..b54f398 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -711,7 +711,7 @@ static u32 minstrel_get_expected_throughput(void *priv_sta) * computing cur_tp */ tmp_mrs = &mi->r[idx].stats; - tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma); + tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma) * 10; tmp_cur_tp = tmp_cur_tp * 1200 * 8 / 1024; return tmp_cur_tp; diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 93bf2b7..702328b 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1335,7 +1335,8 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta) prob = mi->groups[i].rates[j].prob_ewma; /* convert tp_avg from pkt per second in kbps */ - tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * AVG_PKT_SIZE * 8 / 1024; + tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * 10; + tp_avg = tp_avg * AVG_PKT_SIZE * 8 / 1024; return tp_avg; } -- cgit v0.10.2 From 1b74dde7c47c19a73ea3e9fac95ac27b5d3d50c5 Mon Sep 17 00:00:00 2001 From: Chen Yucong Date: Tue, 2 Feb 2016 11:45:02 +0800 Subject: x86/cpu: Convert printk(KERN_ ...) to pr_(...) - Use the more current logging style pr_(...) instead of the old printk(KERN_ ...). - Convert pr_warning() to pr_warn(). Signed-off-by: Chen Yucong Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454384702-21707-1-git-send-email-slaoub@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a07956a..97c59fd 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -117,7 +117,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) void (*f_vide)(void); u64 d, d2; - printk(KERN_INFO "AMD K6 stepping B detected - "); + pr_info("AMD K6 stepping B detected - "); /* * It looks like AMD fixed the 2.6.2 bug and improved indirect @@ -133,10 +133,9 @@ static void init_amd_k6(struct cpuinfo_x86 *c) d = d2-d; if (d > 20*K6_BUG_LOOP) - printk(KERN_CONT - "system stability may be impaired when more than 32 MB are used.\n"); + pr_cont("system stability may be impaired when more than 32 MB are used.\n"); else - printk(KERN_CONT "probably OK (after B9730xxxx).\n"); + pr_cont("probably OK (after B9730xxxx).\n"); } /* K6 with old style WHCR */ @@ -154,7 +153,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) wbinvd(); wrmsr(MSR_K6_WHCR, l, h); local_irq_restore(flags); - printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n", + pr_info("Enabling old style K6 write allocation for %d Mb\n", mbytes); } return; @@ -175,7 +174,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) wbinvd(); wrmsr(MSR_K6_WHCR, l, h); local_irq_restore(flags); - printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n", + pr_info("Enabling new style K6 write allocation for %d Mb\n", mbytes); } @@ -202,7 +201,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c) */ if (c->x86_model >= 6 && c->x86_model <= 10) { if (!cpu_has(c, X86_FEATURE_XMM)) { - printk(KERN_INFO "Enabling disabled K7/SSE 
Support.\n"); + pr_info("Enabling disabled K7/SSE Support.\n"); msr_clear_bit(MSR_K7_HWCR, 15); set_cpu_cap(c, X86_FEATURE_XMM); } @@ -216,9 +215,8 @@ static void init_amd_k7(struct cpuinfo_x86 *c) if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { rdmsr(MSR_K7_CLK_CTL, l, h); if ((l & 0xfff00000) != 0x20000000) { - printk(KERN_INFO - "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", - l, ((l & 0x000fffff)|0x20000000)); + pr_info("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", + l, ((l & 0x000fffff)|0x20000000)); wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); } } @@ -485,7 +483,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { unsigned long pfn = tseg >> PAGE_SHIFT; - printk(KERN_DEBUG "tseg: %010llx\n", tseg); + pr_debug("tseg: %010llx\n", tseg); if (pfn_range_is_mapped(pfn, pfn + 1)) set_memory_4k((unsigned long)__va(tseg), 1); } @@ -500,8 +498,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) rdmsrl(MSR_K7_HWCR, val); if (!(val & BIT(24))) - printk(KERN_WARNING FW_BUG "TSC doesn't count " - "with P0 frequency!\n"); + pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n"); } } diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c index 04f0fe5..a972ac4 100644 --- a/arch/x86/kernel/cpu/bugs_64.c +++ b/arch/x86/kernel/cpu/bugs_64.c @@ -15,7 +15,7 @@ void __init check_bugs(void) { identify_boot_cpu(); #if !defined(CONFIG_SMP) - printk(KERN_INFO "CPU: "); + pr_info("CPU: "); print_cpu_info(&boot_cpu_data); #endif alternative_instructions(); diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index ae20be6..ce197bb 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -29,7 +29,7 @@ static void init_c3(struct cpuinfo_x86 *c) rdmsr(MSR_VIA_FCR, lo, hi); lo |= ACE_FCR; /* enable ACE unit */ wrmsr(MSR_VIA_FCR, lo, hi); - printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n"); + pr_info("CPU: Enabled ACE h/w crypto\n"); } /* enable RNG unit, if present and disabled */ @@ -37,7 +37,7 @@ static void init_c3(struct cpuinfo_x86 *c) rdmsr(MSR_VIA_RNG, lo, hi); lo |= RNG_ENABLE; /* enable RNG unit */ wrmsr(MSR_VIA_RNG, lo, hi); - printk(KERN_INFO "CPU: Enabled h/w RNG\n"); + pr_info("CPU: Enabled h/w RNG\n"); } /* store Centaur Extended Feature Flags as @@ -130,7 +130,7 @@ static void init_centaur(struct cpuinfo_x86 *c) name = "C6"; fcr_set = ECX8|DSMC|EDCTLB|EMMX|ERETSTK; fcr_clr = DPDC; - printk(KERN_NOTICE "Disabling bugged TSC.\n"); + pr_notice("Disabling bugged TSC.\n"); clear_cpu_cap(c, X86_FEATURE_TSC); break; case 8: @@ -163,11 +163,11 @@ static void init_centaur(struct cpuinfo_x86 *c) newlo = (lo|fcr_set) & (~fcr_clr); if (newlo != lo) { - printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n", + pr_info("Centaur FCR was 0x%X now 0x%X\n", lo, newlo); wrmsr(MSR_IDT_FCR1, newlo, hi); } else { - printk(KERN_INFO "Centaur FCR is 0x%X\n", lo); + pr_info("Centaur FCR is 0x%X\n", lo); } /* Emulate MTRRs using Centaur's MCR. 
*/ set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 37830de..68a80e9 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -228,7 +228,7 @@ static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) lo |= 0x200000; wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - printk(KERN_NOTICE "CPU serial number disabled.\n"); + pr_notice("CPU serial number disabled.\n"); clear_cpu_cap(c, X86_FEATURE_PN); /* Disabling the serial number may affect the cpuid level */ @@ -329,9 +329,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) if (!warn) continue; - printk(KERN_WARNING - "CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n", - x86_cap_flag(df->feature), df->level); + pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n", + x86_cap_flag(df->feature), df->level); } } @@ -510,7 +509,7 @@ void detect_ht(struct cpuinfo_x86 *c) smp_num_siblings = (ebx & 0xff0000) >> 16; if (smp_num_siblings == 1) { - printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n"); + pr_info_once("CPU0: Hyper-Threading is disabled\n"); goto out; } @@ -531,10 +530,10 @@ void detect_ht(struct cpuinfo_x86 *c) out: if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); + pr_info("CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + pr_info("CPU: Processor Core ID: %d\n", + c->cpu_core_id); printed = 1; } #endif @@ -559,9 +558,8 @@ static void get_cpu_vendor(struct cpuinfo_x86 *c) } } - printk_once(KERN_ERR - "CPU: vendor_id '%s' unknown, using generic init.\n" \ - "CPU: Your system may be unstable.\n", v); + pr_err_once("CPU: vendor_id '%s' unknown, using generic init.\n" \ + "CPU: Your system may be unstable.\n", v); c->x86_vendor = X86_VENDOR_UNKNOWN; this_cpu = &default_cpu; @@ -760,7 +758,7 @@ void __init early_cpu_init(void) int count = 0; #ifdef CONFIG_PROCESSOR_SELECT - printk(KERN_INFO "KERNEL supported cpus:\n"); + pr_info("KERNEL supported cpus:\n"); #endif for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { @@ -778,7 +776,7 @@ void __init early_cpu_init(void) for (j = 0; j < 2; j++) { if (!cpudev->c_ident[j]) continue; - printk(KERN_INFO " %s %s\n", cpudev->c_vendor, + pr_info(" %s %s\n", cpudev->c_vendor, cpudev->c_ident[j]); } } @@ -1061,7 +1059,7 @@ static void __print_cpu_msr(void) for (index = index_min; index < index_max; index++) { if (rdmsrl_safe(index, &val)) continue; - printk(KERN_INFO " MSR%08x: %016llx\n", index, val); + pr_info(" MSR%08x: %016llx\n", index, val); } } } @@ -1100,19 +1098,19 @@ void print_cpu_info(struct cpuinfo_x86 *c) } if (vendor && !strstr(c->x86_model_id, vendor)) - printk(KERN_CONT "%s ", vendor); + pr_cont("%s ", vendor); if (c->x86_model_id[0]) - printk(KERN_CONT "%s", c->x86_model_id); + pr_cont("%s", c->x86_model_id); else - printk(KERN_CONT "%d86", c->x86); + pr_cont("%d86", c->x86); - printk(KERN_CONT " (family: 0x%x, model: 0x%x", c->x86, c->x86_model); + pr_cont(" (family: 0x%x, model: 0x%x", c->x86, c->x86_model); if (c->x86_mask || c->cpuid_level >= 0) - printk(KERN_CONT ", stepping: 0x%x)\n", c->x86_mask); + pr_cont(", stepping: 0x%x)\n", c->x86_mask); else - printk(KERN_CONT ")\n"); + pr_cont(")\n"); print_cpu_msr(c); } @@ -1438,7 +1436,7 @@ void cpu_init(void) show_ucode_info_early(); - printk(KERN_INFO "Initializing CPU#%d\n", cpu); + 
pr_info("Initializing CPU#%d\n", cpu); if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index aaf152e..187bb58 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -103,7 +103,7 @@ static void check_cx686_slop(struct cpuinfo_x86 *c) local_irq_restore(flags); if (ccr5 & 2) { /* possible wrong calibration done */ - printk(KERN_INFO "Recalibrating delay loop with SLOP bit reset\n"); + pr_info("Recalibrating delay loop with SLOP bit reset\n"); calibrate_delay(); c->loops_per_jiffy = loops_per_jiffy; } @@ -115,7 +115,7 @@ static void set_cx86_reorder(void) { u8 ccr3; - printk(KERN_INFO "Enable Memory access reorder on Cyrix/NSC processor.\n"); + pr_info("Enable Memory access reorder on Cyrix/NSC processor.\n"); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ @@ -128,7 +128,7 @@ static void set_cx86_reorder(void) static void set_cx86_memwb(void) { - printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); + pr_info("Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); /* CCR2 bit 2: unlock NW bit */ setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04); @@ -268,7 +268,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) * VSA1 we work around however. */ - printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n"); + pr_info("Working around Cyrix MediaGX virtual DMA bugs.\n"); isa_dma_bridge_buggy = 2; /* We do this before the PCI layer is running. However we @@ -426,7 +426,7 @@ static void cyrix_identify(struct cpuinfo_x86 *c) if (dir0 == 5 || dir0 == 3) { unsigned char ccr3; unsigned long flags; - printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); + pr_info("Enabling CPUID on Cyrix processor.\n"); local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); /* enable MAPEN */ diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index d820d8e..73d391a 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -56,7 +56,7 @@ detect_hypervisor_vendor(void) } if (max_pri) - printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name); + pr_info("Hypervisor detected: %s\n", x86_hyper->name); } void init_hypervisor(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 565648b..05b9211 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -61,7 +61,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) */ if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 && c->microcode < 0x20e) { - printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); + pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n"); clear_cpu_cap(c, X86_FEATURE_PSE); } @@ -140,7 +140,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) { - printk(KERN_INFO "Disabled fast string operations\n"); + pr_info("Disabled fast string operations\n"); setup_clear_cpu_cap(X86_FEATURE_REP_GOOD); setup_clear_cpu_cap(X86_FEATURE_ERMS); } @@ -176,7 +176,7 @@ int ppro_with_ram_bug(void) boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_mask < 8) { - printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n"); + pr_info("Pentium Pro with Errata#50 detected. 
Taking evasive action.\n"); return 1; } return 0; @@ -225,7 +225,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_F00F); if (!f00f_workaround_enabled) { - printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); + pr_notice("Intel Pentium with F0 0F bug - workaround enabled.\n"); f00f_workaround_enabled = 1; } } @@ -244,7 +244,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * Forcefully enable PAE if kernel parameter "forcepae" is present. */ if (forcepae) { - printk(KERN_WARNING "PAE forced!\n"); + pr_warn("PAE forced!\n"); set_cpu_cap(c, X86_FEATURE_PAE); add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); } diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 0b6c523..6ed779e 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -444,7 +444,7 @@ static ssize_t store_cache_disable(struct cacheinfo *this_leaf, err = amd_set_l3_disable_slot(nb, cpu, slot, val); if (err) { if (err == -EEXIST) - pr_warning("L3 slot %d in use/index already disabled!\n", + pr_warn("L3 slot %d in use/index already disabled!\n", slot); return err; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 4cfba43..517619e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -115,7 +115,7 @@ static int raise_local(void) int cpu = m->extcpu; if (m->inject_flags & MCJ_EXCEPTION) { - printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); + pr_info("Triggering MCE exception on CPU %d\n", cpu); switch (context) { case MCJ_CTX_IRQ: /* @@ -128,15 +128,15 @@ static int raise_local(void) raise_exception(m, NULL); break; default: - printk(KERN_INFO "Invalid MCE context\n"); + pr_info("Invalid MCE context\n"); ret = -EINVAL; } - printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); + pr_info("MCE exception done on CPU %d\n", cpu); } else if (m->status) { - printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); + pr_info("Starting machine check poll CPU %d\n", cpu); raise_poll(m); mce_notify_irq(); - printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); + pr_info("Machine check poll done on CPU %d\n", cpu); } else m->finished = 0; @@ -183,8 +183,7 @@ static void raise_mce(struct mce *m) start = jiffies; while (!cpumask_empty(mce_inject_cpumask)) { if (!time_before(jiffies, start + 2*HZ)) { - printk(KERN_ERR - "Timeout waiting for mce inject %lx\n", + pr_err("Timeout waiting for mce inject %lx\n", *cpumask_bits(mce_inject_cpumask)); break; } @@ -241,7 +240,7 @@ static int inject_init(void) { if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; - printk(KERN_INFO "Machine check injector initialized\n"); + pr_info("Machine check injector initialized\n"); register_mce_write_callback(mce_write); register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 12402e1..2a0717b 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -26,14 +26,12 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); - printk(KERN_EMERG - "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", - smp_processor_id(), loaddr, lotype); + pr_emerg("CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", + smp_processor_id(), loaddr, lotype); if 
(lotype & (1<<5)) { - printk(KERN_EMERG - "CPU#%d: Possible thermal failure (CPU on fire ?).\n", - smp_processor_id()); + pr_emerg("CPU#%d: Possible thermal failure (CPU on fire ?).\n", + smp_processor_id()); } add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); @@ -61,12 +59,10 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) /* Read registers before enabling: */ rdmsr(MSR_IA32_P5_MC_ADDR, l, h); rdmsr(MSR_IA32_P5_MC_TYPE, l, h); - printk(KERN_INFO - "Intel old style machine check architecture supported.\n"); + pr_info("Intel old style machine check architecture supported.\n"); /* Enable MCE: */ cr4_set_bits(X86_CR4_MCE); - printk(KERN_INFO - "Intel old style machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); + pr_info("Intel old style machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 2c5aaf8..0b445c2 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -190,7 +190,7 @@ static int therm_throt_process(bool new_event, int event, int level) /* if we just entered the thermal event */ if (new_event) { if (event == THERMAL_THROTTLING_EVENT) - printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", + pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, level == CORE_LEVEL ? "Core" : "Package", state->count); @@ -198,8 +198,7 @@ static int therm_throt_process(bool new_event, int event, int level) } if (old_event) { if (event == THERMAL_THROTTLING_EVENT) - printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", - this_cpu, + pr_info("CPU%d: %s temperature/speed normal\n", this_cpu, level == CORE_LEVEL ? "Core" : "Package"); return 1; } @@ -417,8 +416,8 @@ static void intel_thermal_interrupt(void) static void unexpected_thermal_interrupt(void) { - printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", - smp_processor_id()); + pr_err("CPU%d: Unexpected LVT thermal interrupt!\n", + smp_processor_id()); } static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; @@ -499,7 +498,7 @@ void intel_init_thermal(struct cpuinfo_x86 *c) if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { if (system_state == SYSTEM_BOOTING) - printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); + pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu); return; } @@ -557,8 +556,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c) l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n", - tm2 ? "TM2" : "TM1"); + pr_info_once("CPU0: Thermal monitoring enabled (%s)\n", + tm2 ? 
"TM2" : "TM1"); /* enable thermal throttle processing */ atomic_set(&therm_throt_en, 1); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index 7245980..fcf9ae9 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -12,8 +12,8 @@ static void default_threshold_interrupt(void) { - printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n", - THRESHOLD_APIC_VECTOR); + pr_err("Unexpected threshold interrupt at vector %x\n", + THRESHOLD_APIC_VECTOR); } void (*mce_threshold_vector)(void) = default_threshold_interrupt; diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 01dd870..c6a722e 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -17,7 +17,7 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code) { ist_enter(regs); - printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); + pr_emerg("CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); ist_exit(regs); @@ -39,6 +39,5 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c) cr4_set_bits(X86_CR4_MCE); - printk(KERN_INFO - "Winchip machine check reporting enabled on CPU#0.\n"); + pr_info("Winchip machine check reporting enabled on CPU#0.\n"); } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 2233f8a..75d3aab 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -953,7 +953,7 @@ struct microcode_ops * __init init_amd_microcode(void) struct cpuinfo_x86 *c = &boot_cpu_data; if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - pr_warning("AMD CPU family 0x%x not supported\n", c->x86); + pr_warn("AMD CPU family 0x%x not supported\n", c->x86); return NULL; } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 20e242e..4e7c693 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -161,8 +161,8 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); - printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", - ms_hyperv.features, ms_hyperv.hints); + pr_info("HyperV: features 0x%x, hints 0x%x\n", + ms_hyperv.features, ms_hyperv.hints); #ifdef CONFIG_X86_LOCAL_APIC if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) { @@ -174,8 +174,8 @@ static void __init ms_hyperv_init_platform(void) rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); lapic_timer_frequency = hv_lapic_frequency; - printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n", - lapic_timer_frequency); + pr_info("HyperV: LAPIC Timer Frequency: %#x\n", + lapic_timer_frequency); } #endif diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index 316fe3e..3d68993 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -103,7 +103,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t */ if (type != MTRR_TYPE_WRCOMB && (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) { - pr_warning("mtrr: only write-combining%s supported\n", + pr_warn("mtrr: only write-combining%s supported\n", centaur_mcr_type ? 
" and uncacheable are" : " is"); return -EINVAL; } diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 0d98503..31e951c 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -57,9 +57,9 @@ static int __initdata nr_range; static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; static int __initdata debug_print; -#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) +#define Dprintk(x...) do { if (debug_print) pr_debug(x); } while (0) -#define BIOS_BUG_MSG KERN_WARNING \ +#define BIOS_BUG_MSG \ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" static int __init @@ -81,9 +81,9 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, base, base + size); } if (debug_print) { - printk(KERN_DEBUG "After WB checking\n"); + pr_debug("After WB checking\n"); for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + pr_debug("MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end); } @@ -101,7 +101,7 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) { /* Var MTRR contains UC entry below 1M? Skip it: */ - printk(BIOS_BUG_MSG, i); + pr_warn(BIOS_BUG_MSG, i); if (base + size <= (1<<(20-PAGE_SHIFT))) continue; size -= (1<<(20-PAGE_SHIFT)) - base; @@ -114,11 +114,11 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, extra_remove_base + extra_remove_size); if (debug_print) { - printk(KERN_DEBUG "After UC checking\n"); + pr_debug("After UC checking\n"); for (i = 0; i < RANGE_NUM; i++) { if (!range[i].end) continue; - printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + pr_debug("MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end); } } @@ -126,9 +126,9 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, /* sort the ranges */ nr_range = clean_sort_range(range, RANGE_NUM); if (debug_print) { - printk(KERN_DEBUG "After sorting\n"); + pr_debug("After sorting\n"); for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + pr_debug("MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end); } @@ -544,7 +544,7 @@ static void __init print_out_mtrr_range_state(void) start_base = to_size_factor(start_base, &start_factor), type = range_state[i].type; - printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", + pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n", i, start_base, start_factor, size_base, size_factor, (type == MTRR_TYPE_UNCACHABLE) ? 
"UC" : @@ -713,7 +713,7 @@ int __init mtrr_cleanup(unsigned address_bits) return 0; /* Print original var MTRRs at first, for debugging: */ - printk(KERN_DEBUG "original variable MTRRs\n"); + pr_debug("original variable MTRRs\n"); print_out_mtrr_range_state(); memset(range, 0, sizeof(range)); @@ -733,7 +733,7 @@ int __init mtrr_cleanup(unsigned address_bits) x_remove_base, x_remove_size); range_sums = sum_ranges(range, nr_range); - printk(KERN_INFO "total RAM covered: %ldM\n", + pr_info("total RAM covered: %ldM\n", range_sums >> (20 - PAGE_SHIFT)); if (mtrr_chunk_size && mtrr_gran_size) { @@ -745,12 +745,11 @@ int __init mtrr_cleanup(unsigned address_bits) if (!result[i].bad) { set_var_mtrr_all(address_bits); - printk(KERN_DEBUG "New variable MTRRs\n"); + pr_debug("New variable MTRRs\n"); print_out_mtrr_range_state(); return 1; } - printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " - "will find optimal one\n"); + pr_info("invalid mtrr_gran_size or mtrr_chunk_size, will find optimal one\n"); } i = 0; @@ -768,7 +767,7 @@ int __init mtrr_cleanup(unsigned address_bits) x_remove_base, x_remove_size, i); if (debug_print) { mtrr_print_out_one_result(i); - printk(KERN_INFO "\n"); + pr_info("\n"); } i++; @@ -779,7 +778,7 @@ int __init mtrr_cleanup(unsigned address_bits) index_good = mtrr_search_optimal_index(); if (index_good != -1) { - printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); + pr_info("Found optimal setting for mtrr clean up\n"); i = index_good; mtrr_print_out_one_result(i); @@ -790,7 +789,7 @@ int __init mtrr_cleanup(unsigned address_bits) gran_size <<= 10; x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); set_var_mtrr_all(address_bits); - printk(KERN_DEBUG "New variable MTRRs\n"); + pr_debug("New variable MTRRs\n"); print_out_mtrr_range_state(); return 1; } else { @@ -799,8 +798,8 @@ int __init mtrr_cleanup(unsigned address_bits) mtrr_print_out_one_result(i); } - printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); - printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n"); + pr_info("mtrr_cleanup: can not find optimal value\n"); + pr_info("please specify mtrr_gran_size/mtrr_chunk_size\n"); return 0; } @@ -918,7 +917,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* kvm/qemu doesn't have mtrr set right, don't trim them all: */ if (!highest_pfn) { - printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); + pr_info("CPU MTRRs all blank - virtualized system.\n"); return 0; } @@ -973,7 +972,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) end_pfn); if (total_trim_size) { - pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20); + pr_warn("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", + total_trim_size >> 20); if (!changed_by_mtrr_cleanup) WARN_ON(1); diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index c870af1..fcbcb2f 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -55,7 +55,7 @@ static inline void k8_check_syscfg_dram_mod_en(void) rdmsr(MSR_K8_SYSCFG, lo, hi); if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { - printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" + pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" " not cleared by BIOS, clearing this bit\n", smp_processor_id()); lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY; @@ -501,14 +501,14 @@ void __init mtrr_state_warn(void) if (!mask) return; if (mask & 
MTRR_CHANGE_MASK_FIXED) - pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); + pr_warn("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_VARIABLE) - pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n"); + pr_warn("mtrr: your CPUs had inconsistent variable MTRR settings\n"); if (mask & MTRR_CHANGE_MASK_DEFTYPE) - pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); + pr_warn("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); - printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n"); - printk(KERN_INFO "mtrr: corrected configuration.\n"); + pr_info("mtrr: probably your BIOS does not setup all CPUs.\n"); + pr_info("mtrr: corrected configuration.\n"); } /* @@ -519,8 +519,7 @@ void __init mtrr_state_warn(void) void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) { if (wrmsr_safe(msr, a, b) < 0) { - printk(KERN_ERR - "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", + pr_err("MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", smp_processor_id(), msr, a, b); } } @@ -607,7 +606,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, tmp |= ~((1ULL<<(hi - 1)) - 1); if (tmp != mask) { - printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); + pr_warn("mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); mask = tmp; } @@ -858,13 +857,13 @@ int generic_validate_add_page(unsigned long base, unsigned long size, boot_cpu_data.x86_model == 1 && boot_cpu_data.x86_mask <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { - pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); + pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); return -EINVAL; } if (!(base + size < 0x70000 || base > 0x7003F) && (type == MTRR_TYPE_WRCOMB || type == MTRR_TYPE_WRBACK)) { - pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); + pr_warn("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); return -EINVAL; } } @@ -878,7 +877,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, lbase = lbase >> 1, last = last >> 1) ; if (lbase != last) { - pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size); + pr_warn("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size); return -EINVAL; } return 0; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 5c3d149..ba80d68 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -300,24 +300,24 @@ int mtrr_add_page(unsigned long base, unsigned long size, return error; if (type >= MTRR_NUM_TYPES) { - pr_warning("mtrr: type: %u invalid\n", type); + pr_warn("mtrr: type: %u invalid\n", type); return -EINVAL; } /* If the type is WC, check that this processor supports it */ if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { - pr_warning("mtrr: your processor doesn't support write-combining\n"); + pr_warn("mtrr: your processor doesn't support write-combining\n"); return -ENOSYS; } if (!size) { - pr_warning("mtrr: zero sized request\n"); + pr_warn("mtrr: zero sized request\n"); return -EINVAL; } if ((base | (base + size - 1)) >> (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) { - pr_warning("mtrr: base or size exceeds the MTRR width\n"); + pr_warn("mtrr: base or size exceeds the MTRR width\n"); return -EINVAL; } @@ -348,7 +348,7 @@ int 
mtrr_add_page(unsigned long base, unsigned long size, } else if (types_compatible(type, ltype)) continue; } - pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing" + pr_warn("mtrr: 0x%lx000,0x%lx000 overlaps existing" " 0x%lx000,0x%lx000\n", base, size, lbase, lsize); goto out; @@ -357,7 +357,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, if (ltype != type) { if (types_compatible(type, ltype)) continue; - pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", + pr_warn("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", base, size, mtrr_attrib_to_str(ltype), mtrr_attrib_to_str(type)); goto out; @@ -395,7 +395,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, static int mtrr_check(unsigned long base, unsigned long size) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { - pr_warning("mtrr: size and base must be multiples of 4 kiB\n"); + pr_warn("mtrr: size and base must be multiples of 4 kiB\n"); pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base); dump_stack(); return -1; @@ -493,16 +493,16 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) } } if (reg >= max) { - pr_warning("mtrr: register: %d too big\n", reg); + pr_warn("mtrr: register: %d too big\n", reg); goto out; } mtrr_if->get(reg, &lbase, &lsize, <ype); if (lsize < 1) { - pr_warning("mtrr: MTRR %d not used\n", reg); + pr_warn("mtrr: MTRR %d not used\n", reg); goto out; } if (mtrr_usage_table[reg] < 1) { - pr_warning("mtrr: reg: %d has count=0\n", reg); + pr_warn("mtrr: reg: %d has count=0\n", reg); goto out; } if (--mtrr_usage_table[reg] < 1) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1b443db..7402c818 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -254,15 +254,16 @@ static bool check_hw_exists(void) * We still allow the PMU driver to operate: */ if (bios_fail) { - printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n"); - printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail); + pr_cont("Broken BIOS detected, complain to your hardware vendor.\n"); + pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", + reg_fail, val_fail); } return true; msr_fail: - printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); - printk("%sFailed to access perfctr msr (MSR %x is %Lx)\n", + pr_cont("Broken PMU hardware detected, using software events only.\n"); + pr_info("%sFailed to access perfctr msr (MSR %x is %Lx)\n", boot_cpu_has(X86_FEATURE_HYPERVISOR) ? 
KERN_INFO : KERN_ERR, reg, val_new); diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 989d3c2..aa12f95 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -670,7 +670,7 @@ static __init int perf_event_ibs_init(void) perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); - printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); + pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps); return 0; } @@ -774,14 +774,14 @@ static int setup_ibs_ctl(int ibs_eilvt_off) pci_read_config_dword(cpu_cfg, IBSCTL, &value); if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { pci_dev_put(cpu_cfg); - printk(KERN_DEBUG "Failed to setup IBS LVT offset, " - "IBSCTL = 0x%08x\n", value); + pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n", + value); return -EINVAL; } } while (1); if (!nodes) { - printk(KERN_DEBUG "No CPU node configured for IBS\n"); + pr_debug("No CPU node configured for IBS\n"); return -ENODEV; } @@ -810,7 +810,7 @@ static void force_ibs_eilvt_setup(void) preempt_enable(); if (offset == APIC_EILVT_NR_MAX) { - printk(KERN_DEBUG "No EILVT entry available\n"); + pr_debug("No EILVT entry available\n"); return; } diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c index 4974274..19a1736 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -536,7 +536,7 @@ static int __init amd_uncore_init(void) if (ret) goto fail_nb; - printk(KERN_INFO "perf: AMD NB counters detected\n"); + pr_info("perf: AMD NB counters detected\n"); ret = 0; } @@ -550,7 +550,7 @@ static int __init amd_uncore_init(void) if (ret) goto fail_l2; - printk(KERN_INFO "perf: AMD L2I counters detected\n"); + pr_info("perf: AMD L2I counters detected\n"); ret = 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 10602f0..7c79261 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -1325,13 +1325,13 @@ void __init intel_ds_init(void) switch (format) { case 0: - printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); + pr_cont("PEBS fmt0%c, ", pebs_type); x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; break; case 1: - printk(KERN_CONT "PEBS fmt1%c, ", pebs_type); + pr_cont("PEBS fmt1%c, ", pebs_type); x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; break; @@ -1351,7 +1351,7 @@ void __init intel_ds_init(void) break; default: - printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); + pr_cont("no PEBS fmt%d%c, ", format, pebs_type); x86_pmu.pebs = 0; } } diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 819d949..f6f50c4 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -51,7 +51,7 @@ void x86_init_rdrand(struct cpuinfo_x86 *c) for (i = 0; i < SANITY_CHECK_LOOPS; i++) { if (!rdrand_long(&tmp)) { clear_cpu_cap(c, X86_FEATURE_RDRAND); - printk_once(KERN_WARNING "rdrand: disabled\n"); + pr_warn_once("rdrand: disabled\n"); return; } } diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 4c60eaf..cd53135 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -87,10 +87,10 @@ void detect_extended_topology(struct cpuinfo_x86 *c) 
c->x86_max_cores = (core_level_siblings / smp_num_siblings); if (!printed) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + pr_info("CPU: Physical Processor ID: %d\n", c->phys_proc_id); if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", + pr_info("CPU: Processor Core ID: %d\n", c->cpu_core_id); printed = 1; } diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 252da7a..e3b4d18 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -33,7 +33,7 @@ static void init_transmeta(struct cpuinfo_x86 *c) if (max >= 0x80860001) { cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags); if (cpu_rev != 0x02000000) { - printk(KERN_INFO "CPU: Processor revision %u.%u.%u.%u, %u MHz\n", + pr_info("CPU: Processor revision %u.%u.%u.%u, %u MHz\n", (cpu_rev >> 24) & 0xff, (cpu_rev >> 16) & 0xff, (cpu_rev >> 8) & 0xff, @@ -44,10 +44,10 @@ static void init_transmeta(struct cpuinfo_x86 *c) if (max >= 0x80860002) { cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy); if (cpu_rev == 0x02000000) { - printk(KERN_INFO "CPU: Processor revision %08X, %u MHz\n", + pr_info("CPU: Processor revision %08X, %u MHz\n", new_cpu_rev, cpu_freq); } - printk(KERN_INFO "CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n", + pr_info("CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n", (cms_rev1 >> 24) & 0xff, (cms_rev1 >> 16) & 0xff, (cms_rev1 >> 8) & 0xff, @@ -76,7 +76,7 @@ static void init_transmeta(struct cpuinfo_x86 *c) (void *)&cpu_info[56], (void *)&cpu_info[60]); cpu_info[64] = '\0'; - printk(KERN_INFO "CPU: %s\n", cpu_info); + pr_info("CPU: %s\n", cpu_info); } /* Unhide possibly hidden capability flags */ diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 628a059..364e583 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -62,7 +62,7 @@ static unsigned long vmware_get_tsc_khz(void) tsc_hz = eax | (((uint64_t)ebx) << 32); do_div(tsc_hz, 1000); BUG_ON(tsc_hz >> 32); - printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", + pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n", (unsigned long) tsc_hz / 1000, (unsigned long) tsc_hz % 1000); @@ -84,8 +84,7 @@ static void __init vmware_platform_setup(void) if (ebx != UINT_MAX) x86_platform.calibrate_tsc = vmware_get_tsc_khz; else - printk(KERN_WARNING - "Failed to get TSC freq from the hypervisor\n"); + pr_warn("Failed to get TSC freq from the hypervisor\n"); } /* -- cgit v0.10.2 From 02afeaae9843733a39cd9b11053748b2d1dc5ae7 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Dec 2015 14:52:38 -0800 Subject: x86/boot: Fix early command-line parsing when matching at end The x86 early command line parsing in cmdline_find_option_bool() is buggy. If it matches a specified 'option' all the way to the end of the command-line, it will consider it a match. For instance, cmdline = "foo"; cmdline_find_option_bool(cmdline, "fool"); will return 1. This is particularly annoying since we have actual FPU options like "noxsave" and "noxsaves" So, command-line "foo bar noxsave" will match *BOTH* a "noxsave" and "noxsaves". (This turns out not to be an actual problem because "noxsave" implies "noxsaves", but it's still confusing.) To fix this, we simplify the code and stop tracking 'len'. 'len' was trying to indicate either the NULL terminator *OR* the end of a non-NULL-terminated command line at 'COMMAND_LINE_SIZE'. But, each of the three states is *already* checking 'cmdline' for a NULL terminator. 
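As an aside, the intended matching semantics are easy to pin down with a tiny user-space reference model. The sketch below is illustrative only: option_is_set() is a made-up helper, not the kernel implementation, and it simply encodes the rule this series establishes, namely that an option counts as present only when it appears on the command line as a whole, space-delimited word.

    #include <assert.h>
    #include <stdio.h>
    #include <string.h>

    /* Hypothetical reference model of the intended word-boundary semantics. */
    static int option_is_set(const char *cmdline, const char *option)
    {
            size_t optlen = strlen(option);
            const char *p = cmdline;

            while (*p) {
                    while (*p == ' ')       /* skip word separators */
                            p++;
                    if (!*p)
                            break;
                    /* the whole word must equal the option, up to space or NUL */
                    if (!strncmp(p, option, optlen) &&
                        (p[optlen] == ' ' || p[optlen] == '\0'))
                            return 1;
                    while (*p && *p != ' ') /* skip the rest of this word */
                            p++;
            }
            return 0;
    }

    int main(void)
    {
            assert(!option_is_set("foo", "fool"));        /* old code wrongly matched this */
            assert(option_is_set("foo bar noxsave", "noxsave"));
            assert(!option_is_set("foo bar noxsave", "noxsaves"));
            printf("word-boundary matching behaves as intended\n");
            return 0;
    }

With that behavior as the target, the fix itself is mostly a simplification.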
We _only_ need to check if we have overrun 'COMMAND_LINE_SIZE', and that we can do without keeping 'len' around. Also add some comments to clarify what is going on. Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: yu-cheng.yu@intel.com Link: http://lkml.kernel.org/r/20151222225238.9AEB560C@viggo.jf.intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 422db00..49548be 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -21,12 +21,14 @@ static inline int myisspace(u8 c) * @option: option string to look for * * Returns the position of that @option (starts counting with 1) - * or 0 on not found. + * or 0 on not found. @option will only be found if it is found + * as an entire word in @cmdline. For instance, if @option="car" + * then a cmdline which contains "cart" will not match. */ int cmdline_find_option_bool(const char *cmdline, const char *option) { char c; - int len, pos = 0, wstart = 0; + int pos = 0, wstart = 0; const char *opptr = NULL; enum { st_wordstart = 0, /* Start of word/after whitespace */ @@ -37,11 +39,14 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) if (!cmdline) return -1; /* No command line */ - len = min_t(int, strlen(cmdline), COMMAND_LINE_SIZE); - if (!len) + if (!strlen(cmdline)) return 0; - while (len--) { + /* + * This 'pos' check ensures we do not overrun + * a non-NULL-terminated 'cmdline' + */ + while (pos < COMMAND_LINE_SIZE) { c = *(char *)cmdline++; pos++; @@ -58,17 +63,26 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) /* fall through */ case st_wordcmp: - if (!*opptr) + if (!*opptr) { + /* + * We matched all the way to the end of the + * option we were looking for. If the + * command-line has a space _or_ ends, then + * we matched! + */ if (!c || myisspace(c)) return wstart; else state = st_wordskip; - else if (!c) + } else if (!c) { + /* + * Hit the NULL terminator on the end of + * cmdline. + */ return 0; - else if (c != *opptr++) + } else if (c != *opptr++) { state = st_wordskip; - else if (!len) /* last word and is matching */ - return wstart; + } break; case st_wordskip: -- cgit v0.10.2 From abcdc1c694fa4055323cbec1cde4c2cb6b68398c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Dec 2015 14:52:39 -0800 Subject: x86/boot: Fix early command-line parsing when partial word matches cmdline_find_option_bool() keeps track of position in two strings: 1. the command-line 2. the option we are searching for in the command-line We plow through each character in the command-line one at a time, always moving forward. We move forward in the option ('opptr') when we match characters in 'cmdline'. We reset the 'opptr' only when we go into the 'st_wordstart' state. But, if we fail to match an option because we see a space (state=st_wordcmp, *opptr!='\0', c=' '), we set state='st_wordskip' and 'break', moving to the next character. But, that move to the next character is the one *after* the ' '. This means that we will miss a 'st_wordstart' state. For instance, if we have cmdline = "foo fool"; and are searching for "fool", we have: "fool" opptr = ----^ "foo fool" c = --------^ We see that 'l' != ' ', set state=st_wordskip, break, and then move 'c', so: "foo fool" c = ---------^ and are still in state=st_wordskip. 
We will stay in wordskip until we have skipped "fool", thus missing the option we were looking for. This *only* happens when you have a partially-matching word followed by a matching one. To fix this, we always fall *into* the 'st_wordskip' state when we set it. Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: yu-cheng.yu@intel.com Link: http://lkml.kernel.org/r/20151222225239.8E1DCA58@viggo.jf.intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 49548be..ff8d1be 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -72,18 +72,26 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) */ if (!c || myisspace(c)) return wstart; - else - state = st_wordskip; + /* + * We hit the end of the option, but _not_ + * the end of a word on the cmdline. Not + * a match. + */ } else if (!c) { /* * Hit the NULL terminator on the end of * cmdline. */ return 0; - } else if (c != *opptr++) { - state = st_wordskip; + } else if (c == *opptr++) { + /* + * We are currently matching, so continue + * to the next character on the cmdline. + */ + break; } - break; + state = st_wordskip; + /* fall through */ case st_wordskip: if (!c) -- cgit v0.10.2 From 4de07ea481361b08fe13735004dafae862482d38 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Dec 2015 14:52:41 -0800 Subject: x86/boot: Simplify early command line parsing __cmdline_find_option_bool() tries to account for both NULL-terminated and non-NULL-terminated strings. It keeps 'pos' to look for the end of the buffer and also looks for '!c' in a bunch of places to look for NULL termination. But, it also calls strlen(). You can't call strlen on a non-NULL-terminated string. If !strlen(cmdline), then cmdline[0]=='\0'. In that case, we will go into the while() loop, set c='\0', hit st_wordstart, notice !c, and will immediately return 0. So, remove the strlen(). It is unnecessary and unsafe. Signed-off-by: Dave Hansen Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: yu-cheng.yu@intel.com Link: http://lkml.kernel.org/r/20151222225241.15365E43@viggo.jf.intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index ff8d1be..945a639 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -39,9 +39,6 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) if (!cmdline) return -1; /* No command line */ - if (!strlen(cmdline)) - return 0; - /* * This 'pos' check ensures we do not overrun * a non-NULL-terminated 'cmdline' -- cgit v0.10.2 From 8c0517759a1a100a8b83134cf3c7f254774aaeba Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Dec 2015 14:52:43 -0800 Subject: x86/boot: Pass in size to early cmdline parsing We will use this in a few patches to implement tests for early parsing. Signed-off-by: Dave Hansen [ Aligned args properly. ] Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: fenghua.yu@intel.com Cc: yu-cheng.yu@intel.com Link: http://lkml.kernel.org/r/20151222225243.5CC47EB6@viggo.jf.intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 945a639..5cc78bf 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -25,7 +25,9 @@ static inline int myisspace(u8 c) * as an entire word in @cmdline. For instance, if @option="car" * then a cmdline which contains "cart" will not match. */ -int cmdline_find_option_bool(const char *cmdline, const char *option) +static int +__cmdline_find_option_bool(const char *cmdline, int max_cmdline_size, + const char *option) { char c; int pos = 0, wstart = 0; @@ -43,7 +45,7 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) * This 'pos' check ensures we do not overrun * a non-NULL-terminated 'cmdline' */ - while (pos < COMMAND_LINE_SIZE) { + while (pos < max_cmdline_size) { c = *(char *)cmdline++; pos++; @@ -101,3 +103,8 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) return 0; /* Buffer overrun */ } + +int cmdline_find_option_bool(const char *cmdline, const char *option) +{ + return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); +} -- cgit v0.10.2 From 01441af5df438a171bce36bc3c7cfb588bc98a7a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:23:59 +0100 Subject: perf hists: Factor output_resort from hists__output_resort Currently hists__output_resort() depends on hists based on hists_evsel struct, but we need to be able to sort common hists as well. Cutting out the sorting base sorting code into output_resort function, so it can be reused in following patch. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-2-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 098310b..7797d06 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1197,19 +1197,13 @@ static void __hists__insert_output_entry(struct rb_root *entries, rb_insert_color(&he->rb_node, entries); } -void hists__output_resort(struct hists *hists, struct ui_progress *prog) +static void output_resort(struct hists *hists, struct ui_progress *prog, + bool use_callchain) { struct rb_root *root; struct rb_node *next; struct hist_entry *n; u64 min_callchain_hits; - struct perf_evsel *evsel = hists_to_evsel(hists); - bool use_callchain; - - if (evsel && symbol_conf.use_callchain && !symbol_conf.show_ref_callgraph) - use_callchain = evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN; - else - use_callchain = symbol_conf.use_callchain; min_callchain_hits = hists__total_period(hists) * (callchain_param.min_percent / 100); @@ -1239,6 +1233,19 @@ void hists__output_resort(struct hists *hists, struct ui_progress *prog) } } +void hists__output_resort(struct hists *hists, struct ui_progress *prog) +{ + struct perf_evsel *evsel = hists_to_evsel(hists); + bool use_callchain; + + if (evsel && symbol_conf.use_callchain && !symbol_conf.show_ref_callgraph) + use_callchain = evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN; + else + use_callchain = symbol_conf.use_callchain; + + output_resort(hists, prog, use_callchain); +} + static void hists__remove_entry_filter(struct hists *hists, struct hist_entry *h, enum hist_filter filter) { -- cgit v0.10.2 From 452ce03b1e686f0b2da6c1644dce7cdc71e3c69c Mon Sep 17 00:00:00 2001 
From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:00 +0100 Subject: perf hists: Introduce perf_evsel__output_resort function Adding evsel specific function to sort hists_evsel based hists. The hists__output_resort can be now used to sort common hists object. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-3-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index cc5c126..cfe3663 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -245,7 +245,7 @@ static int __cmd_annotate(struct perf_annotate *ann) hists__collapse_resort(hists, NULL); /* Don't sort callchain */ perf_evsel__reset_sample_bit(pos, CALLCHAIN); - hists__output_resort(hists, NULL); + perf_evsel__output_resort(pos, NULL); if (symbol_conf.event_group && !perf_evsel__is_group_leader(pos)) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 72ed0b4..54ce047 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -507,7 +507,7 @@ static void report__output_resort(struct report *rep) ui_progress__init(&prog, rep->nr_entries, "Sorting events for output..."); evlist__for_each(rep->session->evlist, pos) - hists__output_resort(evsel__hists(pos), &prog); + perf_evsel__output_resort(pos, &prog); ui_progress__finish(); } diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index bf01cbb..f1bbe2a 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -252,7 +252,8 @@ static void perf_top__print_sym_table(struct perf_top *top) char bf[160]; int printed = 0; const int win_width = top->winsize.ws_col - 1; - struct hists *hists = evsel__hists(top->sym_evsel); + struct perf_evsel *evsel = top->sym_evsel; + struct hists *hists = evsel__hists(evsel); puts(CONSOLE_CLEAR); @@ -288,7 +289,7 @@ static void perf_top__print_sym_table(struct perf_top *top) } hists__collapse_resort(hists, NULL); - hists__output_resort(hists, NULL); + perf_evsel__output_resort(evsel, NULL); hists__output_recalc_col_len(hists, top->print_entries - printed); putchar('\n'); @@ -540,6 +541,7 @@ static bool perf_top__handle_keypress(struct perf_top *top, int c) static void perf_top__sort_new_samples(void *arg) { struct perf_top *t = arg; + struct perf_evsel *evsel = t->sym_evsel; struct hists *hists; perf_top__reset_sample_counters(t); @@ -547,7 +549,7 @@ static void perf_top__sort_new_samples(void *arg) if (t->evlist->selected != NULL) t->sym_evsel = t->evlist->selected; - hists = evsel__hists(t->sym_evsel); + hists = evsel__hists(evsel); if (t->evlist->enabled) { if (t->zero) { @@ -559,7 +561,7 @@ static void perf_top__sort_new_samples(void *arg) } hists__collapse_resort(hists, NULL); - hists__output_resort(hists, NULL); + perf_evsel__output_resort(evsel, NULL); } static void *display_thread_tui(void *arg) diff --git a/tools/perf/tests/hists_cumulate.c b/tools/perf/tests/hists_cumulate.c index 5e6a86e..ecf136c 100644 --- a/tools/perf/tests/hists_cumulate.c +++ b/tools/perf/tests/hists_cumulate.c @@ -191,7 +191,7 @@ static int do_test(struct hists *hists, struct result *expected, size_t nr_expec * function since TEST_ASSERT_VAL() returns in case of failure. 
*/ hists__collapse_resort(hists, NULL); - hists__output_resort(hists, NULL); + perf_evsel__output_resort(hists_to_evsel(hists), NULL); if (verbose > 2) { pr_info("use callchain: %d, cumulate callchain: %d\n", diff --git a/tools/perf/tests/hists_filter.c b/tools/perf/tests/hists_filter.c index 351a424..34b945a 100644 --- a/tools/perf/tests/hists_filter.c +++ b/tools/perf/tests/hists_filter.c @@ -145,7 +145,7 @@ int test__hists_filter(int subtest __maybe_unused) struct hists *hists = evsel__hists(evsel); hists__collapse_resort(hists, NULL); - hists__output_resort(hists, NULL); + perf_evsel__output_resort(evsel, NULL); if (verbose > 2) { pr_info("Normal histogram\n"); diff --git a/tools/perf/tests/hists_output.c b/tools/perf/tests/hists_output.c index b231265..23cce67 100644 --- a/tools/perf/tests/hists_output.c +++ b/tools/perf/tests/hists_output.c @@ -156,7 +156,7 @@ static int test1(struct perf_evsel *evsel, struct machine *machine) goto out; hists__collapse_resort(hists, NULL); - hists__output_resort(hists, NULL); + perf_evsel__output_resort(evsel, NULL); if (verbose > 2) { pr_info("[fields = %s, sort = %s]\n", field_order, sort_order); @@ -256,7 +256,7 @@ static int test2(struct perf_evsel *evsel, struct machine *machine) goto out; hists__collapse_resort(hists, NULL); - hists__output_resort(hists, NULL); + perf_evsel__output_resort(evsel, NULL); if (verbose > 2) { pr_info("[fields = %s, sort = %s]\n", field_order, sort_order); @@ -310,7 +310,7 @@ static int test3(struct perf_evsel *evsel, struct machine *machine) goto out; hists__collapse_resort(hists, NULL); - hists__output_resort(hists, NULL); + perf_evsel__output_resort(evsel, NULL); if (verbose > 2) { pr_info("[fields = %s, sort = %s]\n", field_order, sort_order); @@ -388,7 +388,7 @@ static int test4(struct perf_evsel *evsel, struct machine *machine) goto out; hists__collapse_resort(hists, NULL); - hists__output_resort(hists, NULL); + perf_evsel__output_resort(evsel, NULL); if (verbose > 2) { pr_info("[fields = %s, sort = %s]\n", field_order, sort_order); @@ -491,7 +491,7 @@ static int test5(struct perf_evsel *evsel, struct machine *machine) goto out; hists__collapse_resort(hists, NULL); - hists__output_resort(hists, NULL); + perf_evsel__output_resort(evsel, NULL); if (verbose > 2) { pr_info("[fields = %s, sort = %s]\n", field_order, sort_order); diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 7797d06..d07955c 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1233,9 +1233,8 @@ static void output_resort(struct hists *hists, struct ui_progress *prog, } } -void hists__output_resort(struct hists *hists, struct ui_progress *prog) +void perf_evsel__output_resort(struct perf_evsel *evsel, struct ui_progress *prog) { - struct perf_evsel *evsel = hists_to_evsel(hists); bool use_callchain; if (evsel && symbol_conf.use_callchain && !symbol_conf.show_ref_callgraph) @@ -1243,7 +1242,12 @@ void hists__output_resort(struct hists *hists, struct ui_progress *prog) else use_callchain = symbol_conf.use_callchain; - output_resort(hists, prog, use_callchain); + output_resort(evsel__hists(evsel), prog, use_callchain); +} + +void hists__output_resort(struct hists *hists, struct ui_progress *prog) +{ + output_resort(hists, prog, symbol_conf.use_callchain); } static void hists__remove_entry_filter(struct hists *hists, struct hist_entry *h, diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index d4ec482..bc24997 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -128,6 +128,7 @@ int 
hist_entry__sort_snprintf(struct hist_entry *he, char *bf, size_t size, struct hists *hists); void hist_entry__delete(struct hist_entry *he); +void perf_evsel__output_resort(struct perf_evsel *evsel, struct ui_progress *prog); void hists__output_resort(struct hists *hists, struct ui_progress *prog); void hists__collapse_resort(struct hists *hists, struct ui_progress *prog); -- cgit v0.10.2 From b21a763edd5f832c6d966d9e60376f3d21009859 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:01 +0100 Subject: perf hists: Add _idx fields into struct perf_hpp_fmt Currently there's no way of comparing hpp format entries, which is needed in following patches. Adding _idx fields into struct perf_hpp_fmt to recognize and be able to compare hpp format entries. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-4-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index bf2a66e..d392801 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -371,7 +371,7 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused, return 0; } -#define HPP__COLOR_PRINT_FNS(_name, _fn) \ +#define HPP__COLOR_PRINT_FNS(_name, _fn, _idx) \ { \ .name = _name, \ .header = hpp__header_fn, \ @@ -381,9 +381,10 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused, .cmp = hpp__nop_cmp, \ .collapse = hpp__nop_cmp, \ .sort = hpp__sort_ ## _fn, \ + .idx = PERF_HPP__ ## _idx, \ } -#define HPP__COLOR_ACC_PRINT_FNS(_name, _fn) \ +#define HPP__COLOR_ACC_PRINT_FNS(_name, _fn, _idx) \ { \ .name = _name, \ .header = hpp__header_fn, \ @@ -393,9 +394,10 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused, .cmp = hpp__nop_cmp, \ .collapse = hpp__nop_cmp, \ .sort = hpp__sort_ ## _fn, \ + .idx = PERF_HPP__ ## _idx, \ } -#define HPP__PRINT_FNS(_name, _fn) \ +#define HPP__PRINT_FNS(_name, _fn, _idx) \ { \ .name = _name, \ .header = hpp__header_fn, \ @@ -404,17 +406,18 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused, .cmp = hpp__nop_cmp, \ .collapse = hpp__nop_cmp, \ .sort = hpp__sort_ ## _fn, \ + .idx = PERF_HPP__ ## _idx, \ } struct perf_hpp_fmt perf_hpp__format[] = { - HPP__COLOR_PRINT_FNS("Overhead", overhead), - HPP__COLOR_PRINT_FNS("sys", overhead_sys), - HPP__COLOR_PRINT_FNS("usr", overhead_us), - HPP__COLOR_PRINT_FNS("guest sys", overhead_guest_sys), - HPP__COLOR_PRINT_FNS("guest usr", overhead_guest_us), - HPP__COLOR_ACC_PRINT_FNS("Children", overhead_acc), - HPP__PRINT_FNS("Samples", samples), - HPP__PRINT_FNS("Period", period) + HPP__COLOR_PRINT_FNS("Overhead", overhead, OVERHEAD), + HPP__COLOR_PRINT_FNS("sys", overhead_sys, OVERHEAD_SYS), + HPP__COLOR_PRINT_FNS("usr", overhead_us, OVERHEAD_US), + HPP__COLOR_PRINT_FNS("guest sys", overhead_guest_sys, OVERHEAD_GUEST_SYS), + HPP__COLOR_PRINT_FNS("guest usr", overhead_guest_us, OVERHEAD_GUEST_US), + HPP__COLOR_ACC_PRINT_FNS("Children", overhead_acc, OVERHEAD_ACC), + HPP__PRINT_FNS("Samples", samples, SAMPLES), + HPP__PRINT_FNS("Period", period, PERIOD) }; LIST_HEAD(perf_hpp__list); diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index bc24997..8a0cbde 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -221,6 +221,7 @@ struct perf_hpp_fmt { bool elide; int len; int user_len; + int idx; }; extern struct list_head perf_hpp__list; -- cgit v0.10.2 From 2e8b79e706f504801fbce19fa9f16f3c858a105e Mon Sep 17 00:00:00 2001 From: 
Jiri Olsa Date: Mon, 18 Jan 2016 10:24:02 +0100 Subject: perf hists: Use struct perf_hpp_fmt::idx in perf_hpp__reset_width We are going to add dynamic hpp format fields, so we need to make the 'len' change for the format itself, not in the perf_hpp__format template. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-5-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index d392801..5a11bf0 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -629,20 +629,12 @@ unsigned int hists__sort_list_width(struct hists *hists) void perf_hpp__reset_width(struct perf_hpp_fmt *fmt, struct hists *hists) { - int idx; - if (perf_hpp__is_sort_entry(fmt)) return perf_hpp__reset_sort_width(fmt, hists); - for (idx = 0; idx < PERF_HPP__MAX_INDEX; idx++) { - if (fmt == &perf_hpp__format[idx]) - break; - } - - if (idx == PERF_HPP__MAX_INDEX) - return; + BUG_ON(fmt->idx >= PERF_HPP__MAX_INDEX); - switch (idx) { + switch (fmt->idx) { case PERF_HPP__OVERHEAD: case PERF_HPP__OVERHEAD_SYS: case PERF_HPP__OVERHEAD_US: -- cgit v0.10.2 From 97358084b91e94e5f8fcf0379f0430c0ea16bd3b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:03 +0100 Subject: perf hists: Add 'equal' method to perf_hpp_fmt struct To easily compare format entries and make it available for all kinds of format entries. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-6-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 5a11bf0..71c8bb7 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -524,6 +524,11 @@ void perf_hpp__cancel_cumulate(void) perf_hpp__format[PERF_HPP__OVERHEAD].name = "Overhead"; } +static bool fmt_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b) +{ + return a->equal && a->equal(a, b); +} + void perf_hpp__setup_output_field(void) { struct perf_hpp_fmt *fmt; @@ -542,7 +547,7 @@ void perf_hpp__setup_output_field(void) struct perf_hpp_fmt *pos; perf_hpp__for_each_format(pos) { - if (perf_hpp__same_sort_entry(pos, fmt)) + if (fmt_equal(fmt, pos)) goto next; } } @@ -571,7 +576,7 @@ void perf_hpp__append_sort_keys(void) struct perf_hpp_fmt *pos; perf_hpp__for_each_sort_list(pos) { - if (perf_hpp__same_sort_entry(pos, fmt)) + if (fmt_equal(fmt, pos)) goto next; } } diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 8a0cbde..9a240d7 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -215,6 +215,7 @@ struct perf_hpp_fmt { struct hist_entry *a, struct hist_entry *b); int64_t (*sort)(struct perf_hpp_fmt *fmt, struct hist_entry *a, struct hist_entry *b); + bool (*equal)(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b); struct list_head list; struct list_head sort_list; @@ -268,7 +269,6 @@ void perf_hpp__reset_output_field(void); void perf_hpp__append_sort_keys(void); bool perf_hpp__is_sort_entry(struct perf_hpp_fmt *format); -bool perf_hpp__same_sort_entry(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b); bool perf_hpp__is_dynamic_entry(struct perf_hpp_fmt *format); bool perf_hpp__defined_dynamic_entry(struct perf_hpp_fmt *fmt, struct hists *hists); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 898e4b0..170f7f7 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1441,20 +1441,6 @@ struct hpp_sort_entry { struct sort_entry *se; }; 
-bool perf_hpp__same_sort_entry(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b) -{ - struct hpp_sort_entry *hse_a; - struct hpp_sort_entry *hse_b; - - if (!perf_hpp__is_sort_entry(a) || !perf_hpp__is_sort_entry(b)) - return false; - - hse_a = container_of(a, struct hpp_sort_entry, hpp); - hse_b = container_of(b, struct hpp_sort_entry, hpp); - - return hse_a->se == hse_b->se; -} - void perf_hpp__reset_sort_width(struct perf_hpp_fmt *fmt, struct hists *hists) { struct hpp_sort_entry *hse; @@ -1540,6 +1526,25 @@ static int64_t __sort__hpp_sort(struct perf_hpp_fmt *fmt, return sort_fn(a, b); } +bool perf_hpp__is_sort_entry(struct perf_hpp_fmt *format) +{ + return format->header == __sort__hpp_header; +} + +static bool __sort__hpp_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b) +{ + struct hpp_sort_entry *hse_a; + struct hpp_sort_entry *hse_b; + + if (!perf_hpp__is_sort_entry(a) || !perf_hpp__is_sort_entry(b)) + return false; + + hse_a = container_of(a, struct hpp_sort_entry, hpp); + hse_b = container_of(b, struct hpp_sort_entry, hpp); + + return hse_a->se == hse_b->se; +} + static struct hpp_sort_entry * __sort_dimension__alloc_hpp(struct sort_dimension *sd) { @@ -1561,6 +1566,7 @@ __sort_dimension__alloc_hpp(struct sort_dimension *sd) hse->hpp.cmp = __sort__hpp_cmp; hse->hpp.collapse = __sort__hpp_collapse; hse->hpp.sort = __sort__hpp_sort; + hse->hpp.equal = __sort__hpp_equal; INIT_LIST_HEAD(&hse->hpp.list); INIT_LIST_HEAD(&hse->hpp.sort_list); @@ -1571,11 +1577,6 @@ __sort_dimension__alloc_hpp(struct sort_dimension *sd) return hse; } -bool perf_hpp__is_sort_entry(struct perf_hpp_fmt *format) -{ - return format->header == __sort__hpp_header; -} - static int __sort_dimension__add_hpp_sort(struct sort_dimension *sd) { struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd); -- cgit v0.10.2 From c0020efa079c5fc2388945ae7e856b362731442d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:04 +0100 Subject: perf hists: Add 'hpp__equal' callback function Adding 'hpp__equal' callback function to compare hpp output format entries. 
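An aside on the shape of this change: comparison is delegated to the entries themselves through the 'equal' callback, and built-in hpp columns are recognised by a shared callback pointer and then compared by their 'idx', as the hunk below shows. The following sketch is a simplified, self-contained illustration of that pattern: struct fmt, hpp_header() and the columns in main() are stand-ins invented for the example, while fmt_equal() mirrors the generic helper introduced earlier in this series.

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified stand-ins for the pattern; not the real perf structures. */
    struct fmt {
            void (*header)(struct fmt *f);  /* identifies the entry kind */
            int idx;                        /* which built-in column this is */
            bool (*equal)(struct fmt *a, struct fmt *b);
    };

    static void hpp_header(struct fmt *f) { (void)f; }

    static bool is_hpp_entry(struct fmt *f)
    {
            /* kind check: built-in columns share the same header callback */
            return f->header == hpp_header;
    }

    static bool hpp_equal(struct fmt *a, struct fmt *b)
    {
            if (!is_hpp_entry(a) || !is_hpp_entry(b))
                    return false;
            return a->idx == b->idx;        /* same built-in column? */
    }

    /* generic comparison used by setup code, mirroring fmt_equal() */
    static bool fmt_equal(struct fmt *a, struct fmt *b)
    {
            return a->equal && a->equal(a, b);
    }

    int main(void)
    {
            struct fmt overhead   = { hpp_header, 0, hpp_equal };
            struct fmt overhead_2 = { hpp_header, 0, hpp_equal };
            struct fmt samples    = { hpp_header, 6, hpp_equal };

            printf("%d %d\n", fmt_equal(&overhead, &overhead_2),   /* 1 */
                              fmt_equal(&overhead, &samples));     /* 0 */
            return 0;
    }

The payoff is that callers never need to know what kind of format entry they are holding; they only call the generic comparison.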
Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-7-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 71c8bb7..b543f4b 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -371,6 +371,19 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused, return 0; } +static bool perf_hpp__is_hpp_entry(struct perf_hpp_fmt *a) +{ + return a->header == hpp__header_fn; +} + +static bool hpp__equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b) +{ + if (!perf_hpp__is_hpp_entry(a) || !perf_hpp__is_hpp_entry(b)) + return false; + + return a->idx == b->idx; +} + #define HPP__COLOR_PRINT_FNS(_name, _fn, _idx) \ { \ .name = _name, \ @@ -382,6 +395,7 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused, .collapse = hpp__nop_cmp, \ .sort = hpp__sort_ ## _fn, \ .idx = PERF_HPP__ ## _idx, \ + .equal = hpp__equal, \ } #define HPP__COLOR_ACC_PRINT_FNS(_name, _fn, _idx) \ @@ -395,6 +409,7 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused, .collapse = hpp__nop_cmp, \ .sort = hpp__sort_ ## _fn, \ .idx = PERF_HPP__ ## _idx, \ + .equal = hpp__equal, \ } #define HPP__PRINT_FNS(_name, _fn, _idx) \ @@ -407,6 +422,7 @@ static int64_t hpp__nop_cmp(struct perf_hpp_fmt *fmt __maybe_unused, .collapse = hpp__nop_cmp, \ .sort = hpp__sort_ ## _fn, \ .idx = PERF_HPP__ ## _idx, \ + .equal = hpp__equal, \ } struct perf_hpp_fmt perf_hpp__format[] = { -- cgit v0.10.2 From 3f931f2c4274565fd6c6a642b16387358cbe6266 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:05 +0100 Subject: perf hists: Make hpp setup function generic Now that we have the 'equal' method implemented for hpp format entries we can ease up the logic in the following functions and make them generic wrt comparing format entries: perf_hpp__setup_output_field perf_hpp__append_sort_keys Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-8-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index b543f4b..b0fcaec 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -551,21 +551,11 @@ void perf_hpp__setup_output_field(void) /* append sort keys to output field */ perf_hpp__for_each_sort_list(fmt) { - if (!list_empty(&fmt->list)) - continue; + struct perf_hpp_fmt *pos; - /* - * sort entry fields are dynamically created, - * so they can share a same sort key even though - * the list is empty. - */ - if (perf_hpp__is_sort_entry(fmt)) { - struct perf_hpp_fmt *pos; - - perf_hpp__for_each_format(pos) { - if (fmt_equal(fmt, pos)) - goto next; - } + perf_hpp__for_each_format(pos) { + if (fmt_equal(fmt, pos)) + goto next; } perf_hpp__column_register(fmt); @@ -580,21 +570,11 @@ void perf_hpp__append_sort_keys(void) /* append output fields to sort keys */ perf_hpp__for_each_format(fmt) { - if (!list_empty(&fmt->sort_list)) - continue; + struct perf_hpp_fmt *pos; - /* - * sort entry fields are dynamically created, - * so they can share a same sort key even though - * the list is empty. 
- */ - if (perf_hpp__is_sort_entry(fmt)) { - struct perf_hpp_fmt *pos; - - perf_hpp__for_each_sort_list(pos) { - if (fmt_equal(fmt, pos)) - goto next; - } + perf_hpp__for_each_sort_list(pos) { + if (fmt_equal(fmt, pos)) + goto next; } perf_hpp__register_sort_field(fmt); -- cgit v0.10.2 From 9887804d01abf7a4e03cfd6be0312d0a5c4e4aba Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:06 +0100 Subject: perf report: Move UI initialization ahead of sort setup The ui initialization changes hpp format callbacks, based on the used browser. Thus we need this init being processed before setup_sorting. Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-9-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 54ce047..1eab50a 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -912,15 +912,6 @@ repeat: symbol_conf.cumulate_callchain = false; } - if (setup_sorting(session->evlist) < 0) { - if (sort_order) - parse_options_usage(report_usage, options, "s", 1); - if (field_order) - parse_options_usage(sort_order ? NULL : report_usage, - options, "F", 1); - goto error; - } - /* Force tty output for header output and per-thread stat. */ if (report.header || report.header_only || report.show_threads) use_browser = 0; @@ -930,6 +921,15 @@ repeat: else use_browser = 0; + if (setup_sorting(session->evlist) < 0) { + if (sort_order) + parse_options_usage(report_usage, options, "s", 1); + if (field_order) + parse_options_usage(sort_order ? NULL : report_usage, + options, "F", 1); + goto error; + } + if (report.header || report.header_only) { perf_session__fprintf_info(session, stdout, report.show_full_info); -- cgit v0.10.2 From 3ee60c3b18bd4bf30ea9b70e7542116bb5c205ba Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 18 Jan 2016 10:24:06 +0100 Subject: perf top: Move UI initialization ahead of sort setup The ui initialization changes hpp format callbacks, based on the used browser. Thus we need this init being processed before setup_sorting. Replica of a patch by Jiri for 'perf report'. 
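To make the ordering requirement concrete, here is a small stand-alone sketch of the hazard being avoided. It is a deliberately simplified toy, not perf code: ui_init(), stdio_color(), tui_color() and the idea that setup_sorting() snapshots a format template are assumptions made for illustration. The point is only that any step which copies or registers callbacks before the browser-specific init has patched them will be left holding stale pointers.

    #include <stdio.h>
    #include <string.h>

    struct fmt { void (*color)(void); };                /* hypothetical format entry */

    static void stdio_color(void) { puts("plain output"); }
    static void tui_color(void)   { puts("TUI colored output"); }

    static struct fmt fmt_template = { stdio_color };   /* compile-time default */
    static struct fmt column;                            /* what output code uses */

    static void ui_init(int use_tui)
    {
            /* browser-specific init swaps callbacks in the template */
            if (use_tui)
                    fmt_template.color = tui_color;
    }

    static void setup_sorting(void)
    {
            /* takes a snapshot of the template for the output column */
            memcpy(&column, &fmt_template, sizeof(column));
    }

    int main(void)
    {
            /* wrong order: the snapshot keeps the stale stdio callback */
            setup_sorting();
            ui_init(1);
            column.color();         /* prints "plain output" */

            /* right order, as enforced above: init first, then setup */
            ui_init(1);
            setup_sorting();
            column.color();         /* prints "TUI colored output" */
            return 0;
    }

Running the wrong-order half first shows the stale behaviour, which is the kind of mismatch the reordering in builtin-report.c and builtin-top.c avoids.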
Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-9-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index f1bbe2a..a75de39 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1245,6 +1245,13 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) /* display thread wants entries to be collapsed in a different tree */ sort__need_collapse = 1; + if (top.use_stdio) + use_browser = 0; + else if (top.use_tui) + use_browser = 1; + + setup_browser(false); + if (setup_sorting(top.evlist) < 0) { if (sort_order) parse_options_usage(top_usage, options, "s", 1); @@ -1254,13 +1261,6 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) goto out_delete_evlist; } - if (top.use_stdio) - use_browser = 0; - else if (top.use_tui) - use_browser = 1; - - setup_browser(false); - status = target__validate(target); if (status) { target__strerror(target, status, errbuf, BUFSIZ); -- cgit v0.10.2 From 1945c3e734cd1f01535dc76de47c38bbe9a87352 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:07 +0100 Subject: perf hists: Allocate output sort field Currently we use static output fields, because we have single global list of all sort/output fields. We will add hists specific sort and output lists in following patches, so we need all format entries to be dynamically allocated. Adding support to allocate output sort field. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-10-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index b0fcaec..c877c52 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -533,11 +533,23 @@ void perf_hpp__column_disable(unsigned col) void perf_hpp__cancel_cumulate(void) { + struct perf_hpp_fmt *fmt, *acc, *ovh, *tmp; + if (is_strict_order(field_order)) return; - perf_hpp__column_disable(PERF_HPP__OVERHEAD_ACC); - perf_hpp__format[PERF_HPP__OVERHEAD].name = "Overhead"; + ovh = &perf_hpp__format[PERF_HPP__OVERHEAD]; + acc = &perf_hpp__format[PERF_HPP__OVERHEAD_ACC]; + + perf_hpp__for_each_format_safe(fmt, tmp) { + if (acc->equal(acc, fmt)) { + perf_hpp__column_unregister(fmt); + continue; + } + + if (ovh->equal(ovh, fmt)) + fmt->name = "Overhead"; + } } static bool fmt_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b) diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 170f7f7..52e4a36 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1577,6 +1577,19 @@ __sort_dimension__alloc_hpp(struct sort_dimension *sd) return hse; } +static struct perf_hpp_fmt *__hpp_dimension__alloc_hpp(struct hpp_dimension *hd) +{ + struct perf_hpp_fmt *fmt; + + fmt = memdup(hd->fmt, sizeof(*fmt)); + if (fmt) { + INIT_LIST_HEAD(&fmt->list); + INIT_LIST_HEAD(&fmt->sort_list); + } + + return fmt; +} + static int __sort_dimension__add_hpp_sort(struct sort_dimension *sd) { struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd); @@ -2066,11 +2079,17 @@ static int __sort_dimension__add(struct sort_dimension *sd) static int __hpp_dimension__add(struct hpp_dimension *hd) { - if (!hd->taken) { - hd->taken = 1; + struct perf_hpp_fmt *fmt; - perf_hpp__register_sort_field(hd->fmt); - } + if (hd->taken) + return 0; + + fmt = __hpp_dimension__alloc_hpp(hd); + if (!fmt) + return -1; + + hd->taken = 
1; + perf_hpp__register_sort_field(fmt); return 0; } @@ -2088,11 +2107,17 @@ static int __sort_dimension__add_output(struct sort_dimension *sd) static int __hpp_dimension__add_output(struct hpp_dimension *hd) { - if (!hd->taken) { - hd->taken = 1; + struct perf_hpp_fmt *fmt; - perf_hpp__column_register(hd->fmt); - } + if (hd->taken) + return 0; + + fmt = __hpp_dimension__alloc_hpp(hd); + if (!fmt) + return -1; + + hd->taken = 1; + perf_hpp__column_register(fmt); return 0; } -- cgit v0.10.2 From 12cb4397fb398545207acf772b219bd751786015 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:08 +0100 Subject: perf hists: Remove perf_hpp__column_(disable|enable) Those functions are no longer needed. They operate over perf_hpp__format array which is now used only as template for dynamic entries. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-11-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index c877c52..80d63a9 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -519,18 +519,6 @@ void perf_hpp__register_sort_field(struct perf_hpp_fmt *format) list_add_tail(&format->sort_list, &perf_hpp__sort_list); } -void perf_hpp__column_enable(unsigned col) -{ - BUG_ON(col >= PERF_HPP__MAX_INDEX); - perf_hpp__column_register(&perf_hpp__format[col]); -} - -void perf_hpp__column_disable(unsigned col) -{ - BUG_ON(col >= PERF_HPP__MAX_INDEX); - perf_hpp__column_unregister(&perf_hpp__format[col]); -} - void perf_hpp__cancel_cumulate(void) { struct perf_hpp_fmt *fmt, *acc, *ovh, *tmp; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 9a240d7..1f9e21dd 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -259,8 +259,6 @@ enum { void perf_hpp__init(void); void perf_hpp__column_register(struct perf_hpp_fmt *format); void perf_hpp__column_unregister(struct perf_hpp_fmt *format); -void perf_hpp__column_enable(unsigned col); -void perf_hpp__column_disable(unsigned col); void perf_hpp__cancel_cumulate(void); void perf_hpp__register_sort_field(struct perf_hpp_fmt *format); -- cgit v0.10.2 From 564132f3116cf376fdc04b2380e621f35efbb6c7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:09 +0100 Subject: perf hists: Properly release format fields With multiple list holding format entries, we need the support properly releasing format output/sort fields. 
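The hunks that follow wire a ->free() callback into each format entry and release entries through it during reset. As a rough, self-contained sketch of that destructor-callback idiom (struct fmt, struct sort_entry, sort_entry_free and fmt_free below are simplified stand-ins, not the perf definitions):

        #include <stdlib.h>
        #include <stddef.h>

        #define container_of(ptr, type, member) \
                ((type *)((char *)(ptr) - offsetof(type, member)))

        struct fmt {
                const char *name;
                void (*free)(struct fmt *fmt);
        };

        struct sort_entry {             /* stand-in for hpp_sort_entry */
                int taken;
                struct fmt hpp;         /* format embedded in a larger object */
        };

        static void sort_entry_free(struct fmt *fmt)
        {
                /* Recover the enclosing allocation before freeing it. */
                free(container_of(fmt, struct sort_entry, hpp));
        }

        static void fmt_free(struct fmt *fmt)
        {
                if (fmt->free)
                        fmt->free(fmt);
        }

        int main(void)
        {
                struct sort_entry *se = calloc(1, sizeof(*se));

                if (!se)
                        return 1;
                se->hpp.name = "overhead";
                se->hpp.free = sort_entry_free;

                /* The generic teardown path only ever sees the embedded fmt. */
                fmt_free(&se->hpp);
                return 0;
        }

Each allocation site supplies the callback that knows how to undo its own allocation, so the generic list-teardown code never needs to know which concrete entry type it is dropping.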
Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-12-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 80d63a9..2cd1a03 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -583,6 +583,12 @@ next: } } +static void fmt_free(struct perf_hpp_fmt *fmt) +{ + if (fmt->free) + fmt->free(fmt); +} + void perf_hpp__reset_output_field(void) { struct perf_hpp_fmt *fmt, *tmp; @@ -591,12 +597,14 @@ void perf_hpp__reset_output_field(void) perf_hpp__for_each_format_safe(fmt, tmp) { list_del_init(&fmt->list); list_del_init(&fmt->sort_list); + fmt_free(fmt); } /* reset sort keys */ perf_hpp__for_each_sort_list_safe(fmt, tmp) { list_del_init(&fmt->list); list_del_init(&fmt->sort_list); + fmt_free(fmt); } } diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 1f9e21dd..f3bcf2d 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -216,6 +216,7 @@ struct perf_hpp_fmt { int64_t (*sort)(struct perf_hpp_fmt *fmt, struct hist_entry *a, struct hist_entry *b); bool (*equal)(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b); + void (*free)(struct perf_hpp_fmt *fmt); struct list_head list; struct list_head sort_list; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 52e4a36..b5389a5 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1545,6 +1545,14 @@ static bool __sort__hpp_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b) return hse_a->se == hse_b->se; } +static void hse_free(struct perf_hpp_fmt *fmt) +{ + struct hpp_sort_entry *hse; + + hse = container_of(fmt, struct hpp_sort_entry, hpp); + free(hse); +} + static struct hpp_sort_entry * __sort_dimension__alloc_hpp(struct sort_dimension *sd) { @@ -1567,6 +1575,7 @@ __sort_dimension__alloc_hpp(struct sort_dimension *sd) hse->hpp.collapse = __sort__hpp_collapse; hse->hpp.sort = __sort__hpp_sort; hse->hpp.equal = __sort__hpp_equal; + hse->hpp.free = hse_free; INIT_LIST_HEAD(&hse->hpp.list); INIT_LIST_HEAD(&hse->hpp.sort_list); @@ -1577,6 +1586,11 @@ __sort_dimension__alloc_hpp(struct sort_dimension *sd) return hse; } +static void hpp_free(struct perf_hpp_fmt *fmt) +{ + free(fmt); +} + static struct perf_hpp_fmt *__hpp_dimension__alloc_hpp(struct hpp_dimension *hd) { struct perf_hpp_fmt *fmt; @@ -1585,6 +1599,7 @@ static struct perf_hpp_fmt *__hpp_dimension__alloc_hpp(struct hpp_dimension *hd) if (fmt) { INIT_LIST_HEAD(&fmt->list); INIT_LIST_HEAD(&fmt->sort_list); + fmt->free = hpp_free; } return fmt; @@ -1818,6 +1833,14 @@ bool perf_hpp__is_dynamic_entry(struct perf_hpp_fmt *fmt) return fmt->cmp == __sort__hde_cmp; } +static void hde_free(struct perf_hpp_fmt *fmt) +{ + struct hpp_dynamic_entry *hde; + + hde = container_of(fmt, struct hpp_dynamic_entry, hpp); + free(hde); +} + static struct hpp_dynamic_entry * __alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field) { @@ -1842,6 +1865,7 @@ __alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field) hde->hpp.cmp = __sort__hde_cmp; hde->hpp.collapse = __sort__hde_cmp; hde->hpp.sort = __sort__hde_cmp; + hde->hpp.free = hde_free; INIT_LIST_HEAD(&hde->hpp.list); INIT_LIST_HEAD(&hde->hpp.sort_list); -- cgit v0.10.2 From 2fbaa39079672bf52a9208ec1263385b48933cc3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:10 +0100 Subject: perf hists: Separate sort fields parsing into setup_sort_list function Separating sort fields parsing into 
setup_sort_list function, so it's separated from sort_order string setup and could be reused later in following patches. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-13-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index b5389a5..ab1c21a 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2241,6 +2241,26 @@ static int sort_dimension__add(const char *tok, return -ESRCH; } +static int setup_sort_list(char *str, struct perf_evlist *evlist) +{ + char *tmp, *tok; + int ret = 0; + + for (tok = strtok_r(str, ", ", &tmp); + tok; tok = strtok_r(NULL, ", ", &tmp)) { + ret = sort_dimension__add(tok, evlist); + if (ret == -EINVAL) { + error("Invalid --sort key: `%s'", tok); + break; + } else if (ret == -ESRCH) { + error("Unknown --sort key: `%s'", tok); + break; + } + } + + return ret; +} + static const char *get_default_sort_order(struct perf_evlist *evlist) { const char *default_sort_orders[] = { @@ -2335,7 +2355,7 @@ static char *setup_overhead(char *keys) static int __setup_sorting(struct perf_evlist *evlist) { - char *tmp, *tok, *str; + char *str; const char *sort_keys; int ret = 0; @@ -2373,17 +2393,7 @@ static int __setup_sorting(struct perf_evlist *evlist) } } - for (tok = strtok_r(str, ", ", &tmp); - tok; tok = strtok_r(NULL, ", ", &tmp)) { - ret = sort_dimension__add(tok, evlist); - if (ret == -EINVAL) { - error("Invalid --sort key: `%s'", tok); - break; - } else if (ret == -ESRCH) { - error("Unknown --sort key: `%s'", tok); - break; - } - } + ret = setup_sort_list(str, evlist); free(str); return ret; -- cgit v0.10.2 From 6d3375efebe906ad0ce55ddaa883bf41fd8c444b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:11 +0100 Subject: perf hists: Separate output fields parsing into setup_output_list function Separating output fields parsing into setup_output_list function, so it's separated from field_order string setup and could be reused later in following patches. 
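The diff below is the real implementation; as a minimal standalone sketch of the same strtok_r() tokenizing loop it is built around, with setup_list() and add_key() as hypothetical stand-ins for setup_output_list() and output_field_add():

        #include <stdio.h>
        #include <string.h>

        /* Hypothetical per-token handler; output_field_add() plays this role. */
        static int add_key(const char *tok)
        {
                printf("adding key '%s'\n", tok);
                return 0;
        }

        static int setup_list(char *str)
        {
                char *tmp, *tok;
                int ret = 0;

                for (tok = strtok_r(str, ", ", &tmp);
                     tok; tok = strtok_r(NULL, ", ", &tmp)) {
                        ret = add_key(tok);
                        if (ret)
                                break;  /* stop at the first rejected key */
                }

                return ret;
        }

        int main(void)
        {
                char keys[] = "comm,dso, sym"; /* strtok_r() modifies its input */

                return setup_list(keys);
        }

Keeping the tokenizing loop in its own function is what lets later patches point it at an arbitrary list instead of the global one.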
Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-14-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index ab1c21a..36dbd55 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2535,6 +2535,26 @@ static int output_field_add(char *tok) return -ESRCH; } +static int setup_output_list(char *str) +{ + char *tmp, *tok; + int ret = 0; + + for (tok = strtok_r(str, ", ", &tmp); + tok; tok = strtok_r(NULL, ", ", &tmp)) { + ret = output_field_add(tok); + if (ret == -EINVAL) { + error("Invalid --fields key: `%s'", tok); + break; + } else if (ret == -ESRCH) { + error("Unknown --fields key: `%s'", tok); + break; + } + } + + return ret; +} + static void reset_dimensions(void) { unsigned int i; @@ -2559,7 +2579,7 @@ bool is_strict_order(const char *order) static int __setup_output_field(void) { - char *tmp, *tok, *str, *strp; + char *str, *strp; int ret = -EINVAL; if (field_order == NULL) @@ -2579,17 +2599,7 @@ static int __setup_output_field(void) goto out; } - for (tok = strtok_r(strp, ", ", &tmp); - tok; tok = strtok_r(NULL, ", ", &tmp)) { - ret = output_field_add(tok); - if (ret == -EINVAL) { - error("Invalid --fields key: `%s'", tok); - break; - } else if (ret == -ESRCH) { - error("Unknown --fields key: `%s'", tok); - break; - } - } + ret = setup_output_list(strp); out: free(str); -- cgit v0.10.2 From 7c31e10266bd18de163d5c60899591c0540bb002 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:12 +0100 Subject: perf hists: Introduce struct perf_hpp_list Gather output and sort lists under struct perf_hpp_list, so we could have multiple instancies of sort/output format entries. Replacing current perf_hpp__list and perf_hpp__sort_list lists with single perf_hpp_list instance. 
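The core of the change is bundling two independent list heads in one object with a compile-time initializer. A self-contained sketch of that shape, using simplified stand-ins for struct list_head, LIST_HEAD_INIT and perf_hpp_list rather than the kernel definitions:

        #include <stdio.h>

        /* Tiny circular-list head, mirroring the kernel's struct list_head. */
        struct list_head {
                struct list_head *next, *prev;
        };

        #define LIST_HEAD_INIT(name) { &(name), &(name) }

        /* Two independent lists bundled in one object. */
        struct hpp_list {
                struct list_head fields;        /* output columns */
                struct list_head sorts;         /* sort keys */
        };

        /* Static instance initialized at compile time, like the global list. */
        static struct hpp_list hpp_list = {
                .fields = LIST_HEAD_INIT(hpp_list.fields),
                .sorts  = LIST_HEAD_INIT(hpp_list.sorts),
        };

        int main(void)
        {
                /* An empty circular list points back at itself. */
                printf("fields empty: %d\n",
                       hpp_list.fields.next == &hpp_list.fields);
                printf("sorts empty:  %d\n",
                       hpp_list.sorts.next == &hpp_list.sorts);
                return 0;
        }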
Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-15-git-send-email-jolsa@kernel.org [ Renamed fields to .{fields,sorts} as suggested by Namhyung and acked by Jiri ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 2cd1a03..74dbeac 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -436,9 +436,10 @@ struct perf_hpp_fmt perf_hpp__format[] = { HPP__PRINT_FNS("Period", period, PERIOD) }; -LIST_HEAD(perf_hpp__list); -LIST_HEAD(perf_hpp__sort_list); - +struct perf_hpp_list perf_hpp_list = { + .fields = LIST_HEAD_INIT(perf_hpp_list.fields), + .sorts = LIST_HEAD_INIT(perf_hpp_list.sorts), +}; #undef HPP__COLOR_PRINT_FNS #undef HPP__COLOR_ACC_PRINT_FNS @@ -506,7 +507,7 @@ void perf_hpp__init(void) void perf_hpp__column_register(struct perf_hpp_fmt *format) { - list_add_tail(&format->list, &perf_hpp__list); + list_add_tail(&format->list, &perf_hpp_list.fields); } void perf_hpp__column_unregister(struct perf_hpp_fmt *format) @@ -516,7 +517,7 @@ void perf_hpp__column_unregister(struct perf_hpp_fmt *format) void perf_hpp__register_sort_field(struct perf_hpp_fmt *format) { - list_add_tail(&format->sort_list, &perf_hpp__sort_list); + list_add_tail(&format->sort_list, &perf_hpp_list.sorts); } void perf_hpp__cancel_cumulate(void) diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index f3bcf2d..203397a 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -226,20 +226,24 @@ struct perf_hpp_fmt { int idx; }; -extern struct list_head perf_hpp__list; -extern struct list_head perf_hpp__sort_list; +struct perf_hpp_list { + struct list_head fields; + struct list_head sorts; +}; + +extern struct perf_hpp_list perf_hpp_list; #define perf_hpp__for_each_format(format) \ - list_for_each_entry(format, &perf_hpp__list, list) + list_for_each_entry(format, &perf_hpp_list.fields, list) #define perf_hpp__for_each_format_safe(format, tmp) \ - list_for_each_entry_safe(format, tmp, &perf_hpp__list, list) + list_for_each_entry_safe(format, tmp, &perf_hpp_list.fields, list) #define perf_hpp__for_each_sort_list(format) \ - list_for_each_entry(format, &perf_hpp__sort_list, sort_list) + list_for_each_entry(format, &perf_hpp_list.sorts, sort_list) #define perf_hpp__for_each_sort_list_safe(format, tmp) \ - list_for_each_entry_safe(format, tmp, &perf_hpp__sort_list, sort_list) + list_for_each_entry_safe(format, tmp, &perf_hpp_list.sorts, sort_list) extern struct perf_hpp_fmt perf_hpp__format[]; -- cgit v0.10.2 From 94b3dc3865097e11073f1abf5b20b5f80af223af Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:13 +0100 Subject: perf hists: Introduce perf_hpp_list__init function Introducing perf_hpp_list__init function to have an easy way to initialize perf_hpp_list struct. 
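As a sketch, this is the runtime counterpart of a static list initializer, with hpp_list_init() standing in for perf_hpp_list__init() and simplified types replacing the perf ones; it is what makes instances embedded in other objects (or allocated on demand) practical:

        #include <stdlib.h>

        struct list_head { struct list_head *next, *prev; };

        static void INIT_LIST_HEAD(struct list_head *head)
        {
                head->next = head;
                head->prev = head;
        }

        struct hpp_list {
                struct list_head fields;
                struct list_head sorts;
        };

        /* Runtime initialization, usable when static init is not an option. */
        static void hpp_list_init(struct hpp_list *list)
        {
                INIT_LIST_HEAD(&list->fields);
                INIT_LIST_HEAD(&list->sorts);
        }

        int main(void)
        {
                struct hpp_list *list = malloc(sizeof(*list));

                if (!list)
                        return 1;
                hpp_list_init(list);
                free(list);
                return 0;
        }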
Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-16-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index d07955c..b762ecc 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1642,3 +1642,9 @@ int hists__init(void) return err; } + +void perf_hpp_list__init(struct perf_hpp_list *list) +{ + INIT_LIST_HEAD(&list->fields); + INIT_LIST_HEAD(&list->sorts); +} diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 203397a..e22f98e 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -386,4 +386,6 @@ int parse_filter_percentage(const struct option *opt __maybe_unused, const char *arg, int unset __maybe_unused); int perf_hist_config(const char *var, const char *value); +void perf_hpp_list__init(struct perf_hpp_list *list); + #endif /* __PERF_HIST_H */ -- cgit v0.10.2 From ebdd98e030f5ed6dd1bae9ab01b084f97685bd60 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:14 +0100 Subject: perf hists: Add perf_hpp_list register helpers Adding 2 perf_hpp_list register helpers: perf_hpp_list__column_register() perf_hpp_list__register_sort_field() to be called within existing helpers: perf_hpp__column_register() perf_hpp__register_sort_field() to register format entries within global perf_hpp_list object. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-17-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 74dbeac..1655c0d 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -505,19 +505,21 @@ void perf_hpp__init(void) hpp_dimension__add_output(PERF_HPP__PERIOD); } -void perf_hpp__column_register(struct perf_hpp_fmt *format) +void perf_hpp_list__column_register(struct perf_hpp_list *list, + struct perf_hpp_fmt *format) { - list_add_tail(&format->list, &perf_hpp_list.fields); + list_add_tail(&format->list, &list->fields); } -void perf_hpp__column_unregister(struct perf_hpp_fmt *format) +void perf_hpp_list__register_sort_field(struct perf_hpp_list *list, + struct perf_hpp_fmt *format) { - list_del(&format->list); + list_add_tail(&format->sort_list, &list->sorts); } -void perf_hpp__register_sort_field(struct perf_hpp_fmt *format) +void perf_hpp__column_unregister(struct perf_hpp_fmt *format) { - list_add_tail(&format->sort_list, &perf_hpp_list.sorts); + list_del(&format->list); } void perf_hpp__cancel_cumulate(void) diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index e22f98e..a7769d7 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -233,6 +233,21 @@ struct perf_hpp_list { extern struct perf_hpp_list perf_hpp_list; +void perf_hpp_list__column_register(struct perf_hpp_list *list, + struct perf_hpp_fmt *format); +void perf_hpp_list__register_sort_field(struct perf_hpp_list *list, + struct perf_hpp_fmt *format); + +static inline void perf_hpp__column_register(struct perf_hpp_fmt *format) +{ + perf_hpp_list__column_register(&perf_hpp_list, format); +} + +static inline void perf_hpp__register_sort_field(struct perf_hpp_fmt *format) +{ + perf_hpp_list__register_sort_field(&perf_hpp_list, format); +} + #define perf_hpp__for_each_format(format) \ list_for_each_entry(format, &perf_hpp_list.fields, list) @@ -262,11 +277,8 @@ enum { }; void perf_hpp__init(void); -void perf_hpp__column_register(struct 
perf_hpp_fmt *format); void perf_hpp__column_unregister(struct perf_hpp_fmt *format); void perf_hpp__cancel_cumulate(void); - -void perf_hpp__register_sort_field(struct perf_hpp_fmt *format); void perf_hpp__setup_output_field(void); void perf_hpp__reset_output_field(void); void perf_hpp__append_sort_keys(void); -- cgit v0.10.2 From 07600027fb7114bf7bcabdd121e5178f200d8a44 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:16 +0100 Subject: perf hists: Pass perf_hpp_list all the way through setup_output_list Passing perf_hpp_list all the way through setup_output_list so the output entry could be added on the arbitrary list. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-19-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 36dbd55..f643bed 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1616,14 +1616,15 @@ static int __sort_dimension__add_hpp_sort(struct sort_dimension *sd) return 0; } -static int __sort_dimension__add_hpp_output(struct sort_dimension *sd) +static int __sort_dimension__add_hpp_output(struct perf_hpp_list *list, + struct sort_dimension *sd) { struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd); if (hse == NULL) return -1; - perf_hpp__column_register(&hse->hpp); + perf_hpp_list__column_register(list, &hse->hpp); return 0; } @@ -2117,19 +2118,21 @@ static int __hpp_dimension__add(struct hpp_dimension *hd) return 0; } -static int __sort_dimension__add_output(struct sort_dimension *sd) +static int __sort_dimension__add_output(struct perf_hpp_list *list, + struct sort_dimension *sd) { if (sd->taken) return 0; - if (__sort_dimension__add_hpp_output(sd) < 0) + if (__sort_dimension__add_hpp_output(list, sd) < 0) return -1; sd->taken = 1; return 0; } -static int __hpp_dimension__add_output(struct hpp_dimension *hd) +static int __hpp_dimension__add_output(struct perf_hpp_list *list, + struct hpp_dimension *hd) { struct perf_hpp_fmt *fmt; @@ -2141,14 +2144,14 @@ static int __hpp_dimension__add_output(struct hpp_dimension *hd) return -1; hd->taken = 1; - perf_hpp__column_register(fmt); + perf_hpp_list__column_register(list, fmt); return 0; } int hpp_dimension__add_output(unsigned col) { BUG_ON(col >= PERF_HPP__MAX_INDEX); - return __hpp_dimension__add_output(&hpp_sort_dimensions[col]); + return __hpp_dimension__add_output(&perf_hpp_list, &hpp_sort_dimensions[col]); } static int sort_dimension__add(const char *tok, @@ -2492,7 +2495,7 @@ void sort__setup_elide(FILE *output) } } -static int output_field_add(char *tok) +static int output_field_add(struct perf_hpp_list *list, char *tok) { unsigned int i; @@ -2502,7 +2505,7 @@ static int output_field_add(char *tok) if (strncasecmp(tok, sd->name, strlen(tok))) continue; - return __sort_dimension__add_output(sd); + return __sort_dimension__add_output(list, sd); } for (i = 0; i < ARRAY_SIZE(hpp_sort_dimensions); i++) { @@ -2511,7 +2514,7 @@ static int output_field_add(char *tok) if (strncasecmp(tok, hd->name, strlen(tok))) continue; - return __hpp_dimension__add_output(hd); + return __hpp_dimension__add_output(list, hd); } for (i = 0; i < ARRAY_SIZE(bstack_sort_dimensions); i++) { @@ -2520,7 +2523,7 @@ static int output_field_add(char *tok) if (strncasecmp(tok, sd->name, strlen(tok))) continue; - return __sort_dimension__add_output(sd); + return __sort_dimension__add_output(list, sd); } for (i = 0; i < 
ARRAY_SIZE(memory_sort_dimensions); i++) { @@ -2529,20 +2532,20 @@ static int output_field_add(char *tok) if (strncasecmp(tok, sd->name, strlen(tok))) continue; - return __sort_dimension__add_output(sd); + return __sort_dimension__add_output(list, sd); } return -ESRCH; } -static int setup_output_list(char *str) +static int setup_output_list(struct perf_hpp_list *list, char *str) { char *tmp, *tok; int ret = 0; for (tok = strtok_r(str, ", ", &tmp); tok; tok = strtok_r(NULL, ", ", &tmp)) { - ret = output_field_add(tok); + ret = output_field_add(list, tok); if (ret == -EINVAL) { error("Invalid --fields key: `%s'", tok); break; @@ -2599,7 +2602,7 @@ static int __setup_output_field(void) goto out; } - ret = setup_output_list(strp); + ret = setup_output_list(&perf_hpp_list, strp); out: free(str); -- cgit v0.10.2 From cf094045d718437e3d5cd42ac09d77561cb2f368 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:17 +0100 Subject: perf hists: Introduce perf_hpp_list__for_each_format macro Introducing perf_hpp_list__for_each_format macro to iterate perf_hpp_list object's output entries. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-20-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 61d578b..df0aedf 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1095,7 +1095,7 @@ static int hist_browser__show_entry(struct hist_browser *browser, hist_browser__gotorc(browser, row, 0); - perf_hpp__for_each_format(fmt) { + perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { if (perf_hpp__should_skip(fmt, entry->hists) || column++ < browser->b.horiz_scroll) continue; @@ -1175,7 +1175,7 @@ static int hists_browser__scnprintf_headers(struct hist_browser *browser, char * return ret; } - perf_hpp__for_each_format(fmt) { + perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { if (perf_hpp__should_skip(fmt, hists) || column++ < browser->b.horiz_scroll) continue; @@ -1441,7 +1441,7 @@ static int hist_browser__fprintf_entry(struct hist_browser *browser, if (symbol_conf.use_callchain) printed += fprintf(fp, "%c ", folded_sign); - perf_hpp__for_each_format(fmt) { + perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { if (perf_hpp__should_skip(fmt, he->hists)) continue; @@ -2104,7 +2104,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, memset(options, 0, sizeof(options)); memset(actions, 0, sizeof(actions)); - perf_hpp__for_each_format(fmt) { + perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { perf_hpp__reset_width(fmt, hists); /* * This is done just once, and activates the horizontal scrolling diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c index 0f8dcfd..eca5151 100644 --- a/tools/perf/ui/gtk/hists.c +++ b/tools/perf/ui/gtk/hists.c @@ -306,7 +306,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists, nr_cols = 0; - perf_hpp__for_each_format(fmt) + perf_hpp_list__for_each_format(&perf_hpp_list, fmt) col_types[nr_cols++] = G_TYPE_STRING; store = gtk_tree_store_newv(nr_cols, col_types); @@ -317,7 +317,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists, col_idx = 0; - perf_hpp__for_each_format(fmt) { + perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { if (perf_hpp__should_skip(fmt, hists)) continue; @@ -367,7 +367,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists, 
col_idx = 0; - perf_hpp__for_each_format(fmt) { + perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { if (perf_hpp__should_skip(fmt, h->hists)) continue; diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 1655c0d..7b5e8ce 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -556,7 +556,7 @@ void perf_hpp__setup_output_field(void) perf_hpp__for_each_sort_list(fmt) { struct perf_hpp_fmt *pos; - perf_hpp__for_each_format(pos) { + perf_hpp_list__for_each_format(&perf_hpp_list, pos) { if (fmt_equal(fmt, pos)) goto next; } @@ -572,7 +572,7 @@ void perf_hpp__append_sort_keys(void) struct perf_hpp_fmt *fmt; /* append output fields to sort keys */ - perf_hpp__for_each_format(fmt) { + perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { struct perf_hpp_fmt *pos; perf_hpp__for_each_sort_list(pos) { @@ -621,7 +621,7 @@ unsigned int hists__sort_list_width(struct hists *hists) bool first = true; struct perf_hpp dummy_hpp; - perf_hpp__for_each_format(fmt) { + perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { if (perf_hpp__should_skip(fmt, hists)) continue; @@ -674,7 +674,7 @@ void perf_hpp__set_user_width(const char *width_list_str) struct perf_hpp_fmt *fmt; const char *ptr = width_list_str; - perf_hpp__for_each_format(fmt) { + perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { char *p; int len = strtol(ptr, &p, 10); diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 691e52c..83e0bf2 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -384,7 +384,7 @@ static int hist_entry__snprintf(struct hist_entry *he, struct perf_hpp *hpp) if (symbol_conf.exclude_other && !he->parent) return 0; - perf_hpp__for_each_format(fmt) { + perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { if (perf_hpp__should_skip(fmt, he->hists)) continue; @@ -453,7 +453,7 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, init_rem_hits(); - perf_hpp__for_each_format(fmt) + perf_hpp_list__for_each_format(&perf_hpp_list, fmt) perf_hpp__reset_width(fmt, hists); if (symbol_conf.col_width_list_str) @@ -464,7 +464,7 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, fprintf(fp, "# "); - perf_hpp__for_each_format(fmt) { + perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { if (perf_hpp__should_skip(fmt, hists)) continue; @@ -488,7 +488,7 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, fprintf(fp, "# "); - perf_hpp__for_each_format(fmt) { + perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { unsigned int i; if (perf_hpp__should_skip(fmt, hists)) diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index a7769d7..eadffca 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -248,8 +248,8 @@ static inline void perf_hpp__register_sort_field(struct perf_hpp_fmt *format) perf_hpp_list__register_sort_field(&perf_hpp_list, format); } -#define perf_hpp__for_each_format(format) \ - list_for_each_entry(format, &perf_hpp_list.fields, list) +#define perf_hpp_list__for_each_format(_list, format) \ + list_for_each_entry(format, &(_list)->fields, list) #define perf_hpp__for_each_format_safe(format, tmp) \ list_for_each_entry_safe(format, tmp, &perf_hpp_list.fields, list) diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index f643bed..1e134ff 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2407,7 +2407,7 @@ void perf_hpp__set_elide(int idx, bool elide) struct perf_hpp_fmt *fmt; struct hpp_sort_entry *hse; - perf_hpp__for_each_format(fmt) { 
+ perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { if (!perf_hpp__is_sort_entry(fmt)) continue; @@ -2467,7 +2467,7 @@ void sort__setup_elide(FILE *output) struct perf_hpp_fmt *fmt; struct hpp_sort_entry *hse; - perf_hpp__for_each_format(fmt) { + perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { if (!perf_hpp__is_sort_entry(fmt)) continue; @@ -2479,7 +2479,7 @@ void sort__setup_elide(FILE *output) * It makes no sense to elide all of sort entries. * Just revert them to show up again. */ - perf_hpp__for_each_format(fmt) { + perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { if (!perf_hpp__is_sort_entry(fmt)) continue; @@ -2487,7 +2487,7 @@ void sort__setup_elide(FILE *output) return; } - perf_hpp__for_each_format(fmt) { + perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { if (!perf_hpp__is_sort_entry(fmt)) continue; -- cgit v0.10.2 From 7a1799e0a276069d8b903ba17179b4983b98c04b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:18 +0100 Subject: perf hists: Introduce perf_hpp_list__for_each_format_safe macro Introducing perf_hpp_list__for_each_format_safe macro to iterate perf_hpp_list object's output entries safely. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-21-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 7b5e8ce..348706a 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -532,7 +532,7 @@ void perf_hpp__cancel_cumulate(void) ovh = &perf_hpp__format[PERF_HPP__OVERHEAD]; acc = &perf_hpp__format[PERF_HPP__OVERHEAD_ACC]; - perf_hpp__for_each_format_safe(fmt, tmp) { + perf_hpp_list__for_each_format_safe(&perf_hpp_list, fmt, tmp) { if (acc->equal(acc, fmt)) { perf_hpp__column_unregister(fmt); continue; @@ -597,7 +597,7 @@ void perf_hpp__reset_output_field(void) struct perf_hpp_fmt *fmt, *tmp; /* reset output fields */ - perf_hpp__for_each_format_safe(fmt, tmp) { + perf_hpp_list__for_each_format_safe(&perf_hpp_list, fmt, tmp) { list_del_init(&fmt->list); list_del_init(&fmt->sort_list); fmt_free(fmt); diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index eadffca..f5b2309 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -251,8 +251,8 @@ static inline void perf_hpp__register_sort_field(struct perf_hpp_fmt *format) #define perf_hpp_list__for_each_format(_list, format) \ list_for_each_entry(format, &(_list)->fields, list) -#define perf_hpp__for_each_format_safe(format, tmp) \ - list_for_each_entry_safe(format, tmp, &perf_hpp_list.fields, list) +#define perf_hpp_list__for_each_format_safe(_list, format, tmp) \ + list_for_each_entry_safe(format, tmp, &(_list)->fields, list) #define perf_hpp__for_each_sort_list(format) \ list_for_each_entry(format, &perf_hpp_list.sorts, sort_list) -- cgit v0.10.2 From d29a497090845002ee449c8dc682dd59ad8bab42 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:19 +0100 Subject: perf hists: Introduce perf_hpp_list__for_each_sort_list macro Introducing perf_hpp_list__for_each_sort_list macro to iterate perf_hpp_list object's sort entries. 
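The macro added below walks the sort entries of whichever list it is handed. A self-contained sketch of that list-parameterized iteration idiom, using a toy list implementation and stand-in names (struct fmt, for_each_fmt) instead of the kernel/perf ones:

        #include <stdio.h>
        #include <stddef.h>

        #define container_of(ptr, type, member) \
                ((type *)((char *)(ptr) - offsetof(type, member)))

        /* Toy circular list, standing in for the kernel's struct list_head. */
        struct list_head { struct list_head *next, *prev; };
        #define LIST_HEAD_INIT(name) { &(name), &(name) }

        static void list_add_tail(struct list_head *new, struct list_head *head)
        {
                new->prev = head->prev;
                new->next = head;
                head->prev->next = new;
                head->prev = new;
        }

        struct fmt {
                const char *name;
                struct list_head node;  /* linkage embedded in the entry */
        };

        /* Walk every entry linked into a given list, in the spirit of
         * perf_hpp_list__for_each_sort_list(_list, format). */
        #define for_each_fmt(list, pos) \
                for (pos = container_of((list)->next, struct fmt, node); \
                     &pos->node != (list); \
                     pos = container_of(pos->node.next, struct fmt, node))

        static struct list_head sorts = LIST_HEAD_INIT(sorts);

        int main(void)
        {
                struct fmt a = { .name = "overhead" }, b = { .name = "symbol" };
                struct fmt *pos;

                list_add_tail(&a.node, &sorts);
                list_add_tail(&b.node, &sorts);

                for_each_fmt(&sorts, pos)
                        printf("%s\n", pos->name);

                return 0;
        }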
Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-22-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 348706a..f09eabe 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -553,7 +553,7 @@ void perf_hpp__setup_output_field(void) struct perf_hpp_fmt *fmt; /* append sort keys to output field */ - perf_hpp__for_each_sort_list(fmt) { + perf_hpp_list__for_each_sort_list(&perf_hpp_list, fmt) { struct perf_hpp_fmt *pos; perf_hpp_list__for_each_format(&perf_hpp_list, pos) { @@ -575,7 +575,7 @@ void perf_hpp__append_sort_keys(void) perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { struct perf_hpp_fmt *pos; - perf_hpp__for_each_sort_list(pos) { + perf_hpp_list__for_each_sort_list(&perf_hpp_list, pos) { if (fmt_equal(fmt, pos)) goto next; } diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index b762ecc..dea475d 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -961,7 +961,7 @@ hist_entry__cmp(struct hist_entry *left, struct hist_entry *right) struct perf_hpp_fmt *fmt; int64_t cmp = 0; - perf_hpp__for_each_sort_list(fmt) { + perf_hpp_list__for_each_sort_list(&perf_hpp_list, fmt) { cmp = fmt->cmp(fmt, left, right); if (cmp) break; @@ -976,7 +976,7 @@ hist_entry__collapse(struct hist_entry *left, struct hist_entry *right) struct perf_hpp_fmt *fmt; int64_t cmp = 0; - perf_hpp__for_each_sort_list(fmt) { + perf_hpp_list__for_each_sort_list(&perf_hpp_list, fmt) { cmp = fmt->collapse(fmt, left, right); if (cmp) break; @@ -1120,7 +1120,7 @@ static int hist_entry__sort(struct hist_entry *a, struct hist_entry *b) struct perf_hpp_fmt *fmt; int64_t cmp = 0; - perf_hpp__for_each_sort_list(fmt) { + perf_hpp_list__for_each_sort_list(&perf_hpp_list, fmt) { if (perf_hpp__should_skip(fmt, a->hists)) continue; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index f5b2309..c9b2ea4 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -254,8 +254,8 @@ static inline void perf_hpp__register_sort_field(struct perf_hpp_fmt *format) #define perf_hpp_list__for_each_format_safe(_list, format, tmp) \ list_for_each_entry_safe(format, tmp, &(_list)->fields, list) -#define perf_hpp__for_each_sort_list(format) \ - list_for_each_entry(format, &perf_hpp_list.sorts, sort_list) +#define perf_hpp_list__for_each_sort_list(_list, format) \ + list_for_each_entry(format, &(_list)->sorts, sort_list) #define perf_hpp__for_each_sort_list_safe(format, tmp) \ list_for_each_entry_safe(format, tmp, &perf_hpp_list.sorts, sort_list) -- cgit v0.10.2 From 1a8ebd243a0b65c2a6d1458705d04dece937ab52 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:20 +0100 Subject: perf hists: Introduce perf_hpp_list__for_each_sort_list_safe macro Introducing perf_hpp_list__for_each_sort_list_safe macro to iterate perf_hpp_list object's sort entries safely. 
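The _safe flavor caches the next node before the loop body runs, so the current entry can be unlinked and freed during the walk. A standalone sketch of that idiom with toy list helpers and stand-in names (not the kernel implementation):

        #include <stdlib.h>
        #include <stddef.h>

        #define container_of(ptr, type, member) \
                ((type *)((char *)(ptr) - offsetof(type, member)))

        struct list_head { struct list_head *next, *prev; };
        #define LIST_HEAD_INIT(name) { &(name), &(name) }

        static void list_add_tail(struct list_head *new, struct list_head *head)
        {
                new->prev = head->prev;
                new->next = head;
                head->prev->next = new;
                head->prev = new;
        }

        static void list_del(struct list_head *entry)
        {
                entry->prev->next = entry->next;
                entry->next->prev = entry->prev;
        }

        struct fmt { const char *name; struct list_head node; };

        /* "_safe" walk: 'tmp' holds the next node before the body runs, so
         * the current entry may be removed and freed without breaking the
         * traversal. */
        #define for_each_fmt_safe(list, pos, tmp) \
                for (pos = container_of((list)->next, struct fmt, node), \
                     tmp = container_of(pos->node.next, struct fmt, node); \
                     &pos->node != (list); \
                     pos = tmp, tmp = container_of(tmp->node.next, struct fmt, node))

        static struct list_head sorts = LIST_HEAD_INIT(sorts);

        int main(void)
        {
                struct fmt *pos, *tmp;
                int i;

                for (i = 0; i < 3; i++) {
                        struct fmt *f = malloc(sizeof(*f));

                        if (!f)
                                return 1;
                        f->name = "key";
                        list_add_tail(&f->node, &sorts);
                }

                for_each_fmt_safe(&sorts, pos, tmp) {
                        list_del(&pos->node);   /* safe: 'tmp' already saved */
                        free(pos);
                }
                return 0;
        }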
Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-23-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index f09eabe..9cda51e 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -604,7 +604,7 @@ void perf_hpp__reset_output_field(void) } /* reset sort keys */ - perf_hpp__for_each_sort_list_safe(fmt, tmp) { + perf_hpp_list__for_each_sort_list_safe(&perf_hpp_list, fmt, tmp) { list_del_init(&fmt->list); list_del_init(&fmt->sort_list); fmt_free(fmt); diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index c9b2ea4..61d35a9 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -257,8 +257,8 @@ static inline void perf_hpp__register_sort_field(struct perf_hpp_fmt *format) #define perf_hpp_list__for_each_sort_list(_list, format) \ list_for_each_entry(format, &(_list)->sorts, sort_list) -#define perf_hpp__for_each_sort_list_safe(format, tmp) \ - list_for_each_entry_safe(format, tmp, &perf_hpp_list.sorts, sort_list) +#define perf_hpp_list__for_each_sort_list_safe(_list, format, tmp) \ + list_for_each_entry_safe(format, tmp, &(_list)->sorts, sort_list) extern struct perf_hpp_fmt perf_hpp__format[]; -- cgit v0.10.2 From 43e0a68f13047750a3728c983a539c61fb4121c5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:21 +0100 Subject: perf hists: Add struct perf_hpp_list argument to helper functions Adding struct perf_hpp_list argument to following helper functions: void perf_hpp__setup_output_field(struct perf_hpp_list *list); void perf_hpp__reset_output_field(struct perf_hpp_list *list); void perf_hpp__append_sort_keys(struct perf_hpp_list *list); so they could be used on hists's hpp_list. 
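The refactoring pattern is simply making the operated-on list an explicit parameter instead of an implicit global, so the same helper can serve both the global list and a per-hists one. A trivial standalone sketch with illustrative names only:

        #include <stdio.h>

        struct hpp_list { int nr_fields; };     /* simplified stand-in */

        static struct hpp_list global_list;     /* the old implicit target */

        /* New form: the list to operate on is an explicit argument. */
        static void reset_output_field(struct hpp_list *list)
        {
                list->nr_fields = 0;
                printf("reset list %p\n", (void *)list);
        }

        int main(void)
        {
                struct hpp_list per_hists_list = { .nr_fields = 5 };

                /* Same helper, global instance or per-object instance. */
                reset_output_field(&global_list);
                reset_output_field(&per_hists_list);
                return 0;
        }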
Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-24-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 9cda51e..8075d4c 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -548,15 +548,15 @@ static bool fmt_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b) return a->equal && a->equal(a, b); } -void perf_hpp__setup_output_field(void) +void perf_hpp__setup_output_field(struct perf_hpp_list *list) { struct perf_hpp_fmt *fmt; /* append sort keys to output field */ - perf_hpp_list__for_each_sort_list(&perf_hpp_list, fmt) { + perf_hpp_list__for_each_sort_list(list, fmt) { struct perf_hpp_fmt *pos; - perf_hpp_list__for_each_format(&perf_hpp_list, pos) { + perf_hpp_list__for_each_format(list, pos) { if (fmt_equal(fmt, pos)) goto next; } @@ -567,15 +567,15 @@ next: } } -void perf_hpp__append_sort_keys(void) +void perf_hpp__append_sort_keys(struct perf_hpp_list *list) { struct perf_hpp_fmt *fmt; /* append output fields to sort keys */ - perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { + perf_hpp_list__for_each_format(list, fmt) { struct perf_hpp_fmt *pos; - perf_hpp_list__for_each_sort_list(&perf_hpp_list, pos) { + perf_hpp_list__for_each_sort_list(list, pos) { if (fmt_equal(fmt, pos)) goto next; } @@ -586,25 +586,26 @@ next: } } + static void fmt_free(struct perf_hpp_fmt *fmt) { if (fmt->free) fmt->free(fmt); } -void perf_hpp__reset_output_field(void) +void perf_hpp__reset_output_field(struct perf_hpp_list *list) { struct perf_hpp_fmt *fmt, *tmp; /* reset output fields */ - perf_hpp_list__for_each_format_safe(&perf_hpp_list, fmt, tmp) { + perf_hpp_list__for_each_format_safe(list, fmt, tmp) { list_del_init(&fmt->list); list_del_init(&fmt->sort_list); fmt_free(fmt); } /* reset sort keys */ - perf_hpp_list__for_each_sort_list_safe(&perf_hpp_list, fmt, tmp) { + perf_hpp_list__for_each_sort_list_safe(list, fmt, tmp) { list_del_init(&fmt->list); list_del_init(&fmt->sort_list); fmt_free(fmt); diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 61d35a9..a39c9c1 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -279,9 +279,10 @@ enum { void perf_hpp__init(void); void perf_hpp__column_unregister(struct perf_hpp_fmt *format); void perf_hpp__cancel_cumulate(void); -void perf_hpp__setup_output_field(void); -void perf_hpp__reset_output_field(void); -void perf_hpp__append_sort_keys(void); +void perf_hpp__setup_output_field(struct perf_hpp_list *list); +void perf_hpp__reset_output_field(struct perf_hpp_list *list); +void perf_hpp__append_sort_keys(struct perf_hpp_list *list); + bool perf_hpp__is_sort_entry(struct perf_hpp_fmt *format); bool perf_hpp__is_dynamic_entry(struct perf_hpp_fmt *format); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 1e134ff..de620f7 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2636,9 +2636,9 @@ int setup_sorting(struct perf_evlist *evlist) return err; /* copy sort keys to output fields */ - perf_hpp__setup_output_field(); + perf_hpp__setup_output_field(&perf_hpp_list); /* and then copy output fields to sort keys */ - perf_hpp__append_sort_keys(); + perf_hpp__append_sort_keys(&perf_hpp_list); return 0; } @@ -2654,5 +2654,5 @@ void reset_output_field(void) sort_order = NULL; reset_dimensions(); - perf_hpp__reset_output_field(); + perf_hpp__reset_output_field(&perf_hpp_list); } -- cgit v0.10.2 From 5b65855e20348a9e2772a1cb7c1e6ab477859ba6 
Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:22 +0100 Subject: perf tools: Add hpp_list into struct hists object Adding hpp_list into struct hists object. Initializing struct hists_evsel hists object to carry global perf_hpp_list list. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-25-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index dea475d..2b9cc91 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1578,7 +1578,7 @@ int perf_hist_config(const char *var, const char *value) return 0; } -int __hists__init(struct hists *hists) +int __hists__init(struct hists *hists, struct perf_hpp_list *hpp_list) { memset(hists, 0, sizeof(*hists)); hists->entries_in_array[0] = hists->entries_in_array[1] = RB_ROOT; @@ -1587,6 +1587,7 @@ int __hists__init(struct hists *hists) hists->entries = RB_ROOT; pthread_mutex_init(&hists->lock, NULL); hists->socket_filter = -1; + hists->hpp_list = hpp_list; return 0; } @@ -1623,7 +1624,7 @@ static int hists_evsel__init(struct perf_evsel *evsel) { struct hists *hists = evsel__hists(evsel); - __hists__init(hists); + __hists__init(hists, &perf_hpp_list); return 0; } diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index a39c9c1..b296ff5 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -75,6 +75,7 @@ struct hists { u64 event_stream; u16 col_len[HISTC_NR_COLS]; int socket_filter; + struct perf_hpp_list *hpp_list; }; struct hist_entry_iter; @@ -186,7 +187,7 @@ static inline struct hists *evsel__hists(struct perf_evsel *evsel) } int hists__init(void); -int __hists__init(struct hists *hists); +int __hists__init(struct hists *hists, struct perf_hpp_list *hpp_list); struct rb_root *hists__get_rotate_entries_in(struct hists *hists); bool hists__collapse_insert_entry(struct hists *hists __maybe_unused, -- cgit v0.10.2 From f0786af536bb0ba54cb516eee493af03cefdbaa3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:23 +0100 Subject: perf hists: Introduce hists__for_each_format macro With the hist object having the perf_hpp_list we can now iterate output format entries based in the hists object. Adding hists__for_each_format macro to do that. 
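A self-contained sketch of wrapping a list iterator so it picks the list up from the hists object, in the spirit of hists__for_each_format(); the types here are simplified stand-ins (an array-backed "list" instead of the real linked lists):

        #include <stdio.h>

        struct hpp_list {
                const char **fields;
                int nr_fields;
        };

        struct hists {
                struct hpp_list *hpp_list;      /* which list this hists uses */
        };

        #define hpp_list__for_each_format(list, i) \
                for (i = 0; i < (list)->nr_fields; i++)

        /* Wrapper that resolves the list through the hists object. */
        #define hists__for_each_format(h, i) \
                hpp_list__for_each_format((h)->hpp_list, i)

        int main(void)
        {
                const char *cols[] = { "Overhead", "Command", "Symbol" };
                struct hpp_list list = { .fields = cols, .nr_fields = 3 };
                struct hists hists = { .hpp_list = &list };
                int i;

                hists__for_each_format(&hists, i)
                        printf("%s\n", hists.hpp_list->fields[i]);

                return 0;
        }

Callers no longer need to know whether a hists instance uses the global format list or its own, which is the point of carrying the pointer in struct hists.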
Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-26-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index df0aedf..3a1e096 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1095,7 +1095,7 @@ static int hist_browser__show_entry(struct hist_browser *browser, hist_browser__gotorc(browser, row, 0); - perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { + hists__for_each_format(browser->hists, fmt) { if (perf_hpp__should_skip(fmt, entry->hists) || column++ < browser->b.horiz_scroll) continue; @@ -1175,7 +1175,7 @@ static int hists_browser__scnprintf_headers(struct hist_browser *browser, char * return ret; } - perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { + hists__for_each_format(browser->hists, fmt) { if (perf_hpp__should_skip(fmt, hists) || column++ < browser->b.horiz_scroll) continue; @@ -1441,7 +1441,7 @@ static int hist_browser__fprintf_entry(struct hist_browser *browser, if (symbol_conf.use_callchain) printed += fprintf(fp, "%c ", folded_sign); - perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { + hists__for_each_format(browser->hists, fmt) { if (perf_hpp__should_skip(fmt, he->hists)) continue; @@ -2104,7 +2104,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, memset(options, 0, sizeof(options)); memset(actions, 0, sizeof(actions)); - perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { + hists__for_each_format(browser->hists, fmt) { perf_hpp__reset_width(fmt, hists); /* * This is done just once, and activates the horizontal scrolling diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c index eca5151..32cc38a 100644 --- a/tools/perf/ui/gtk/hists.c +++ b/tools/perf/ui/gtk/hists.c @@ -306,7 +306,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists, nr_cols = 0; - perf_hpp_list__for_each_format(&perf_hpp_list, fmt) + hists__for_each_format(hists, fmt) col_types[nr_cols++] = G_TYPE_STRING; store = gtk_tree_store_newv(nr_cols, col_types); @@ -317,7 +317,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists, col_idx = 0; - perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { + hists__for_each_format(hists, fmt) { if (perf_hpp__should_skip(fmt, hists)) continue; @@ -367,7 +367,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists, col_idx = 0; - perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { + hists__for_each_format(hists, fmt) { if (perf_hpp__should_skip(fmt, h->hists)) continue; diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 8075d4c..1ba4117 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -622,7 +622,7 @@ unsigned int hists__sort_list_width(struct hists *hists) bool first = true; struct perf_hpp dummy_hpp; - perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { + hists__for_each_format(hists, fmt) { if (perf_hpp__should_skip(fmt, hists)) continue; diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 83e0bf2..1a6e8f7 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -384,7 +384,7 @@ static int hist_entry__snprintf(struct hist_entry *he, struct perf_hpp *hpp) if (symbol_conf.exclude_other && !he->parent) return 0; - perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { + hists__for_each_format(he->hists, fmt) { if (perf_hpp__should_skip(fmt, he->hists)) continue; @@ -453,7 
+453,7 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, init_rem_hits(); - perf_hpp_list__for_each_format(&perf_hpp_list, fmt) + hists__for_each_format(hists, fmt) perf_hpp__reset_width(fmt, hists); if (symbol_conf.col_width_list_str) @@ -464,7 +464,7 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, fprintf(fp, "# "); - perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { + hists__for_each_format(hists, fmt) { if (perf_hpp__should_skip(fmt, hists)) continue; @@ -488,7 +488,7 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, fprintf(fp, "# "); - perf_hpp_list__for_each_format(&perf_hpp_list, fmt) { + hists__for_each_format(hists, fmt) { unsigned int i; if (perf_hpp__should_skip(fmt, hists)) diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index b296ff5..bc90044 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -261,6 +261,9 @@ static inline void perf_hpp__register_sort_field(struct perf_hpp_fmt *format) #define perf_hpp_list__for_each_sort_list_safe(_list, format, tmp) \ list_for_each_entry_safe(format, tmp, &(_list)->sorts, sort_list) +#define hists__for_each_format(hists, format) \ + perf_hpp_list__for_each_format((hists)->hpp_list, fmt) + extern struct perf_hpp_fmt perf_hpp__format[]; enum { -- cgit v0.10.2 From aa6f50af822a552b579252ecd42224e09e11e879 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 18 Jan 2016 10:24:24 +0100 Subject: perf hists: Introduce hists__for_each_sort_list macro With the hist object having the perf_hpp_list we can now iterate sort format entries based in the hists object. Adding hists__for_each_sort_list macro to do that. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1453109064-1026-27-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 2b9cc91..12f2d79 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -958,10 +958,11 @@ out: int64_t hist_entry__cmp(struct hist_entry *left, struct hist_entry *right) { + struct hists *hists = left->hists; struct perf_hpp_fmt *fmt; int64_t cmp = 0; - perf_hpp_list__for_each_sort_list(&perf_hpp_list, fmt) { + hists__for_each_sort_list(hists, fmt) { cmp = fmt->cmp(fmt, left, right); if (cmp) break; @@ -973,10 +974,11 @@ hist_entry__cmp(struct hist_entry *left, struct hist_entry *right) int64_t hist_entry__collapse(struct hist_entry *left, struct hist_entry *right) { + struct hists *hists = left->hists; struct perf_hpp_fmt *fmt; int64_t cmp = 0; - perf_hpp_list__for_each_sort_list(&perf_hpp_list, fmt) { + hists__for_each_sort_list(hists, fmt) { cmp = fmt->collapse(fmt, left, right); if (cmp) break; @@ -1117,10 +1119,11 @@ void hists__collapse_resort(struct hists *hists, struct ui_progress *prog) static int hist_entry__sort(struct hist_entry *a, struct hist_entry *b) { + struct hists *hists = a->hists; struct perf_hpp_fmt *fmt; int64_t cmp = 0; - perf_hpp_list__for_each_sort_list(&perf_hpp_list, fmt) { + hists__for_each_sort_list(hists, fmt) { if (perf_hpp__should_skip(fmt, a->hists)) continue; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index bc90044..1c7544a 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -264,6 +264,9 @@ static inline void perf_hpp__register_sort_field(struct perf_hpp_fmt *format) #define hists__for_each_format(hists, format) \ perf_hpp_list__for_each_format((hists)->hpp_list, fmt) +#define 
hists__for_each_sort_list(hists, format) \ + perf_hpp_list__for_each_sort_list((hists)->hpp_list, fmt) + extern struct perf_hpp_fmt perf_hpp__format[]; enum { -- cgit v0.10.2 From c6f5f6b662719ded53700deefec7dbc4227c9778 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 3 Feb 2016 23:11:20 +0900 Subject: perf report: Update documentation of --sort option The description of the memory sort key (used by --mem-mode) was misplaced. Move it under the --sort option so that it can be referenced properly. Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1454508683-5735-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 8a301f6..1cb8fac 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -117,6 +117,22 @@ OPTIONS And default sort keys are changed to comm, dso_from, symbol_from, dso_to and symbol_to, see '--branch-stack'. + If the --mem-mode option is used, the following sort keys are also available + (incompatible with --branch-stack): + symbol_daddr, dso_daddr, locked, tlb, mem, snoop, dcacheline. + + - symbol_daddr: name of data symbol being executed on at the time of sample + - dso_daddr: name of library or module containing the data being executed + on at the time of the sample + - locked: whether the bus was locked at the time of the sample + - tlb: type of tlb access for the data at the time of the sample + - mem: type of memory access for the data at the time of the sample + - snoop: type of snoop (if any) for the data at the time of the sample + - dcacheline: the cacheline the data address is on at the time of the sample + + And the default sort keys are changed to local_weight, mem, sym, dso, + symbol_daddr, dso_daddr, snoop, tlb, locked, see '--mem-mode'. + If the data file has tracepoint event(s), following (dynamic) sort keys are also available: trace, trace_fields, [.][/raw] @@ -151,22 +167,6 @@ OPTIONS By default, every sort keys not specified in -F will be appended automatically. - If --mem-mode option is used, following sort keys are also available - (incompatible with --branch-stack): - symbol_daddr, dso_daddr, locked, tlb, mem, snoop, dcacheline. - - - symbol_daddr: name of data symbol being executed on at the time of sample - - dso_daddr: name of library or module containing the data being executed - on at the time of sample - - locked: whether the bus was locked at the time of sample - - tlb: type of tlb access for the data at the time of sample - - mem: type of memory access for the data at the time of sample - - snoop: type of snoop (if any) for the data at the time of sample - - dcacheline: the cacheline the data address is on at the time of sample - - And default sort keys are changed to local_weight, mem, sym, dso, - symbol_daddr, dso_daddr, snoop, tlb, locked, see '--mem-mode'. - -p:: --parent=:: A regex filter to identify parent. The parent is a caller of this -- cgit v0.10.2 From 1ba2fc6de4ac1a87e3ece65651795760ea5cf658 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 3 Feb 2016 23:11:21 +0900 Subject: perf report: Update documention of --percent-limit option The --percent-limit option was changed to be applied to callchains as well as to hist entries recently, but it missed to update the doc. 
Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1454508683-5735-2-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 1cb8fac..89cab84 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -351,7 +351,10 @@ OPTIONS --percent-limit:: Do not show entries which have an overhead under that percent. - (Default: 0). + (Default: 0). Note that this option also sets the percent limit (threshold) + of callchains. However the default value of callchain threshold is + different than the default value of hist entries. Please see the + --call-graph option for details. --percentage:: Determine how to display the overhead percentage of filtered entries. -- cgit v0.10.2 From b62e8dfcda8cb133c062c0e1207afea2476eb7fd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 3 Feb 2016 23:11:23 +0900 Subject: perf hists browser: Add 'L' hotkey to change percent limit Add 'L' key action to change the percent limit applied to both of hist entries and callchains. Suggested-by: Arnaldo Carvalho de Melo Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1454508683-5735-4-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 3a1e096..a5a5390 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -2029,6 +2029,42 @@ static void hist_browser__update_nr_entries(struct hist_browser *hb) hb->nr_non_filtered_entries = nr_entries; } +static void hist_browser__update_percent_limit(struct hist_browser *hb, + double percent) +{ + struct hist_entry *he; + struct rb_node *nd = rb_first(&hb->hists->entries); + u64 total = hists__total_period(hb->hists); + u64 min_callchain_hits = total * (percent / 100); + + hb->min_pcnt = callchain_param.min_percent = percent; + + if (!symbol_conf.use_callchain) + return; + + while ((nd = hists__filter_entries(nd, hb->min_pcnt)) != NULL) { + he = rb_entry(nd, struct hist_entry, rb_node); + + if (callchain_param.mode == CHAIN_GRAPH_REL) { + total = he->stat.period; + + if (symbol_conf.cumulate_callchain) + total = he->stat_acc->period; + + min_callchain_hits = total * (percent / 100); + } + + callchain_param.sort(&he->sorted_chain, he->callchain, + min_callchain_hits, &callchain_param); + + /* force to re-evaluate folding state of callchains */ + he->init_have_children = false; + hist_entry__set_folding(he, false); + + nd = rb_next(nd); + } +} + static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, const char *helpline, bool left_exits, @@ -2064,6 +2100,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, "E Expand all callchains\n" \ "F Toggle percentage of filtered entries\n" \ "H Display column headers\n" \ + "L Change percent limit\n" \ "m Display context menu\n" \ "S Zoom into current Processor Socket\n" \ @@ -2219,6 +2256,24 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, top->zero = !top->zero; } continue; + case 'L': + if (ui_browser__input_window("Percent Limit", + "Please enter the value you want to hide entries under that percent.", + buf, "ENTER: OK, 
ESC: Cancel", + delay_secs * 2) == K_ENTER) { + char *end; + double new_percent = strtod(buf, &end); + + if (new_percent < 0 || new_percent > 100) { + ui_browser__warning(&browser->b, delay_secs * 2, + "Invalid percent: %.2f", new_percent); + continue; + } + + hist_browser__update_percent_limit(browser, new_percent); + hist_browser__reset(browser); + } + continue; case K_F1: case 'h': case '?': -- cgit v0.10.2 From 5978531b296ab7e61abef43f2a1a2d9b92246de1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 3 Feb 2016 17:16:32 -0300 Subject: perf build tests: Elide "-f Makefile" from make invokation Since this is the name that 'make' will look for if no explicit -f file is passed. This in turn makes the output of 'build-test' more compact: Before: $ perf stat make -C tools/perf build-test cd . && make FEATURE_DUMP_COPY=/home/acme/git/linux/tools/perf/BUILD_TEST_FEATURE_DUMP feature-dump make_no_libaudit_O: cd . && make -f Makefile O=/tmp/tmp.tHIa0Kkk2Y DESTDIR=/tmp/tmp.foK7rckkVi NO_LIBAUDIT=1 FEATURES_DUMP=/home/acme/git/linux/tools/perf/BUILD_TEST_FEATURE_DUMP After: $ perf stat make -C tools/perf build-test make_no_libaudit_O: cd . && make O=/tmp/tmp.tHIa0Kkk2Y DESTDIR=/tmp/tmp.foK7rckkVi NO_LIBAUDIT=1 FEATURES_DUMP=/home/acme/git/linux/tools/perf/BUILD_TEST_FEATURE_DUMP Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-m440lb8dkfsywsyah0htif6t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/tests/make b/tools/perf/tests/make index cc72b67..0b70cf1 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -111,6 +111,9 @@ run := make_pure # disable features detection ifeq ($(MK),Makefile) run += make_clean_all +MAKE_F := $(MAKE) +else +MAKE_F := $(MAKE) -f $(MK) endif run += make_python_perf_so run += make_debug @@ -270,12 +273,12 @@ endif MAKEFLAGS := --no-print-directory -clean := @(cd $(PERF); make -s -f $(MK) $(O_OPT) clean >/dev/null) +clean := @(cd $(PERF); $(MAKE_F) -s $(O_OPT) clean >/dev/null) $(run): $(call clean) @TMP_DEST=$$(mktemp -d); \ - cmd="cd $(PERF) && make -f $(MK) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST $($@)"; \ + cmd="cd $(PERF) && $(MAKE_F) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST $($@)"; \ printf "%*.*s: %s\n" $(max_width) $(max_width) "$@" "$$cmd" && echo $$cmd > $@ && \ ( eval $$cmd ) >> $@ 2>&1; \ echo " test: $(call test,$@)" >> $@ 2>&1; \ @@ -286,7 +289,7 @@ $(run_O): $(call clean) @TMP_O=$$(mktemp -d); \ TMP_DEST=$$(mktemp -d); \ - cmd="cd $(PERF) && make -f $(MK) $(PARALLEL_OPT) O=$$TMP_O DESTDIR=$$TMP_DEST $($(patsubst %_O,%,$@))"; \ + cmd="cd $(PERF) && $(MAKE_F) $(PARALLEL_OPT) O=$$TMP_O DESTDIR=$$TMP_DEST $($(patsubst %_O,%,$@))"; \ printf "%*.*s: %s\n" $(max_width) $(max_width) "$@" "$$cmd" && echo $$cmd > $@ && \ ( eval $$cmd ) >> $@ 2>&1 && \ echo " test: $(call test_O,$@)" >> $@ 2>&1; \ -- cgit v0.10.2 From 3c7a152b0d1c81b9bac5ab922dc57168046668bf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 3 Feb 2016 17:24:11 -0300 Subject: perf build tests: Move the feature related vars to the front of the make cmdline So that we do less visual searching on the 'make build-test' output to see the feature related variables: After: $ make -C tools/perf build-test make_no_newt_O: cd . && make NO_NEWT=1 FEATURES_DUMP=/home/acme/git/linux/tools/perf/BUILD_TEST_FEATURE_DUMP O=/tmp/tmp.dz55IX DESTDIR=/tmp/tmp.X29xxo make_tags_O: cd . 
&& make tags FEATURES_DUMP=/home/acme/git/linux/tools/perf/BUILD_TEST_FEATURE_DUMP O=/tmp/tmp.6ecLh8 DESTDIR=/tmp/tmp.6vIla578Ho make_util_pmu_bison_o_O: cd . && make util/pmu-bison.o FEATURES_DUMP=/home/acme/git/linux/tools/perf/BUILD_TEST_FEATURE_DUMP O=/tmp/tmp.SVPM2G DESTDIR=/tmp/tmp.C0oAam Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-dx4krgzqa566v1pedrbrcchi@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/tests/make b/tools/perf/tests/make index 0b70cf1..12dcae7 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -278,7 +278,7 @@ clean := @(cd $(PERF); $(MAKE_F) -s $(O_OPT) clean >/dev/null) $(run): $(call clean) @TMP_DEST=$$(mktemp -d); \ - cmd="cd $(PERF) && $(MAKE_F) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST $($@)"; \ + cmd="cd $(PERF) && $(MAKE_F) $($@) $(PARALLEL_OPT) $(O_OPT) DESTDIR=$$TMP_DEST"; \ printf "%*.*s: %s\n" $(max_width) $(max_width) "$@" "$$cmd" && echo $$cmd > $@ && \ ( eval $$cmd ) >> $@ 2>&1; \ echo " test: $(call test,$@)" >> $@ 2>&1; \ @@ -289,7 +289,7 @@ $(run_O): $(call clean) @TMP_O=$$(mktemp -d); \ TMP_DEST=$$(mktemp -d); \ - cmd="cd $(PERF) && $(MAKE_F) $(PARALLEL_OPT) O=$$TMP_O DESTDIR=$$TMP_DEST $($(patsubst %_O,%,$@))"; \ + cmd="cd $(PERF) && $(MAKE_F) $($(patsubst %_O,%,$@)) $(PARALLEL_OPT) O=$$TMP_O DESTDIR=$$TMP_DEST"; \ printf "%*.*s: %s\n" $(max_width) $(max_width) "$@" "$$cmd" && echo $$cmd > $@ && \ ( eval $$cmd ) >> $@ 2>&1 && \ echo " test: $(call test_O,$@)" >> $@ 2>&1; \ -- cgit v0.10.2 From 67f43c009778ddaae812aae29731bb04c256165e Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Thu, 4 Feb 2016 18:25:06 +0900 Subject: perf config: Document 'ui.show-headers' variable in man page This option controls display of column headers (like 'Overhead' and 'Symbol') in 'report' and 'top'. If this option is false, they are hidden. This option is only applied to TUI. Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1454577913-16401-2-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index 74589c6..4278722 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -296,6 +296,12 @@ hist.*:: and 'baz' to 50.00% for each, while 'absolute' would show their current overhead (33.33%). +ui.*:: + ui.show-headers:: + This option controls display of column headers (like 'Overhead' and 'Symbol') + in 'report' and 'top'. If this option is false, they are hidden. + This option is only applied to TUI. + SEE ALSO -------- linkperf:perf[1] -- cgit v0.10.2 From 56c94dc56f9e4c1c09fbe26ad65715caa2259438 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Thu, 4 Feb 2016 18:25:07 +0900 Subject: perf config: Document variables for 'call-graph' section in man page Explain 'call-graph' section and its variables: 'record-mode', 'dump-size', 'print-type', 'order', 'sort-key', 'threshold' and 'print-limit'. Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1454577913-16401-3-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index 4278722..42310ae 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -302,6 +302,73 @@ ui.*:: in 'report' and 'top'. 
If this option is false, they are hidden. This option is only applied to TUI. +call-graph.*:: + When sub-commands 'top' and 'report' work with -g/—-children + there're options in control of call-graph. + + call-graph.record-mode:: + The record-mode can be 'fp' (frame pointer), 'dwarf' and 'lbr'. + The value of 'dwarf' is effective only if perf detect needed library + (libunwind or a recent version of libdw). + 'lbr' only work for cpus that support it. + + call-graph.dump-size:: + The size of stack to dump in order to do post-unwinding. Default is 8192 (byte). + When using dwarf into record-mode, the default size will be used if omitted. + + call-graph.print-type:: + The print-types can be graph (graph absolute), fractal (graph relative), + flat and folded. This option controls a way to show overhead for each callchain + entry. Suppose a following example. + + Overhead Symbols + ........ ....... + 40.00% foo + | + ---foo + | + |--50.00%--bar + | main + | + --50.00%--baz + main + + This output is a 'fractal' format. The 'foo' came from 'bar' and 'baz' exactly + half and half so 'fractal' shows 50.00% for each + (meaning that it assumes 100% total overhead of 'foo'). + + The 'graph' uses absolute overhead value of 'foo' as total so each of + 'bar' and 'baz' callchain will have 20.00% of overhead. + If 'flat' is used, single column and linear exposure of call chains. + 'folded' mean call chains are displayed in a line, separated by semicolons. + + call-graph.order:: + This option controls print order of callchains. The default is + 'callee' which means callee is printed at top and then followed by its + caller and so on. The 'caller' prints it in reverse order. + + If this option is not set and report.children or top.children is + set to true (or the equivalent command line option is given), + the default value of this option is changed to 'caller' for the + execution of 'perf report' or 'perf top'. Other commands will + still default to 'callee'. + + call-graph.sort-key:: + The callchains are merged if they contain same information. + The sort-key option determines a way to compare the callchains. + A value of 'sort-key' can be 'function' or 'address'. + The default is 'function'. + + call-graph.threshold:: + When there're many callchains it'd print tons of lines. So perf omits + small callchains under a certain overhead (threshold) and this option + control the threshold. Default is 0.5 (%). The overhead is calculated + by value depends on call-graph.print-type. + + call-graph.print-limit:: + This is a maximum number of lines of callchain printed for a single + histogram entry. Default is 0 which means no limitation. + SEE ALSO -------- linkperf:perf[1] -- cgit v0.10.2 From 806cb95bb6cb25105b37d971d9916105898cb6fe Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Thu, 4 Feb 2016 18:25:08 +0900 Subject: perf config: Document variables for 'report' section in man page Explain 'report' section's variables: 'percent-limit', 'queue-size' and 'children'. 
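For reference, the ui, call-graph and report variables documented above are ordinary keys in perf's git-config style file (per-user $HOME/.perfconfig). An illustrative fragment, with example values rather than the defaults described in the patches, might look like:

    [ui]
    	show-headers = true

    [call-graph]
    	record-mode = dwarf
    	print-type = fractal
    	order = caller
    	threshold = 1.0

    [report]
    	percent-limit = 5
    	queue-size = 0
    	children = false
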
Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1454577913-16401-4-git-send-email-treeze.taeung@gmail.com [ Fix some grammar issues, add some more info ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index 42310ae..f38f46f 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -369,6 +369,42 @@ call-graph.*:: This is a maximum number of lines of callchain printed for a single histogram entry. Default is 0 which means no limitation. +report.*:: + report.percent-limit:: + This one is mostly the same as call-graph.threshold but works for + histogram entries. Entries having an overhead lower than this + percentage will not be printed. Default is '0'. If percent-limit + is '10', only entries which have more than 10% of overhead will be + printed. + + report.queue-size:: + This option sets up the maximum allocation size of the internal + event queue for ordering events. Default is 0, meaning no limit. + + report.children:: + 'Children' means functions called from another function. + If this option is true, 'perf report' cumulates callchains of children + and show (accumulated) total overhead as well as 'Self' overhead. + Please refer to the 'perf report' manual. The default is 'true'. + + report.group:: + This option is to show event group information together. + Example output with this turned on, notice that there is one column + per event in the group, ref-cycles and cycles: + + # group: {ref-cycles,cycles} + # ======== + # + # Samples: 7K of event 'anon group { ref-cycles, cycles }' + # Event count (approx.): 6876107743 + # + # Overhead Command Shared Object Symbol + # ................ ....... ................. ................... + # + 99.84% 99.76% noploop noploop [.] main + 0.07% 0.00% noploop ld-2.15.so [.] strcmp + 0.03% 0.00% noploop [kernel.kallsyms] [k] timerqueue_del + SEE ALSO -------- linkperf:perf[1] -- cgit v0.10.2 From 0b04c84087d3188c648628a6c73738314724c921 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Thu, 4 Feb 2016 18:25:09 +0900 Subject: perf config: Document 'top.children' variable in man page Explain 'top.children' variable. Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1454577913-16401-5-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index f38f46f..5e1db5ae 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -405,6 +405,13 @@ report.*:: 0.07% 0.00% noploop ld-2.15.so [.] strcmp 0.03% 0.00% noploop [kernel.kallsyms] [k] timerqueue_del +top.*:: + top.children:: + Same as 'report.children'. So if it is enabled, the output of 'top' + command will have 'Children' overhead column as well as 'Self' overhead + column by default. + The default is 'true'. + SEE ALSO -------- linkperf:perf[1] -- cgit v0.10.2 From 08b75b409e3799553a3536e628f1dba4c87d7c14 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Thu, 4 Feb 2016 18:25:10 +0900 Subject: perf config: Document 'man.viewer' variable in man page Explain 'man.viewer' variable and how to add new man viewer tools. 
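The 'top.children' and 'man.viewer' settings described above are set the same way. A sketch of a $HOME/.perfconfig fragment follows; the values are illustrative only, and the custom-viewer entry assumes perf keeps the git-style 'man.<viewer>.cmd' subsection syntax, which is what the stripped 'man..cmd' text above presumably refers to:

    [top]
    	children = false

    [man]
    	viewer = woman

    # hypothetical custom viewer registration
    [man "mybrowser"]
    	cmd = /usr/local/bin/mybrowser
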
Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1454577913-16401-6-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index 5e1db5ae..fd3f048 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -412,6 +412,15 @@ top.*:: column by default. The default is 'true'. +man.*:: + man.viewer:: + This option can assign a tool to view manual pages when 'help' + subcommand was invoked. Supported tools are 'man', 'woman' + (with emacs client) and 'konqueror'. Default is 'man'. + + New man viewer tool can be also added using 'man..cmd' + or use different path using 'man..path' config option. + SEE ALSO -------- linkperf:perf[1] -- cgit v0.10.2 From ab2e08e8ba683f3e923a56b1e81b5c5e115bad0b Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Thu, 4 Feb 2016 18:25:11 +0900 Subject: perf config: Document 'pager.' variables in man page Explain 'pager.' variables. Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1454577913-16401-7-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index fd3f048..99aa72e 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -421,6 +421,11 @@ man.*:: New man viewer tool can be also added using 'man..cmd' or use different path using 'man..path' config option. +pager.*:: + pager.:: + When the subcommand is run on stdio, determine whether it uses + pager or not based on this value. Default is 'unspecified'. + SEE ALSO -------- linkperf:perf[1] -- cgit v0.10.2 From 57f0dafe6a41de6c9d81bc6c403349a261e10fc4 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Thu, 4 Feb 2016 18:25:12 +0900 Subject: perf config: Document 'kmem.default' variable in man page Explain 'kmem.default' variable. Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1454577913-16401-8-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index 99aa72e..fb1f4a9 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -426,6 +426,11 @@ pager.*:: When the subcommand is run on stdio, determine whether it uses pager or not based on this value. Default is 'unspecified'. +kmem.*:: + kmem.default:: + This option decides which allocator is to be analyzed if neither + '--slab' nor '--page' option is used. Default is 'slab'. + SEE ALSO -------- linkperf:perf[1] -- cgit v0.10.2 From a9edec3ce211d776736b35b14b9bd2c0b5ed860b Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Thu, 4 Feb 2016 18:25:13 +0900 Subject: perf config: Document 'record.build-id' variable in man page Explain 'record.build-id' variable. 
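Likewise for the pager, kmem and record variables; an illustrative $HOME/.perfconfig fragment (the pager key assumes the placeholder stripped from the text above is a subcommand name, e.g. 'pager.report'):

    [pager]
    	report = false

    [kmem]
    	default = page

    [record]
    	build-id = no-cache
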
Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1454577913-16401-9-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index fb1f4a9..c7158bf 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -431,6 +431,14 @@ kmem.*:: This option decides which allocator is to be analyzed if neither '--slab' nor '--page' option is used. Default is 'slab'. +record.*:: + record.build-id:: + This option can be 'cache', 'no-cache' or 'skip'. + 'cache' is to post-process data and save/update the binaries into + the build-id cache (in ~/.debug). This is the default. + But if this option is 'no-cache', it will not update the build-id cache. + 'skip' skips post-processing and does not update the cache. + SEE ALSO -------- linkperf:perf[1] -- cgit v0.10.2 From 3e2751d9169563486c2bfe7382726f1315cb156b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 4 Feb 2016 12:30:36 +0100 Subject: perf tools: Fix parallel build including 'clean' target Do not parallelize 'clean' with other targets, figure out if it is present and do it first, then the other targets. Noticed with: tools/perf> make -j24 clean all LD arch/libperf-in.o LD plugin_xen-in.o arch//libperf-in.o: file not recognized: File truncated make[3]: *** [arch/libperf-in.o] Error 1 make[2]: *** [arch] Error 2 make[2]: *** Waiting for unfinished jobs.... AR libapi.a Reported-and-Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Jiri Olsa Acked-by: Wang Nan Link: http://lkml.kernel.org/n/tip-kb0qs29zbz7hxn32mc5zbsoz@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 4b68f46..67837c6 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -68,6 +68,20 @@ all tags TAGS: $(print_msg) $(make) +ifdef MAKECMDGOALS +has_clean := 0 +ifneq ($(filter clean,$(MAKECMDGOALS)),) + has_clean := 1 +endif # clean + +ifeq ($(has_clean),1) + rest := $(filter-out clean,$(MAKECMDGOALS)) + ifneq ($(rest),) +$(rest): clean + endif # rest +endif # has_clean +endif # MAKECMDGOALS + # # The clean target is not really parallel, don't print the jobs info: # -- cgit v0.10.2 From be9e49911123516ef883836906269832aec37e01 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 3 Feb 2016 17:28:45 -0300 Subject: perf build tests: Do parallell builds with 'build-test' Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-jhmnf9g7y9ryqcjql00unk5y@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 67837c6..32a64e6 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -99,7 +99,7 @@ clean: # make -C tools/perf -f tests/make # build-test: - @$(MAKE) SHUF=1 -f tests/make REUSE_FEATURES_DUMP=1 MK=Makefile --no-print-directory tarpkg out + @$(MAKE) SHUF=1 -f tests/make REUSE_FEATURES_DUMP=1 MK=Makefile SET_PARALLEL=1 --no-print-directory tarpkg out # # All other targets get passed through: -- cgit v0.10.2 From a6e4491c682a7b28574a62e6f311a0acec50b318 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Thu, 4 Feb 2016 09:38:00 -0500 Subject: sched/isolcpus: Output warning when the 'isolcpus=' kernel parameter is invalid The isolcpus= kernel boot parameter restricts userspace from scheduling on the specified CPUs. 
If a CPU is specified that is outside the range of 0 to nr_cpu_ids, cpulist_parse() will return -ERANGE, return an empty cpulist, and fail silently. This patch adds an error message to isolated_cpu_setup() to indicate to the user that something has gone awry, and returns 0 on error. Signed-off-by: Prarit Bhargava Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454596680-10367-1-git-send-email-prarit@redhat.com [ Twiddled some details. ] Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9503d59..24fcdbf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6173,11 +6173,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { + int ret; + alloc_bootmem_cpumask_var(&cpu_isolated_map); - cpulist_parse(str, cpu_isolated_map); + ret = cpulist_parse(str, cpu_isolated_map); + if (ret) { + pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); + return 0; + } return 1; } - __setup("isolcpus=", isolated_cpu_setup); struct s_data { -- cgit v0.10.2 From 89fee59b504f86925894fcc9ba79d5c933842f93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcin=20=C5=9Alusarz?= Date: Tue, 19 Jan 2016 20:03:03 +0100 Subject: perf tools: handle spaces in file names obtained from /proc/pid/maps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Steam frequently puts game binaries in folders with spaces. Note: "(deleted)" markers are now treated as part of the file name. Signed-off-by: Marcin Ślusarz Acked-by: Namhyung Kim Fixes: 6064803313ba ("perf tools: Use sscanf for parsing /proc/pid/maps") Link: http://lkml.kernel.org/r/20160119190303.GA17579@marcin-Inspiron-7720 Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 85155e9..7bad5c3 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -282,7 +282,7 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, strcpy(execname, ""); /* 00400000-0040c000 r-xp 00000000 fd:01 41038 /bin/cat */ - n = sscanf(bf, "%"PRIx64"-%"PRIx64" %s %"PRIx64" %x:%x %u %s\n", + n = sscanf(bf, "%"PRIx64"-%"PRIx64" %s %"PRIx64" %x:%x %u %[^\n]\n", &event->mmap2.start, &event->mmap2.len, prot, &event->mmap2.pgoff, &event->mmap2.maj, &event->mmap2.min, -- cgit v0.10.2 From e9c4bcdd349eb00f6c704450a063b3dcbea25864 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 30 Nov 2015 10:02:20 +0100 Subject: perf symbols: add Java demangling support Add Java function descriptor demangling support. Something bfd cannot do. Use the JAVA_DEMANGLE_NORET flag to avoid decoding the return type of functions. 
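To see what the demangler added below (util/demangle-java.c) produces, a minimal caller might look like the sketch that follows. The mangled name is a made-up JVMTI-style symbol (class signature, method name, method descriptor) and the expected output is inferred from the parser, so treat both as illustrative:

    #include <stdio.h>
    #include <stdlib.h>
    #include "demangle-java.h"

    int main(void)
    {
    	/* JVMTI-style name: class signature + method name + descriptor */
    	const char *mangled = "Ljava/lang/Math;max(II)I";
    	/* JAVA_DEMANGLE_NORET drops the trailing return type ("I" here) */
    	char *out = java_demangle_sym(mangled, JAVA_DEMANGLE_NORET);

    	if (out) {
    		/* expected: class java.lang.Math.max(int, int) */
    		printf("%s\n", out);
    		free(out);	/* the caller owns the returned string */
    	}
    	return 0;
    }
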
Signed-off-by: Stephane Eranian Cc: Adrian Hunter Cc: Andi Kleen Cc: Carl Love Cc: David Ahern Cc: Jiri Olsa Cc: John McCutchan Cc: Namhyung Kim Cc: Pawel Moll Cc: Peter Zijlstra Cc: Sonny Rao Cc: Sukadev Bhattiprolu Link: http://lkml.kernel.org/r/1448874143-7269-2-git-send-email-eranian@google.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 5eec53a..edae107 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -105,6 +105,7 @@ libperf-y += scripting-engines/ libperf-$(CONFIG_ZLIB) += zlib.o libperf-$(CONFIG_LZMA) += lzma.o +libperf-y += demangle-java.o CFLAGS_config.o += -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" diff --git a/tools/perf/util/demangle-java.c b/tools/perf/util/demangle-java.c new file mode 100644 index 0000000..3e6062a --- /dev/null +++ b/tools/perf/util/demangle-java.c @@ -0,0 +1,199 @@ +#include +#include +#include +#include "util.h" +#include "debug.h" +#include "symbol.h" + +#include "demangle-java.h" + +enum { + MODE_PREFIX = 0, + MODE_CLASS = 1, + MODE_FUNC = 2, + MODE_TYPE = 3, + MODE_CTYPE = 3, /* class arg */ +}; + +#define BASE_ENT(c, n) [c - 'A']=n +static const char *base_types['Z' - 'A' + 1] = { + BASE_ENT('B', "byte" ), + BASE_ENT('C', "char" ), + BASE_ENT('D', "double" ), + BASE_ENT('F', "float" ), + BASE_ENT('I', "int" ), + BASE_ENT('J', "long" ), + BASE_ENT('S', "short" ), + BASE_ENT('Z', "bool" ), +}; + +/* + * demangle Java symbol between str and end positions and stores + * up to maxlen characters into buf. The parser starts in mode. + * + * Use MODE_PREFIX to process entire prototype till end position + * Use MODE_TYPE to process return type if str starts on return type char + * + * Return: + * success: buf + * error : NULL + */ +static char * +__demangle_java_sym(const char *str, const char *end, char *buf, int maxlen, int mode) +{ + int rlen = 0; + int array = 0; + int narg = 0; + const char *q; + + if (!end) + end = str + strlen(str); + + for (q = str; q != end; q++) { + + if (rlen == (maxlen - 1)) + break; + + switch (*q) { + case 'L': + if (mode == MODE_PREFIX || mode == MODE_CTYPE) { + if (mode == MODE_CTYPE) { + if (narg) + rlen += scnprintf(buf + rlen, maxlen - rlen, ", "); + narg++; + } + rlen += scnprintf(buf + rlen, maxlen - rlen, "class "); + if (mode == MODE_PREFIX) + mode = MODE_CLASS; + } else + buf[rlen++] = *q; + break; + case 'B': + case 'C': + case 'D': + case 'F': + case 'I': + case 'J': + case 'S': + case 'Z': + if (mode == MODE_TYPE) { + if (narg) + rlen += scnprintf(buf + rlen, maxlen - rlen, ", "); + rlen += scnprintf(buf + rlen, maxlen - rlen, "%s", base_types[*q - 'A']); + while (array--) + rlen += scnprintf(buf + rlen, maxlen - rlen, "[]"); + array = 0; + narg++; + } else + buf[rlen++] = *q; + break; + case 'V': + if (mode == MODE_TYPE) { + rlen += scnprintf(buf + rlen, maxlen - rlen, "void"); + while (array--) + rlen += scnprintf(buf + rlen, maxlen - rlen, "[]"); + array = 0; + } else + buf[rlen++] = *q; + break; + case '[': + if (mode != MODE_TYPE) + goto error; + array++; + break; + case '(': + if (mode != MODE_FUNC) + goto error; + buf[rlen++] = *q; + mode = MODE_TYPE; + break; + case ')': + if (mode != MODE_TYPE) + goto error; + buf[rlen++] = *q; + narg = 0; + break; + case ';': + if (mode != MODE_CLASS && mode != MODE_CTYPE) + goto error; + /* safe because at least one other char to process */ + if (isalpha(*(q + 1))) + rlen += scnprintf(buf + rlen, maxlen - rlen, "."); + if (mode == MODE_CLASS) + mode = MODE_FUNC; + else if (mode == 
MODE_CTYPE) + mode = MODE_TYPE; + break; + case '/': + if (mode != MODE_CLASS && mode != MODE_CTYPE) + goto error; + rlen += scnprintf(buf + rlen, maxlen - rlen, "."); + break; + default : + buf[rlen++] = *q; + } + } + buf[rlen] = '\0'; + return buf; +error: + return NULL; +} + +/* + * Demangle Java function signature (openJDK, not GCJ) + * input: + * str: string to parse. String is not modified + * flags: comobination of JAVA_DEMANGLE_* flags to modify demangling + * return: + * if input can be demangled, then a newly allocated string is returned. + * if input cannot be demangled, then NULL is returned + * + * Note: caller is responsible for freeing demangled string + */ +char * +java_demangle_sym(const char *str, int flags) +{ + char *buf, *ptr; + char *p; + size_t len, l1 = 0; + + if (!str) + return NULL; + + /* find start of retunr type */ + p = strrchr(str, ')'); + if (!p) + return NULL; + + /* + * expansion factor estimated to 3x + */ + len = strlen(str) * 3 + 1; + buf = malloc(len); + if (!buf) + return NULL; + + buf[0] = '\0'; + if (!(flags & JAVA_DEMANGLE_NORET)) { + /* + * get return type first + */ + ptr = __demangle_java_sym(p + 1, NULL, buf, len, MODE_TYPE); + if (!ptr) + goto error; + + /* add space between return type and function prototype */ + l1 = strlen(buf); + buf[l1++] = ' '; + } + + /* process function up to return type */ + ptr = __demangle_java_sym(str, p + 1, buf + l1, len - l1, MODE_PREFIX); + if (!ptr) + goto error; + + return buf; +error: + free(buf); + return NULL; +} diff --git a/tools/perf/util/demangle-java.h b/tools/perf/util/demangle-java.h new file mode 100644 index 0000000..a981c1f --- /dev/null +++ b/tools/perf/util/demangle-java.h @@ -0,0 +1,10 @@ +#ifndef __PERF_DEMANGLE_JAVA +#define __PERF_DEMANGLE_JAVA 1 +/* + * demangle function flags + */ +#define JAVA_DEMANGLE_NORET 0x1 /* do not process return type */ + +char * java_demangle_sym(const char *str, int flags); + +#endif /* __PERF_DEMANGLE_JAVA */ diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 562b8eb..b1dd68f 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -6,6 +6,7 @@ #include #include "symbol.h" +#include "demangle-java.h" #include "machine.h" #include "vdso.h" #include @@ -1077,6 +1078,8 @@ new_symbol: demangle_flags = DMGL_PARAMS | DMGL_ANSI; demangled = bfd_demangle(NULL, elf_name, demangle_flags); + if (demangled == NULL) + demangled = java_demangle_sym(elf_name, JAVA_DEMANGLE_NORET); if (demangled != NULL) elf_name = demangled; } -- cgit v0.10.2 From 8ee4646038e47d065d35703e3e343136c4cd42aa Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 30 Nov 2015 10:02:21 +0100 Subject: perf build: Add libcrypto feature detection Will be used to generate build-ids in the jitdump code. 
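The way libcrypto ends up being used (see gen_build_id() in the genelf.c added later in this series) is to hash the code load address plus the jitted code bytes into the 20-byte .note.gnu.build-id payload, with MD5 filling 16 of those bytes. A standalone sketch of that idea, using only the stock OpenSSL MD5 API and a hypothetical helper name (build with -lcrypto, which is exactly what the new feature test probes for):

    #include <stdio.h>
    #include <string.h>
    #include <openssl/md5.h>

    /* Sketch in the spirit of gen_build_id(): hash load address + code. */
    static void example_build_id(unsigned char id[20], unsigned long load_addr,
    			     const void *code, size_t csize)
    {
    	MD5_CTX ctx;

    	memset(id, 0, 20);		/* bytes beyond the digest stay zero */
    	MD5_Init(&ctx);
    	MD5_Update(&ctx, &load_addr, sizeof(load_addr));
    	MD5_Update(&ctx, code, csize);
    	MD5_Final(id, &ctx);		/* writes MD5_DIGEST_LENGTH (16) bytes */
    }

    int main(void)
    {
    	static const unsigned char code[] = { 0xc3 };	/* x86 "ret" */
    	unsigned char id[20];
    	int i;

    	example_build_id(id, 0x1000UL, code, sizeof(code));
    	for (i = 0; i < 20; i++)
    		printf("%02x", id[i]);
    	printf("\n");
    	return 0;
    }
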
Signed-off-by: Stephane Eranian Cc: Adrian Hunter Cc: Andi Kleen Cc: Carl Love Cc: David Ahern Cc: Jiri Olsa Cc: John McCutchan Cc: Namhyung Kim Cc: Pawel Moll Cc: Peter Zijlstra Cc: Sonny Rao Cc: Sukadev Bhattiprolu Link: http://lkml.kernel.org/r/1448874143-7269-3-git-send-email-eranian@google.com [ tools/perf/Makefile.perf comment about NO_LIBCRYPTO and added it to tests/make ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 7bff2ea..6b77072 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -46,6 +46,7 @@ FEATURE_TESTS_BASIC := \ libpython \ libpython-version \ libslang \ + libcrypto \ libunwind \ pthread-attr-setaffinity-np \ stackprotector-all \ @@ -87,6 +88,7 @@ FEATURE_DISPLAY ?= \ libperl \ libpython \ libslang \ + libcrypto \ libunwind \ libdw-dwarf-unwind \ zlib \ diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index bf8f035..c5f4c41 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -23,6 +23,7 @@ FILES= \ test-libpython.bin \ test-libpython-version.bin \ test-libslang.bin \ + test-libcrypto.bin \ test-libunwind.bin \ test-libunwind-debug-frame.bin \ test-pthread-attr-setaffinity-np.bin \ @@ -105,6 +106,9 @@ $(OUTPUT)test-libaudit.bin: $(OUTPUT)test-libslang.bin: $(BUILD) -I/usr/include/slang -lslang +$(OUTPUT)test-libcrypto.bin: + $(BUILD) -lcrypto + $(OUTPUT)test-gtk2.bin: $(BUILD) $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null) diff --git a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c index 81025ca..e499a36 100644 --- a/tools/build/feature/test-all.c +++ b/tools/build/feature/test-all.c @@ -129,6 +129,10 @@ # include "test-bpf.c" #undef main +#define main main_test_libcrypto +# include "test-libcrypto.c" +#undef main + int main(int argc, char *argv[]) { main_test_libpython(); @@ -158,6 +162,7 @@ int main(int argc, char *argv[]) main_test_lzma(); main_test_get_cpuid(); main_test_bpf(); + main_test_libcrypto(); return 0; } diff --git a/tools/build/feature/test-libcrypto.c b/tools/build/feature/test-libcrypto.c new file mode 100644 index 0000000..bd79dc7 --- /dev/null +++ b/tools/build/feature/test-libcrypto.c @@ -0,0 +1,17 @@ +#include +#include + +int main(void) +{ + MD5_CTX context; + unsigned char md[MD5_DIGEST_LENGTH + SHA_DIGEST_LENGTH]; + unsigned char dat[] = "12345"; + + MD5_Init(&context); + MD5_Update(&context, &dat[0], sizeof(dat)); + MD5_Final(&md[0], &context); + + SHA1(&dat[0], sizeof(dat), &md[0]); + + return 0; +} diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 0ef3d97..d404117 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -58,6 +58,9 @@ include config/utilities.mak # # Define NO_LIBBIONIC if you do not want bionic support # +# Define NO_LIBCRYPTO if you do not want libcrypto (openssl) support +# used for generating build-ids for ELFs generated by jitdump. +# # Define NO_LIBDW_DWARF_UNWIND if you do not want libdw support # for dwarf backtrace post unwind. 
# diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index 0045a5d..f7aeaf3 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -404,6 +404,17 @@ ifndef NO_LIBAUDIT endif endif +ifndef NO_LIBCRYPTO + ifneq ($(feature-libcrypto), 1) + msg := $(warning No libcrypto.h found, disables jitted code injection, please install libssl-devel or libssl-dev); + NO_LIBCRYPTO := 1 + else + CFLAGS += -DHAVE_LIBCRYPTO_SUPPORT + EXTLIBS += -lcrypto + $(call detected,CONFIG_CRYPTO) + endif +endif + ifdef NO_NEWT NO_SLANG=1 endif diff --git a/tools/perf/tests/make b/tools/perf/tests/make index 12dcae7..cac15d9 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -80,6 +80,7 @@ make_no_libaudit := NO_LIBAUDIT=1 make_no_libbionic := NO_LIBBIONIC=1 make_no_auxtrace := NO_AUXTRACE=1 make_no_libbpf := NO_LIBBPF=1 +make_no_libcrypto := NO_LIBCRYPTO=1 make_tags := tags make_cscope := cscope make_help := help @@ -103,6 +104,7 @@ make_minimal := NO_LIBPERL=1 NO_LIBPYTHON=1 NO_NEWT=1 NO_GTK2=1 make_minimal += NO_DEMANGLE=1 NO_LIBELF=1 NO_LIBUNWIND=1 NO_BACKTRACE=1 make_minimal += NO_LIBNUMA=1 NO_LIBAUDIT=1 NO_LIBBIONIC=1 make_minimal += NO_LIBDW_DWARF_UNWIND=1 NO_AUXTRACE=1 NO_LIBBPF=1 +make_minimal += NO_LIBCRYPTO=1 # $(run) contains all available tests run := make_pure -- cgit v0.10.2 From 921f3fadbc48c7c3799b415b895297cd476cf7f1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 22 Jan 2016 18:41:00 -0300 Subject: perf inject: Make sure mmap records are ordered when injecting build_ids To make sure the mmap records are ordered correctly and so that the correct especially due to jitted code mmaps. We cannot generate the buildid hit list and inject the jit mmaps (will come right after this patch) in at the same time for now. Signed-off-by: Stephane Eranian Cc: Adrian Hunter Cc: Andi Kleen Cc: Carl Love Cc: David Ahern Cc: Jiri Olsa Cc: John McCutchan Cc: Namhyung Kim Cc: Pawel Moll Cc: Peter Zijlstra Cc: Sonny Rao Cc: Sukadev Bhattiprolu Link: http://lkml.kernel.org/r/1448874143-7269-3-git-send-email-eranian@google.com [ Carved out from a larger patch ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 0022e02..6567bae 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -755,6 +755,17 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused) if (inject.session == NULL) return -1; + if (inject.build_ids) { + /* + * to make sure the mmap records are ordered correctly + * and so that the correct especially due to jitted code + * mmaps. We cannot generate the buildid hit list and + * inject the jit mmaps at the same time for now. + */ + inject.tool.ordered_events = true; + inject.tool.ordering_requires_timestamps = true; + } + ret = symbol__init(&inject.session->header.env); if (ret < 0) goto out_delete; -- cgit v0.10.2 From 9b07e27f88b9cd785cdb23f9a2231c12521dda94 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 30 Nov 2015 10:02:21 +0100 Subject: perf inject: Add jitdump mmap injection support This patch adds a --jit/-j option to perf inject. This options injects MMAP records into the perf.data file to cover the jitted code mmaps. It also emits ELF images for each function in the jidump file. Those images are created where the jitdump file is. The MMAP records point to that location as well. 
Typical flow: $ perf record -k mono -- java -agentpath:libpjvmti.so java_class $ perf inject --jit -i perf.data -o perf.data.jitted $ perf report -i perf.data.jitted Note that jitdump.h support is not limited to Java, it works with any jitted environment modified to emit the jitdump file format, include those where code can be jitted multiple times and moved around. The jitdump.h format is adapted from the Oprofile project. The genelf.c (ELF binary generation) depends on MD5 hash encoding for the buildid. To enable this, libssl-dev must be installed. If not, then genelf.c defaults to using urandom to generate the buildid, which is not ideal. The Makefile auto-detects the presence on libssl-dev. This version mmaps the jitdump file to create a marker MMAP record in the perf.data file. The marker is used to detect jitdump and cause perf inject to inject the jitted mmaps and generate ELF images for jitted functions. In V8, the following fixes and changes were made among other things: - the jidump header format include a new flags field to be used to carry information about the configuration of the runtime agent. Contributed by: Adrian Hunter - Fix mmap pgoff: MMAP event pgoff must be the offset within the ELF file at which the code resides. Contributed by: Adrian Hunter - Fix ELF virtual addresses: perf tools expect the ELF virtual addresses of dynamic objects to match the file offset. Contributed by: Adrian Hunter - JIT MMAP injection does not obey finished_round semantics. JIT MMAP injection injects all MMAP events in one go, so it does not obey finished_round semantics, so drop the finished_round events from the output perf.data file. Contributed by: Adrian Hunter Signed-off-by: Stephane Eranian Cc: Adrian Hunter Cc: Andi Kleen Cc: Carl Love Cc: David Ahern Cc: Jiri Olsa Cc: John McCutchan Cc: Namhyung Kim Cc: Pawel Moll Cc: Peter Zijlstra Cc: Sonny Rao Cc: Sukadev Bhattiprolu Link: http://lkml.kernel.org/r/1448874143-7269-3-git-send-email-eranian@google.com [ Moved inject.build_ids ordering bits to a separate patch, fixed the NO_LIBELF=1 build ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt index 0b1cede..87b2588 100644 --- a/tools/perf/Documentation/perf-inject.txt +++ b/tools/perf/Documentation/perf-inject.txt @@ -53,6 +53,13 @@ include::itrace.txt[] --strip:: Use with --itrace to strip out non-synthesized events. +-j:: +--jit:: + Process jitdump files by injecting the mmap records corresponding to jitted + functions. This option also generates the ELF images for each jitted function + found in the jitdumps files captured in the input perf.data file. Use this option + if you are monitoring environment using JIT runtimes, such as Java, DART or V8. 
+ SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1] diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 6567bae..b38445f 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -17,6 +17,7 @@ #include "util/build-id.h" #include "util/data.h" #include "util/auxtrace.h" +#include "util/jit.h" #include @@ -29,6 +30,7 @@ struct perf_inject { bool sched_stat; bool have_auxtrace; bool strip; + bool jit_mode; const char *input_name; struct perf_data_file output; u64 bytes_written; @@ -71,6 +73,15 @@ static int perf_event__repipe_oe_synth(struct perf_tool *tool, return perf_event__repipe_synth(tool, event); } +#ifdef HAVE_LIBELF_SUPPORT +static int perf_event__drop_oe(struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct ordered_events *oe __maybe_unused) +{ + return 0; +} +#endif + static int perf_event__repipe_op2_synth(struct perf_tool *tool, union perf_event *event, struct perf_session *session @@ -234,6 +245,27 @@ static int perf_event__repipe_mmap(struct perf_tool *tool, return err; } +#ifdef HAVE_LIBELF_SUPPORT +static int perf_event__jit_repipe_mmap(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine) +{ + struct perf_inject *inject = container_of(tool, struct perf_inject, tool); + u64 n = 0; + + /* + * if jit marker, then inject jit mmaps and generate ELF images + */ + if (!jit_process(inject->session, &inject->output, machine, + event->mmap.filename, sample->pid, &n)) { + inject->bytes_written += n; + return 0; + } + return perf_event__repipe_mmap(tool, event, sample, machine); +} +#endif + static int perf_event__repipe_mmap2(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, @@ -247,6 +279,27 @@ static int perf_event__repipe_mmap2(struct perf_tool *tool, return err; } +#ifdef HAVE_LIBELF_SUPPORT +static int perf_event__jit_repipe_mmap2(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine) +{ + struct perf_inject *inject = container_of(tool, struct perf_inject, tool); + u64 n = 0; + + /* + * if jit marker, then inject jit mmaps and generate ELF images + */ + if (!jit_process(inject->session, &inject->output, machine, + event->mmap2.filename, sample->pid, &n)) { + inject->bytes_written += n; + return 0; + } + return perf_event__repipe_mmap2(tool, event, sample, machine); +} +#endif + static int perf_event__repipe_fork(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, @@ -664,6 +717,23 @@ static int __cmd_inject(struct perf_inject *inject) return ret; } +#ifdef HAVE_LIBELF_SUPPORT +static int +jit_validate_events(struct perf_session *session) +{ + struct perf_evsel *evsel; + + /* + * check that all events use CLOCK_MONOTONIC + */ + evlist__for_each(session->evlist, evsel) { + if (evsel->attr.use_clockid == 0 || evsel->attr.clockid != CLOCK_MONOTONIC) + return -1; + } + return 0; +} +#endif + int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused) { struct perf_inject inject = { @@ -703,7 +773,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused) }; int ret; - const struct option options[] = { + struct option options[] = { OPT_BOOLEAN('b', "build-ids", &inject.build_ids, "Inject build-ids into the output stream"), OPT_STRING('i', "input", &inject.input_name, "file", @@ -713,6 +783,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix 
__maybe_unused) OPT_BOOLEAN('s', "sched-stat", &inject.sched_stat, "Merge sched-stat and sched-switch for getting events " "where and how long tasks slept"), + OPT_BOOLEAN('j', "jit", &inject.jit_mode, "merge jitdump files into perf.data file"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show build ids, etc)"), OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, "file", @@ -729,7 +800,9 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused) "perf inject []", NULL }; - +#ifndef HAVE_LIBELF_SUPPORT + set_option_nobuild(options, 'j', "jit", "NO_LIBELF=1", true); +#endif argc = parse_options(argc, argv, options, inject_usage, 0); /* @@ -765,7 +838,26 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused) inject.tool.ordered_events = true; inject.tool.ordering_requires_timestamps = true; } - +#ifdef HAVE_LIBELF_SUPPORT + if (inject.jit_mode) { + /* + * validate event is using the correct clockid + */ + if (jit_validate_events(inject.session)) { + fprintf(stderr, "error, jitted code must be sampled with perf record -k 1\n"); + return -1; + } + inject.tool.mmap2 = perf_event__jit_repipe_mmap2; + inject.tool.mmap = perf_event__jit_repipe_mmap; + inject.tool.ordered_events = true; + inject.tool.ordering_requires_timestamps = true; + /* + * JIT MMAP injection injects all MMAP events in one go, so it + * does not obey finished_round semantics. + */ + inject.tool.finished_round = perf_event__drop_oe; + } +#endif ret = symbol__init(&inject.session->header.env); if (ret < 0) goto out_delete; diff --git a/tools/perf/util/Build b/tools/perf/util/Build index edae107..52a4a80 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -106,6 +106,8 @@ libperf-y += scripting-engines/ libperf-$(CONFIG_ZLIB) += zlib.o libperf-$(CONFIG_LZMA) += lzma.o libperf-y += demangle-java.o +libperf-$(CONFIG_LIBELF) += jitdump.o +libperf-$(CONFIG_LIBELF) += genelf.o CFLAGS_config.o += -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" diff --git a/tools/perf/util/genelf.c b/tools/perf/util/genelf.c new file mode 100644 index 0000000..145f811 --- /dev/null +++ b/tools/perf/util/genelf.c @@ -0,0 +1,442 @@ +/* + * genelf.c + * Copyright (C) 2014, Google, Inc + * + * Contributed by: + * Stephane Eranian + * + * Released under the GPL v2. 
(and only v2, not any later version) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "perf.h" +#include "genelf.h" +#include "../util/jitdump.h" + +#define JVMTI + +#define BUILD_ID_URANDOM /* different uuid for each run */ + +#ifdef HAVE_LIBCRYPTO + +#define BUILD_ID_MD5 +#undef BUILD_ID_SHA /* does not seem to work well when linked with Java */ +#undef BUILD_ID_URANDOM /* different uuid for each run */ + +#ifdef BUILD_ID_SHA +#include +#endif + +#ifdef BUILD_ID_MD5 +#include +#endif +#endif + + +typedef struct { + unsigned int namesz; /* Size of entry's owner string */ + unsigned int descsz; /* Size of the note descriptor */ + unsigned int type; /* Interpretation of the descriptor */ + char name[0]; /* Start of the name+desc data */ +} Elf_Note; + +struct options { + char *output; + int fd; +}; + +static char shd_string_table[] = { + 0, + '.', 't', 'e', 'x', 't', 0, /* 1 */ + '.', 's', 'h', 's', 't', 'r', 't', 'a', 'b', 0, /* 7 */ + '.', 's', 'y', 'm', 't', 'a', 'b', 0, /* 17 */ + '.', 's', 't', 'r', 't', 'a', 'b', 0, /* 25 */ + '.', 'n', 'o', 't', 'e', '.', 'g', 'n', 'u', '.', 'b', 'u', 'i', 'l', 'd', '-', 'i', 'd', 0, /* 33 */ + '.', 'd', 'e', 'b', 'u', 'g', '_', 'l', 'i', 'n', 'e', 0, /* 52 */ + '.', 'd', 'e', 'b', 'u', 'g', '_', 'i', 'n', 'f', 'o', 0, /* 64 */ + '.', 'd', 'e', 'b', 'u', 'g', '_', 'a', 'b', 'b', 'r', 'e', 'v', 0, /* 76 */ +}; + +static struct buildid_note { + Elf_Note desc; /* descsz: size of build-id, must be multiple of 4 */ + char name[4]; /* GNU\0 */ + char build_id[20]; +} bnote; + +static Elf_Sym symtab[]={ + /* symbol 0 MUST be the undefined symbol */ + { .st_name = 0, /* index in sym_string table */ + .st_info = ELF_ST_TYPE(STT_NOTYPE), + .st_shndx = 0, /* for now */ + .st_value = 0x0, + .st_other = ELF_ST_VIS(STV_DEFAULT), + .st_size = 0, + }, + { .st_name = 1, /* index in sym_string table */ + .st_info = ELF_ST_BIND(STB_LOCAL) | ELF_ST_TYPE(STT_FUNC), + .st_shndx = 1, + .st_value = 0, /* for now */ + .st_other = ELF_ST_VIS(STV_DEFAULT), + .st_size = 0, /* for now */ + } +}; + +#ifdef BUILD_ID_URANDOM +static void +gen_build_id(struct buildid_note *note, + unsigned long load_addr __maybe_unused, + const void *code __maybe_unused, + size_t csize __maybe_unused) +{ + int fd; + size_t sz = sizeof(note->build_id); + ssize_t sret; + + fd = open("/dev/urandom", O_RDONLY); + if (fd == -1) + err(1, "cannot access /dev/urandom for builid"); + + sret = read(fd, note->build_id, sz); + + close(fd); + + if (sret != (ssize_t)sz) + memset(note->build_id, 0, sz); +} +#endif + +#ifdef BUILD_ID_SHA +static void +gen_build_id(struct buildid_note *note, + unsigned long load_addr __maybe_unused, + const void *code, + size_t csize) +{ + if (sizeof(note->build_id) < SHA_DIGEST_LENGTH) + errx(1, "build_id too small for SHA1"); + + SHA1(code, csize, (unsigned char *)note->build_id); +} +#endif + +#ifdef BUILD_ID_MD5 +static void +gen_build_id(struct buildid_note *note, unsigned long load_addr, const void *code, size_t csize) +{ + MD5_CTX context; + + if (sizeof(note->build_id) < 16) + errx(1, "build_id too small for MD5"); + + MD5_Init(&context); + MD5_Update(&context, &load_addr, sizeof(load_addr)); + MD5_Update(&context, code, csize); + MD5_Final((unsigned char *)note->build_id, &context); +} +#endif + +/* + * fd: file descriptor open for writing for the output file + * load_addr: code load address (could be zero, just used for buildid) + * sym: function name (for native code - used as the 
symbol) + * code: the native code + * csize: the code size in bytes + */ +int +jit_write_elf(int fd, uint64_t load_addr, const char *sym, + const void *code, int csize) +{ + Elf *e; + Elf_Data *d; + Elf_Scn *scn; + Elf_Ehdr *ehdr; + Elf_Shdr *shdr; + char *strsym = NULL; + int symlen; + int retval = -1; + + if (elf_version(EV_CURRENT) == EV_NONE) { + warnx("ELF initialization failed"); + return -1; + } + + e = elf_begin(fd, ELF_C_WRITE, NULL); + if (!e) { + warnx("elf_begin failed"); + goto error; + } + + /* + * setup ELF header + */ + ehdr = elf_newehdr(e); + if (!ehdr) { + warnx("cannot get ehdr"); + goto error; + } + + ehdr->e_ident[EI_DATA] = GEN_ELF_ENDIAN; + ehdr->e_ident[EI_CLASS] = GEN_ELF_CLASS; + ehdr->e_machine = GEN_ELF_ARCH; + ehdr->e_type = ET_DYN; + ehdr->e_entry = GEN_ELF_TEXT_OFFSET; + ehdr->e_version = EV_CURRENT; + ehdr->e_shstrndx= 2; /* shdr index for section name */ + + /* + * setup text section + */ + scn = elf_newscn(e); + if (!scn) { + warnx("cannot create section"); + goto error; + } + + d = elf_newdata(scn); + if (!d) { + warnx("cannot get new data"); + goto error; + } + + d->d_align = 16; + d->d_off = 0LL; + d->d_buf = (void *)code; + d->d_type = ELF_T_BYTE; + d->d_size = csize; + d->d_version = EV_CURRENT; + + shdr = elf_getshdr(scn); + if (!shdr) { + warnx("cannot get section header"); + goto error; + } + + shdr->sh_name = 1; + shdr->sh_type = SHT_PROGBITS; + shdr->sh_addr = GEN_ELF_TEXT_OFFSET; + shdr->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + shdr->sh_entsize = 0; + + /* + * setup section headers string table + */ + scn = elf_newscn(e); + if (!scn) { + warnx("cannot create section"); + goto error; + } + + d = elf_newdata(scn); + if (!d) { + warnx("cannot get new data"); + goto error; + } + + d->d_align = 1; + d->d_off = 0LL; + d->d_buf = shd_string_table; + d->d_type = ELF_T_BYTE; + d->d_size = sizeof(shd_string_table); + d->d_version = EV_CURRENT; + + shdr = elf_getshdr(scn); + if (!shdr) { + warnx("cannot get section header"); + goto error; + } + + shdr->sh_name = 7; /* offset of '.shstrtab' in shd_string_table */ + shdr->sh_type = SHT_STRTAB; + shdr->sh_flags = 0; + shdr->sh_entsize = 0; + + /* + * setup symtab section + */ + symtab[1].st_size = csize; + symtab[1].st_value = GEN_ELF_TEXT_OFFSET; + + scn = elf_newscn(e); + if (!scn) { + warnx("cannot create section"); + goto error; + } + + d = elf_newdata(scn); + if (!d) { + warnx("cannot get new data"); + goto error; + } + + d->d_align = 8; + d->d_off = 0LL; + d->d_buf = symtab; + d->d_type = ELF_T_SYM; + d->d_size = sizeof(symtab); + d->d_version = EV_CURRENT; + + shdr = elf_getshdr(scn); + if (!shdr) { + warnx("cannot get section header"); + goto error; + } + + shdr->sh_name = 17; /* offset of '.symtab' in shd_string_table */ + shdr->sh_type = SHT_SYMTAB; + shdr->sh_flags = 0; + shdr->sh_entsize = sizeof(Elf_Sym); + shdr->sh_link = 4; /* index of .strtab section */ + + /* + * setup symbols string table + * 2 = 1 for 0 in 1st entry, 1 for the 0 at end of symbol for 2nd entry + */ + symlen = 2 + strlen(sym); + strsym = calloc(1, symlen); + if (!strsym) { + warnx("cannot allocate strsym"); + goto error; + } + strcpy(strsym + 1, sym); + + scn = elf_newscn(e); + if (!scn) { + warnx("cannot create section"); + goto error; + } + + d = elf_newdata(scn); + if (!d) { + warnx("cannot get new data"); + goto error; + } + + d->d_align = 1; + d->d_off = 0LL; + d->d_buf = strsym; + d->d_type = ELF_T_BYTE; + d->d_size = symlen; + d->d_version = EV_CURRENT; + + shdr = elf_getshdr(scn); + if (!shdr) { + warnx("cannot get 
section header"); + goto error; + } + + shdr->sh_name = 25; /* offset in shd_string_table */ + shdr->sh_type = SHT_STRTAB; + shdr->sh_flags = 0; + shdr->sh_entsize = 0; + + /* + * setup build-id section + */ + scn = elf_newscn(e); + if (!scn) { + warnx("cannot create section"); + goto error; + } + + d = elf_newdata(scn); + if (!d) { + warnx("cannot get new data"); + goto error; + } + + /* + * build-id generation + */ + gen_build_id(&bnote, load_addr, code, csize); + bnote.desc.namesz = sizeof(bnote.name); /* must include 0 termination */ + bnote.desc.descsz = sizeof(bnote.build_id); + bnote.desc.type = NT_GNU_BUILD_ID; + strcpy(bnote.name, "GNU"); + + d->d_align = 4; + d->d_off = 0LL; + d->d_buf = &bnote; + d->d_type = ELF_T_BYTE; + d->d_size = sizeof(bnote); + d->d_version = EV_CURRENT; + + shdr = elf_getshdr(scn); + if (!shdr) { + warnx("cannot get section header"); + goto error; + } + + shdr->sh_name = 33; /* offset in shd_string_table */ + shdr->sh_type = SHT_NOTE; + shdr->sh_addr = 0x0; + shdr->sh_flags = SHF_ALLOC; + shdr->sh_size = sizeof(bnote); + shdr->sh_entsize = 0; + + if (elf_update(e, ELF_C_WRITE) < 0) { + warnx("elf_update 4 failed"); + goto error; + } + + retval = 0; +error: + (void)elf_end(e); + + free(strsym); + + + return retval; +} + +#ifndef JVMTI + +static unsigned char x86_code[] = { + 0xBB, 0x2A, 0x00, 0x00, 0x00, /* movl $42, %ebx */ + 0xB8, 0x01, 0x00, 0x00, 0x00, /* movl $1, %eax */ + 0xCD, 0x80 /* int $0x80 */ +}; + +static struct options options; + +int main(int argc, char **argv) +{ + int c, fd, ret; + + while ((c = getopt(argc, argv, "o:h")) != -1) { + switch (c) { + case 'o': + options.output = optarg; + break; + case 'h': + printf("Usage: genelf -o output_file [-h]\n"); + return 0; + default: + errx(1, "unknown option"); + } + } + + fd = open(options.output, O_CREAT|O_TRUNC|O_RDWR, 0666); + if (fd == -1) + err(1, "cannot create file %s", options.output); + + ret = jit_write_elf(fd, "main", x86_code, sizeof(x86_code)); + close(fd); + + if (ret != 0) + unlink(options.output); + + return ret; +} +#endif diff --git a/tools/perf/util/genelf.h b/tools/perf/util/genelf.h new file mode 100644 index 0000000..d8e9ece --- /dev/null +++ b/tools/perf/util/genelf.h @@ -0,0 +1,63 @@ +#ifndef __GENELF_H__ +#define __GENELF_H__ + +/* genelf.c */ +extern int jit_write_elf(int fd, uint64_t code_addr, const char *sym, + const void *code, int csize); + +#if defined(__arm__) +#define GEN_ELF_ARCH EM_ARM +#define GEN_ELF_ENDIAN ELFDATA2LSB +#define GEN_ELF_CLASS ELFCLASS32 +#elif defined(__aarch64__) +#define GEN_ELF_ARCH EM_AARCH64 +#define GEN_ELF_ENDIAN ELFDATA2LSB +#define GEN_ELF_CLASS ELFCLASS64 +#elif defined(__x86_64__) +#define GEN_ELF_ARCH EM_X86_64 +#define GEN_ELF_ENDIAN ELFDATA2LSB +#define GEN_ELF_CLASS ELFCLASS64 +#elif defined(__i386__) +#define GEN_ELF_ARCH EM_386 +#define GEN_ELF_ENDIAN ELFDATA2LSB +#define GEN_ELF_CLASS ELFCLASS32 +#elif defined(__ppcle__) +#define GEN_ELF_ARCH EM_PPC +#define GEN_ELF_ENDIAN ELFDATA2LSB +#define GEN_ELF_CLASS ELFCLASS64 +#elif defined(__powerpc__) +#define GEN_ELF_ARCH EM_PPC64 +#define GEN_ELF_ENDIAN ELFDATA2MSB +#define GEN_ELF_CLASS ELFCLASS64 +#elif defined(__powerpcle__) +#define GEN_ELF_ARCH EM_PPC64 +#define GEN_ELF_ENDIAN ELFDATA2LSB +#define GEN_ELF_CLASS ELFCLASS64 +#else +#error "unsupported architecture" +#endif + +#if GEN_ELF_CLASS == ELFCLASS64 +#define elf_newehdr elf64_newehdr +#define elf_getshdr elf64_getshdr +#define Elf_Ehdr Elf64_Ehdr +#define Elf_Shdr Elf64_Shdr +#define Elf_Sym Elf64_Sym +#define 
ELF_ST_TYPE(a) ELF64_ST_TYPE(a) +#define ELF_ST_BIND(a) ELF64_ST_BIND(a) +#define ELF_ST_VIS(a) ELF64_ST_VISIBILITY(a) +#else +#define elf_newehdr elf32_newehdr +#define elf_getshdr elf32_getshdr +#define Elf_Ehdr Elf32_Ehdr +#define Elf_Shdr Elf32_Shdr +#define Elf_Sym Elf32_Sym +#define ELF_ST_TYPE(a) ELF32_ST_TYPE(a) +#define ELF_ST_BIND(a) ELF32_ST_BIND(a) +#define ELF_ST_VIS(a) ELF32_ST_VISIBILITY(a) +#endif + +/* The .text section is directly after the ELF header */ +#define GEN_ELF_TEXT_OFFSET sizeof(Elf_Ehdr) + +#endif diff --git a/tools/perf/util/jit.h b/tools/perf/util/jit.h new file mode 100644 index 0000000..a1e99da --- /dev/null +++ b/tools/perf/util/jit.h @@ -0,0 +1,15 @@ +#ifndef __JIT_H__ +#define __JIT_H__ + +#include + +extern int jit_process(struct perf_session *session, + struct perf_data_file *output, + struct machine *machine, + char *filename, + pid_t pid, + u64 *nbytes); + +extern int jit_inject_record(const char *filename); + +#endif /* __JIT_H__ */ diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c new file mode 100644 index 0000000..9f7a012 --- /dev/null +++ b/tools/perf/util/jitdump.c @@ -0,0 +1,670 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util.h" +#include "event.h" +#include "debug.h" +#include "evlist.h" +#include "symbol.h" +#include "strlist.h" +#include + +#include "session.h" +#include "jit.h" +#include "jitdump.h" +#include "genelf.h" +#include "../builtin.h" + +struct jit_buf_desc { + struct perf_data_file *output; + struct perf_session *session; + struct machine *machine; + union jr_entry *entry; + void *buf; + uint64_t sample_type; + size_t bufsize; + FILE *in; + bool needs_bswap; /* handles cross-endianess */ + void *debug_data; + size_t nr_debug_entries; + uint32_t code_load_count; + u64 bytes_written; + struct rb_root code_root; + char dir[PATH_MAX]; +}; + +struct debug_line_info { + unsigned long vma; + unsigned int lineno; + /* The filename format is unspecified, absolute path, relative etc. */ + char const filename[0]; +}; + +struct jit_tool { + struct perf_tool tool; + struct perf_data_file output; + struct perf_data_file input; + u64 bytes_written; +}; + +#define hmax(a, b) ((a) > (b) ? 
(a) : (b)) +#define get_jit_tool(t) (container_of(tool, struct jit_tool, tool)) + +static int +jit_emit_elf(char *filename, + const char *sym, + uint64_t code_addr, + const void *code, + int csize) +{ + int ret, fd; + + if (verbose > 0) + fprintf(stderr, "write ELF image %s\n", filename); + + fd = open(filename, O_CREAT|O_TRUNC|O_WRONLY, 0644); + if (fd == -1) { + pr_warning("cannot create jit ELF %s: %s\n", filename, strerror(errno)); + return -1; + } + + ret = jit_write_elf(fd, code_addr, sym, (const void *)code, csize); + + close(fd); + + if (ret) + unlink(filename); + + return ret; +} + +static void +jit_close(struct jit_buf_desc *jd) +{ + if (!(jd && jd->in)) + return; + funlockfile(jd->in); + fclose(jd->in); + jd->in = NULL; +} + +static int +jit_open(struct jit_buf_desc *jd, const char *name) +{ + struct jitheader header; + struct jr_prefix *prefix; + ssize_t bs, bsz = 0; + void *n, *buf = NULL; + int ret, retval = -1; + + jd->in = fopen(name, "r"); + if (!jd->in) + return -1; + + bsz = hmax(sizeof(header), sizeof(*prefix)); + + buf = malloc(bsz); + if (!buf) + goto error; + + /* + * protect from writer modifying the file while we are reading it + */ + flockfile(jd->in); + + ret = fread(buf, sizeof(header), 1, jd->in); + if (ret != 1) + goto error; + + memcpy(&header, buf, sizeof(header)); + + if (header.magic != JITHEADER_MAGIC) { + if (header.magic != JITHEADER_MAGIC_SW) + goto error; + jd->needs_bswap = true; + } + + if (jd->needs_bswap) { + header.version = bswap_32(header.version); + header.total_size = bswap_32(header.total_size); + header.pid = bswap_32(header.pid); + header.elf_mach = bswap_32(header.elf_mach); + header.timestamp = bswap_64(header.timestamp); + header.flags = bswap_64(header.flags); + } + + if (verbose > 2) + pr_debug("version=%u\nhdr.size=%u\nts=0x%llx\npid=%d\nelf_mach=%d\n", + header.version, + header.total_size, + (unsigned long long)header.timestamp, + header.pid, + header.elf_mach); + + if (header.flags & JITDUMP_FLAGS_RESERVED) { + pr_err("jitdump file contains invalid or unsupported flags 0x%llx\n", + (unsigned long long)header.flags & JITDUMP_FLAGS_RESERVED); + goto error; + } + + bs = header.total_size - sizeof(header); + + if (bs > bsz) { + n = realloc(buf, bs); + if (!n) + goto error; + bsz = bs; + buf = n; + /* read extra we do not know about */ + ret = fread(buf, bs - bsz, 1, jd->in); + if (ret != 1) + goto error; + } + /* + * keep dirname for generating files and mmap records + */ + strcpy(jd->dir, name); + dirname(jd->dir); + + return 0; +error: + funlockfile(jd->in); + fclose(jd->in); + return retval; +} + +static union jr_entry * +jit_get_next_entry(struct jit_buf_desc *jd) +{ + struct jr_prefix *prefix; + union jr_entry *jr; + void *addr; + size_t bs, size; + int id, ret; + + if (!(jd && jd->in)) + return NULL; + + if (jd->buf == NULL) { + size_t sz = getpagesize(); + if (sz < sizeof(*prefix)) + sz = sizeof(*prefix); + + jd->buf = malloc(sz); + if (jd->buf == NULL) + return NULL; + + jd->bufsize = sz; + } + + prefix = jd->buf; + + /* + * file is still locked at this point + */ + ret = fread(prefix, sizeof(*prefix), 1, jd->in); + if (ret != 1) + return NULL; + + if (jd->needs_bswap) { + prefix->id = bswap_32(prefix->id); + prefix->total_size = bswap_32(prefix->total_size); + prefix->timestamp = bswap_64(prefix->timestamp); + } + id = prefix->id; + size = prefix->total_size; + + bs = (size_t)size; + if (bs < sizeof(*prefix)) + return NULL; + + if (id >= JIT_CODE_MAX) { + pr_warning("next_entry: unknown prefix %d, skipping\n", id); + return 
NULL; + } + if (bs > jd->bufsize) { + void *n; + n = realloc(jd->buf, bs); + if (!n) + return NULL; + jd->buf = n; + jd->bufsize = bs; + } + + addr = ((void *)jd->buf) + sizeof(*prefix); + + ret = fread(addr, bs - sizeof(*prefix), 1, jd->in); + if (ret != 1) + return NULL; + + jr = (union jr_entry *)jd->buf; + + switch(id) { + case JIT_CODE_DEBUG_INFO: + if (jd->needs_bswap) { + uint64_t n; + jr->info.code_addr = bswap_64(jr->info.code_addr); + jr->info.nr_entry = bswap_64(jr->info.nr_entry); + for (n = 0 ; n < jr->info.nr_entry; n++) { + jr->info.entries[n].addr = bswap_64(jr->info.entries[n].addr); + jr->info.entries[n].lineno = bswap_32(jr->info.entries[n].lineno); + jr->info.entries[n].discrim = bswap_32(jr->info.entries[n].discrim); + } + } + break; + case JIT_CODE_CLOSE: + break; + case JIT_CODE_LOAD: + if (jd->needs_bswap) { + jr->load.pid = bswap_32(jr->load.pid); + jr->load.tid = bswap_32(jr->load.tid); + jr->load.vma = bswap_64(jr->load.vma); + jr->load.code_addr = bswap_64(jr->load.code_addr); + jr->load.code_size = bswap_64(jr->load.code_size); + jr->load.code_index= bswap_64(jr->load.code_index); + } + jd->code_load_count++; + break; + case JIT_CODE_MOVE: + if (jd->needs_bswap) { + jr->move.pid = bswap_32(jr->move.pid); + jr->move.tid = bswap_32(jr->move.tid); + jr->move.vma = bswap_64(jr->move.vma); + jr->move.old_code_addr = bswap_64(jr->move.old_code_addr); + jr->move.new_code_addr = bswap_64(jr->move.new_code_addr); + jr->move.code_size = bswap_64(jr->move.code_size); + jr->move.code_index = bswap_64(jr->move.code_index); + } + break; + case JIT_CODE_MAX: + default: + return NULL; + } + return jr; +} + +static int +jit_inject_event(struct jit_buf_desc *jd, union perf_event *event) +{ + ssize_t size; + + size = perf_data_file__write(jd->output, event, event->header.size); + if (size < 0) + return -1; + + jd->bytes_written += size; + return 0; +} + +static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr) +{ + struct perf_sample sample; + union perf_event *event; + struct perf_tool *tool = jd->session->tool; + uint64_t code, addr; + uintptr_t uaddr; + char *filename; + struct stat st; + size_t size; + u16 idr_size; + const char *sym; + uint32_t count; + int ret, csize; + pid_t pid, tid; + struct { + u32 pid, tid; + u64 time; + } *id; + + pid = jr->load.pid; + tid = jr->load.tid; + csize = jr->load.code_size; + addr = jr->load.code_addr; + sym = (void *)((unsigned long)jr + sizeof(jr->load)); + code = (unsigned long)jr + jr->load.p.total_size - csize; + count = jr->load.code_index; + idr_size = jd->machine->id_hdr_size; + + event = calloc(1, sizeof(*event) + idr_size); + if (!event) + return -1; + + filename = event->mmap2.filename; + size = snprintf(filename, PATH_MAX, "%s/jitted-%d-%u.so", + jd->dir, + pid, + count); + + size++; /* for \0 */ + + size = PERF_ALIGN(size, sizeof(u64)); + uaddr = (uintptr_t)code; + ret = jit_emit_elf(filename, sym, addr, (const void *)uaddr, csize); + + if (jd->debug_data && jd->nr_debug_entries) { + free(jd->debug_data); + jd->debug_data = NULL; + jd->nr_debug_entries = 0; + } + + if (ret) { + free(event); + return -1; + } + if (stat(filename, &st)) + memset(&st, 0, sizeof(stat)); + + event->mmap2.header.type = PERF_RECORD_MMAP2; + event->mmap2.header.misc = PERF_RECORD_MISC_USER; + event->mmap2.header.size = (sizeof(event->mmap2) - + (sizeof(event->mmap2.filename) - size) + idr_size); + + event->mmap2.pgoff = GEN_ELF_TEXT_OFFSET; + event->mmap2.start = addr; + event->mmap2.len = csize; + event->mmap2.pid = pid; + 
event->mmap2.tid = tid; + event->mmap2.ino = st.st_ino; + event->mmap2.maj = major(st.st_dev); + event->mmap2.min = minor(st.st_dev); + event->mmap2.prot = st.st_mode; + event->mmap2.flags = MAP_SHARED; + event->mmap2.ino_generation = 1; + + id = (void *)((unsigned long)event + event->mmap.header.size - idr_size); + if (jd->sample_type & PERF_SAMPLE_TID) { + id->pid = pid; + id->tid = tid; + } + if (jd->sample_type & PERF_SAMPLE_TIME) + id->time = jr->load.p.timestamp; + + /* + * create pseudo sample to induce dso hit increment + * use first address as sample address + */ + memset(&sample, 0, sizeof(sample)); + sample.pid = pid; + sample.tid = tid; + sample.time = id->time; + sample.ip = addr; + + ret = perf_event__process_mmap2(tool, event, &sample, jd->machine); + if (ret) + return ret; + + ret = jit_inject_event(jd, event); + /* + * mark dso as use to generate buildid in the header + */ + if (!ret) + build_id__mark_dso_hit(tool, event, &sample, NULL, jd->machine); + + return ret; +} + +static int jit_repipe_code_move(struct jit_buf_desc *jd, union jr_entry *jr) +{ + struct perf_sample sample; + union perf_event *event; + struct perf_tool *tool = jd->session->tool; + char *filename; + size_t size; + struct stat st; + u16 idr_size; + int ret; + pid_t pid, tid; + struct { + u32 pid, tid; + u64 time; + } *id; + + pid = jr->move.pid; + tid = jr->move.tid; + idr_size = jd->machine->id_hdr_size; + + /* + * +16 to account for sample_id_all (hack) + */ + event = calloc(1, sizeof(*event) + 16); + if (!event) + return -1; + + filename = event->mmap2.filename; + size = snprintf(filename, PATH_MAX, "%s/jitted-%d-%"PRIu64, + jd->dir, + pid, + jr->move.code_index); + + size++; /* for \0 */ + + if (stat(filename, &st)) + memset(&st, 0, sizeof(stat)); + + size = PERF_ALIGN(size, sizeof(u64)); + + event->mmap2.header.type = PERF_RECORD_MMAP2; + event->mmap2.header.misc = PERF_RECORD_MISC_USER; + event->mmap2.header.size = (sizeof(event->mmap2) - + (sizeof(event->mmap2.filename) - size) + idr_size); + event->mmap2.pgoff = GEN_ELF_TEXT_OFFSET; + event->mmap2.start = jr->move.new_code_addr; + event->mmap2.len = jr->move.code_size; + event->mmap2.pid = pid; + event->mmap2.tid = tid; + event->mmap2.ino = st.st_ino; + event->mmap2.maj = major(st.st_dev); + event->mmap2.min = minor(st.st_dev); + event->mmap2.prot = st.st_mode; + event->mmap2.flags = MAP_SHARED; + event->mmap2.ino_generation = 1; + + id = (void *)((unsigned long)event + event->mmap.header.size - idr_size); + if (jd->sample_type & PERF_SAMPLE_TID) { + id->pid = pid; + id->tid = tid; + } + if (jd->sample_type & PERF_SAMPLE_TIME) + id->time = jr->load.p.timestamp; + + /* + * create pseudo sample to induce dso hit increment + * use first address as sample address + */ + memset(&sample, 0, sizeof(sample)); + sample.pid = pid; + sample.tid = tid; + sample.time = id->time; + sample.ip = jr->move.new_code_addr; + + ret = perf_event__process_mmap2(tool, event, &sample, jd->machine); + if (ret) + return ret; + + ret = jit_inject_event(jd, event); + if (!ret) + build_id__mark_dso_hit(tool, event, &sample, NULL, jd->machine); + + return ret; +} + +static int jit_repipe_debug_info(struct jit_buf_desc *jd, union jr_entry *jr) +{ + void *data; + size_t sz; + + if (!(jd && jr)) + return -1; + + sz = jr->prefix.total_size - sizeof(jr->info); + data = malloc(sz); + if (!data) + return -1; + + memcpy(data, &jr->info.entries, sz); + + jd->debug_data = data; + + /* + * we must use nr_entry instead of size here because + * we cannot distinguish actual entry from 
padding otherwise + */ + jd->nr_debug_entries = jr->info.nr_entry; + + return 0; +} + +static int +jit_process_dump(struct jit_buf_desc *jd) +{ + union jr_entry *jr; + int ret; + + while ((jr = jit_get_next_entry(jd))) { + switch(jr->prefix.id) { + case JIT_CODE_LOAD: + ret = jit_repipe_code_load(jd, jr); + break; + case JIT_CODE_MOVE: + ret = jit_repipe_code_move(jd, jr); + break; + case JIT_CODE_DEBUG_INFO: + ret = jit_repipe_debug_info(jd, jr); + break; + default: + ret = 0; + continue; + } + } + return ret; +} + +static int +jit_inject(struct jit_buf_desc *jd, char *path) +{ + int ret; + + if (verbose > 0) + fprintf(stderr, "injecting: %s\n", path); + + ret = jit_open(jd, path); + if (ret) + return -1; + + ret = jit_process_dump(jd); + + jit_close(jd); + + if (verbose > 0) + fprintf(stderr, "injected: %s (%d)\n", path, ret); + + return 0; +} + +/* + * File must be with pattern .../jit-XXXX.dump + * where XXXX is the PID of the process which did the mmap() + * as captured in the RECORD_MMAP record + */ +static int +jit_detect(char *mmap_name, pid_t pid) + { + char *p; + char *end = NULL; + pid_t pid2; + + if (verbose > 2) + fprintf(stderr, "jit marker trying : %s\n", mmap_name); + /* + * get file name + */ + p = strrchr(mmap_name, '/'); + if (!p) + return -1; + + /* + * match prefix + */ + if (strncmp(p, "/jit-", 5)) + return -1; + + /* + * skip prefix + */ + p += 5; + + /* + * must be followed by a pid + */ + if (!isdigit(*p)) + return -1; + + pid2 = (int)strtol(p, &end, 10); + if (!end) + return -1; + + /* + * pid does not match mmap pid + * pid==0 in system-wide mode (synthesized) + */ + if (pid && pid2 != pid) + return -1; + /* + * validate suffix + */ + if (strcmp(end, ".dump")) + return -1; + + if (verbose > 0) + fprintf(stderr, "jit marker found: %s\n", mmap_name); + + return 0; +} + +int +jit_process(struct perf_session *session, + struct perf_data_file *output, + struct machine *machine, + char *filename, + pid_t pid, + u64 *nbytes) +{ + struct perf_evsel *first; + struct jit_buf_desc jd; + int ret; + + /* + * first, detect marker mmap (i.e., the jitdump mmap) + */ + if (jit_detect(filename, pid)) + return -1; + + memset(&jd, 0, sizeof(jd)); + + jd.session = session; + jd.output = output; + jd.machine = machine; + + /* + * track sample_type to compute id_all layout + * perf sets the same sample type to all events as of now + */ + first = perf_evlist__first(session->evlist); + jd.sample_type = first->attr.sample_type; + + *nbytes = 0; + + ret = jit_inject(&jd, filename); + if (!ret) + *nbytes = jd.bytes_written; + + return ret; +} diff --git a/tools/perf/util/jitdump.h b/tools/perf/util/jitdump.h new file mode 100644 index 0000000..b66c1f5 --- /dev/null +++ b/tools/perf/util/jitdump.h @@ -0,0 +1,124 @@ +/* + * jitdump.h: jitted code info encapsulation file format + * + * Adapted from OProfile GPLv2 support jidump.h: + * Copyright 2007 OProfile authors + * Jens Wilke + * Daniel Hansel + * Copyright IBM Corporation 2007 + */ +#ifndef JITDUMP_H +#define JITDUMP_H + +#include +#include +#include + +/* JiTD */ +#define JITHEADER_MAGIC 0x4A695444 +#define JITHEADER_MAGIC_SW 0x4454694A + +#define PADDING_8ALIGNED(x) ((((x) + 7) & 7) ^ 7) + +#define JITHEADER_VERSION 1 + +enum jitdump_flags_bits { + JITDUMP_FLAGS_MAX_BIT, +}; + +#define JITDUMP_FLAGS_RESERVED (JITDUMP_FLAGS_MAX_BIT < 64 ? 
\ + (~((1ULL << JITDUMP_FLAGS_MAX_BIT) - 1)) : 0) + +struct jitheader { + uint32_t magic; /* characters "jItD" */ + uint32_t version; /* header version */ + uint32_t total_size; /* total size of header */ + uint32_t elf_mach; /* elf mach target */ + uint32_t pad1; /* reserved */ + uint32_t pid; /* JIT process id */ + uint64_t timestamp; /* timestamp */ + uint64_t flags; /* flags */ +}; + +enum jit_record_type { + JIT_CODE_LOAD = 0, + JIT_CODE_MOVE = 1, + JIT_CODE_DEBUG_INFO = 2, + JIT_CODE_CLOSE = 3, + + JIT_CODE_MAX, +}; + +/* record prefix (mandatory in each record) */ +struct jr_prefix { + uint32_t id; + uint32_t total_size; + uint64_t timestamp; +}; + +struct jr_code_load { + struct jr_prefix p; + + uint32_t pid; + uint32_t tid; + uint64_t vma; + uint64_t code_addr; + uint64_t code_size; + uint64_t code_index; +}; + +struct jr_code_close { + struct jr_prefix p; +}; + +struct jr_code_move { + struct jr_prefix p; + + uint32_t pid; + uint32_t tid; + uint64_t vma; + uint64_t old_code_addr; + uint64_t new_code_addr; + uint64_t code_size; + uint64_t code_index; +}; + +struct debug_entry { + uint64_t addr; + int lineno; /* source line number starting at 1 */ + int discrim; /* column discriminator, 0 is default */ + const char name[0]; /* null terminated filename, \xff\0 if same as previous entry */ +}; + +struct jr_code_debug_info { + struct jr_prefix p; + + uint64_t code_addr; + uint64_t nr_entry; + struct debug_entry entries[0]; +}; + +union jr_entry { + struct jr_code_debug_info info; + struct jr_code_close close; + struct jr_code_load load; + struct jr_code_move move; + struct jr_prefix prefix; +}; + +static inline struct debug_entry * +debug_entry_next(struct debug_entry *ent) +{ + void *a = ent + 1; + size_t l = strlen(ent->name) + 1; + return a + l; +} + +static inline char * +debug_entry_file(struct debug_entry *ent) +{ + void *a = ent + 1; + return a; +} + +#endif /* !JITDUMP_H */ -- cgit v0.10.2 From 209045adc2bbdb2b315fa5539cec54d01cd3e7db Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 30 Nov 2015 10:02:22 +0100 Subject: perf tools: add JVMTI agent library This is a standalone JVMTI library to help profile Java jitted code with perf record/perf report. The library is not installed or compiled automatically by perf Makefile. It is not used directly by perf. It is arch agnostic and has been tested on X86 and ARM. It needs to be used with a Java runtime, such as OpenJDK, as follows: $ java -agentpath:libjvmti.so ....... See the "Committer Notes" below on how to build it. When used this way, java will generate a jitdump binary file in $HOME/.debug/java/jit/java-jit-* This binary dump file contains information to help symbolize and annotate jitted code. The jitdump information must be injected into the perf.data file using: $ perf inject --jit -i perf.data -o perf.data.jitted This injects the MMAP records to cover the jitted code and also generates one ELF image for each jitted function. The ELF images are created in the same subdir as the jitdump file. The MMAP records point there too. Then, to visualize the function or asm profile, simply use the regular perf commands: $ perf report -i perf.data.jitted or $ perf annotate -i perf.data.jitted JVMTI agent code adapted from the OProfile's opagent code. This version of the JVMTI agent is using the CLOCK_MONOTONIC as the time source to timestamp jit samples. 
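
For a runtime other than the JVM, the jitdump file format declared in tools/perf/util/jitdump.h above is the interface to target. The sketch below is not part of the patch; the function names, the hardcoded EM_X86_64 and the missing error handling are simplifications for illustration. It shows the minimum such a runtime would write out: the file header followed by one JIT_CODE_LOAD record carrying the symbol name and the code bytes, padded to an 8-byte boundary the same way jvmti_write_code() pads them. A real emitter must also name the file jit-<pid>.dump and mmap one page of it with PROT_EXEC so perf record captures the marker mmap, as the agent's perf_open_marker_file() does.

/* Illustrative only: minimal jitdump emitter for a non-JVM runtime. */
#include <elf.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include "jitdump.h"		/* structs and macros added by this series */

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);	/* pairs with perf record -k mono */
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* Write the file header plus one JIT_CODE_LOAD record describing @code. */
static void emit_one_function(int fd, const char *sym, uint64_t vma,
			      const void *code, uint64_t csize)
{
	static const char zeroes[7];
	struct jitheader hdr = {
		.magic      = JITHEADER_MAGIC,
		.version    = JITHEADER_VERSION,
		.total_size = sizeof(hdr),	/* already a multiple of 8 */
		.elf_mach   = EM_X86_64,	/* detect it as get_e_machine() does */
		.pid        = getpid(),
		.timestamp  = now_ns(),
	};
	struct jr_code_load rec = {
		.p.id        = JIT_CODE_LOAD,
		.p.timestamp = now_ns(),
		.pid         = getpid(),
		.tid         = getpid(),	/* single-threaded sketch */
		.vma         = vma,
		.code_addr   = vma,
		.code_size   = csize,
		.code_index  = 1,		/* becomes jitted-<pid>-<n>.so */
	};
	size_t sym_len = strlen(sym) + 1;
	size_t pad = PADDING_8ALIGNED(sizeof(rec) + sym_len);

	rec.p.total_size = sizeof(rec) + sym_len + pad + csize;

	write(fd, &hdr, sizeof(hdr));
	write(fd, &rec, sizeof(rec));
	write(fd, sym, sym_len);	/* NUL-terminated symbol name */
	write(fd, zeroes, pad);		/* align the code to 8 bytes */
	write(fd, code, csize);		/* the jitted machine code */
}
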
To correlate with perf_events samples, it needs to run on kernel 4.0.0-rc5+ or later with the following commit from Peter Zijlstra: 34f439278cef ("perf: Add per event clockid support") With this patch recording jitted code is done as follows: $ perf record -k mono -- java -agentpath:libjvmti.so ....... -------------------------------------------------------------------------- Committer Notes: Extended testing instructions: $ cd tools/perf/jvmti/ $ dnf install java-devel $ make Then, create some simple java stuff to record some samples: $ cat hello.java public class hello { public static void main(String[] args) { System.out.println("Hello, World"); } } $ javac hello.java $ java hello Hello, World $ And then record it using this jvmti thing: $ perf record -k mono java -agentpath:/home/acme/git/linux/tools/perf/jvmti/libjvmti.so hello java: jvmti: jitdump in /home/acme/.debug/jit/java-jit-20160205.XXWIEDls/jit-1908.dump Hello, World [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.030 MB perf.data (268 samples) ] $ Now lets insert the PERF_RECORD_MMAP2 records to point jitted mmaps to files created by the agent: $ perf inject --jit -i perf.data -o perf.data.jitted And finally see that it did its job: $ perf report -D -i perf.data.jitted | grep PERF_RECORD_MMAP2 | tail -5 79197149129422 0xfe10 [0xa0]: PERF_RECORD_MMAP2 1908/1923: [0x7f172428bd60(0x80) @ 0x40 fd:02 1840554 1]: --xs /home/acme/.debug/jit/java-jit-20160205.XXWIEDls/jitted-1908-283.so 79197149235701 0xfeb0 [0xa0]: PERF_RECORD_MMAP2 1908/1923: [0x7f172428ba60(0x180) @ 0x40 fd:02 1840555 1]: --xs /home/acme/.debug/jit/java-jit-20160205.XXWIEDls/jitted-1908-284.so 79197149250558 0xff50 [0xa0]: PERF_RECORD_MMAP2 1908/1923: [0x7f172428b860(0x180) @ 0x40 fd:02 1840556 1]: --xs /home/acme/.debug/jit/java-jit-20160205.XXWIEDls/jitted-1908-285.so 79197149714746 0xfff0 [0xa0]: PERF_RECORD_MMAP2 1908/1923: [0x7f172428b660(0x180) @ 0x40 fd:02 1840557 1]: --xs /home/acme/.debug/jit/java-jit-20160205.XXWIEDls/jitted-1908-286.so 79197149806558 0x10090 [0xa0]: PERF_RECORD_MMAP2 1908/1923: [0x7f172428b460(0x180) @ 0x40 fd:02 1840558 1]: --xs /home/acme/.debug/jit/java-jit-20160205.XXWIEDls/jitted-1908-287.so $ So: $ perf report -D -i perf.data | grep PERF_RECORD_MMAP2 | wc -l Failed to open /tmp/perf-1908.map, continuing without symbols 21 $ perf report -D -i perf.data.jitted | grep PERF_RECORD_MMAP2 | wc -l 307 $ echo $((307 - 21)) 286 $ 286 extra PERF_RECORD_MMAP2 records. All for thise tiny, with just one function, ELF files: $ file /home/acme/.debug/jit/java-jit-20160205.XXWIEDls/jitted-1908-9.so /home/acme/.debug/jit/java-jit-20160205.XXWIEDls/jitted-1908-9.so: ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), corrupted program header size, BuildID[sha1]=ae54a2ebc3ecf0ba547bfc8cabdea1519df5203f, not stripped $ readelf -sw /home/acme/.debug/jit/java-jit-20160205.XXWIEDls/jitted-1908-9.so Symbol table '.symtab' contains 2 entries: Num: Value Size Type Bind Vis Ndx Name 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND 1: 0000000000000040 9 FUNC LOCAL DEFAULT 1 atomic_cmpxchg_long $ Inserted into the build-id cache: $ ls -la ~/.debug/.build-id/ae/54a2ebc3ecf0ba547bfc8cabdea1519df5203f lrwxrwxrwx. 1 acme acme 111 Feb 5 11:30 /home/acme/.debug/.build-id/ae/54a2ebc3ecf0ba547bfc8cabdea1519df5203f -> ../../home/acme/.debug/jit/java-jit-20160205.XXWIEDls/jitted-1908-9.so/ae54a2ebc3ecf0ba547bfc8cabdea1519df5203f Note: check why 'file' reports that 'corrupted program header size'. 
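
One detail in the transcript worth spelling out: the "@ 0x40" file offset in each PERF_RECORD_MMAP2 line is GEN_ELF_TEXT_OFFSET, i.e. genelf places the jitted code immediately after the ELF header of the generated jitted-<pid>-<n>.so, and jit_repipe_code_load() sets the mmap pgoff accordingly. A trivial check (illustrative only):

#include <elf.h>
#include <stdio.h>

int main(void)
{
	/* GEN_ELF_TEXT_OFFSET is sizeof(Elf_Ehdr); on a 64-bit target the
	 * ELF header is 64 bytes, hence the "@ 0x40" pgoff seen above. */
	printf("0x%zx\n", sizeof(Elf64_Ehdr));	/* prints 0x40 */
	return 0;
}
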
With a stupid java hog to do some profiling: $ cat hog.java public class hog { private static double do_something_else(int i) { double total = 0; while (i > 0) { total += Math.log(i--); } return total; } private static double do_something(int i) { double total = 0; while (i > 0) { total += Math.sqrt(i--) + do_something_else(i / 100); } return total; } public static void main(String[] args) { System.out.println(String.format("%s=%f & %f", args[0], do_something(Integer.parseInt(args[0])), do_something_else(Integer.parseInt(args[1])))); } } $ javac hog.java $ perf record -F 10000 -g -k mono java -agentpath:/home/acme/git/linux/tools/perf/jvmti/libjvmti.so hog 100000 2345000 java: jvmti: jitdump in /home/acme/.debug/jit/java-jit-20160205.XX4sqd14/jit-8670.dump 100000=291561592.669602 & 32050989.778714 [ perf record: Woken up 6 times to write data ] [ perf record: Captured and wrote 1.536 MB perf.data (12538 samples) ] $ perf inject --jit -i perf.data -o perf.data.jitted Looking at the 'perf report' TUI, at one expanded callchain leading to the jitted code: $ perf report --no-children -i perf.data.jitted Samples: 12K of event 'cycles:pp', Event count (approx.): 3829569932 Overhead Comm Shared Object Symbol - 93.38% java jitted-8670-291.so [.] class hog.do_something_else(int) class hog.do_something_else(int) - Interpreter - 75.86% call_stub JavaCalls::call_helper jni_invoke_static jni_CallStaticVoidMethod JavaMain start_thread - 17.52% JavaCalls::call_helper jni_invoke_static jni_CallStaticVoidMethod JavaMain start_thread Signed-off-by: Stephane Eranian Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Andi Kleen Cc: Carl Love Cc: David Ahern Cc: Jiri Olsa Cc: John McCutchan Cc: Namhyung Kim Cc: Pawel Moll Cc: Peter Zijlstra Cc: Sonny Rao Cc: Sukadev Bhattiprolu Link: http://lkml.kernel.org/r/1448874143-7269-4-git-send-email-eranian@google.com [ Made it build on fedora23, added some build/usage instructions ] [ Check if filename != NULL in compiled_method_load_cb, fixing segfault ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/jvmti/Makefile b/tools/perf/jvmti/Makefile new file mode 100644 index 0000000..5968f83 --- /dev/null +++ b/tools/perf/jvmti/Makefile @@ -0,0 +1,76 @@ +ARCH=$(shell uname -m) + +ifeq ($(ARCH), x86_64) +JARCH=amd64 +endif +ifeq ($(ARCH), armv7l) +JARCH=armhf +endif +ifeq ($(ARCH), armv6l) +JARCH=armhf +endif +ifeq ($(ARCH), aarch64) +JARCH=aarch64 +endif +ifeq ($(ARCH), ppc64) +JARCH=powerpc +endif +ifeq ($(ARCH), ppc64le) +JARCH=powerpc +endif + +DESTDIR=/usr/local + +VERSION=1 +REVISION=0 +AGE=0 + +LN=ln -sf +RM=rm + +SLIBJVMTI=libjvmti.so.$(VERSION).$(REVISION).$(AGE) +VLIBJVMTI=libjvmti.so.$(VERSION) +SLDFLAGS=-shared -Wl,-soname -Wl,$(VLIBJVMTI) +SOLIBEXT=so + +# The following works at least on fedora 23, you may need the next +# line for other distros. 
+JDIR=$(shell alternatives --display java | tail -1 | cut -d' ' -f 5 | sed 's%/jre/bin/java.%%g') +#JDIR=$(shell /usr/sbin/update-java-alternatives -l | head -1 | cut -d ' ' -f 3) +# -lrt required in 32-bit mode for clock_gettime() +LIBS=-lelf -lrt +INCDIR=-I $(JDIR)/include -I $(JDIR)/include/linux + +TARGETS=$(SLIBJVMTI) + +SRCS=libjvmti.c jvmti_agent.c +OBJS=$(SRCS:.c=.o) +SOBJS=$(OBJS:.o=.lo) +OPT=-O2 -g -Werror -Wall + +CFLAGS=$(INCDIR) $(OPT) + +all: $(TARGETS) + +.c.o: + $(CC) $(CFLAGS) -c $*.c +.c.lo: + $(CC) -fPIC -DPIC $(CFLAGS) -c $*.c -o $*.lo + +$(OBJS) $(SOBJS): Makefile jvmti_agent.h ../util/jitdump.h + +$(SLIBJVMTI): $(SOBJS) + $(CC) $(CFLAGS) $(SLDFLAGS) -o $@ $(SOBJS) $(LIBS) + $(LN) $@ libjvmti.$(SOLIBEXT) + +clean: + $(RM) -f *.o *.so.* *.so *.lo + +install: + -mkdir -p $(DESTDIR)/lib + install -m 755 $(SLIBJVMTI) $(DESTDIR)/lib/ + (cd $(DESTDIR)/lib; $(LN) $(SLIBJVMTI) $(VLIBJVMTI)) + (cd $(DESTDIR)/lib; $(LN) $(SLIBJVMTI) libjvmti.$(SOLIBEXT)) + ldconfig + +.SUFFIXES: .c .S .o .lo diff --git a/tools/perf/jvmti/jvmti_agent.c b/tools/perf/jvmti/jvmti_agent.c new file mode 100644 index 0000000..cbab139 --- /dev/null +++ b/tools/perf/jvmti/jvmti_agent.c @@ -0,0 +1,465 @@ +/* + * jvmti_agent.c: JVMTI agent interface + * + * Adapted from the Oprofile code in opagent.c: + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright 2007 OProfile authors + * Jens Wilke + * Daniel Hansel + * Copyright IBM Corporation 2007 + */ +#include +#include /* for mkdir() */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for gettid() */ +#include + +#include "jvmti_agent.h" +#include "../util/jitdump.h" + +#define JIT_LANG "java" + +static char jit_path[PATH_MAX]; +static void *marker_addr; + +/* + * padding buffer + */ +static const char pad_bytes[7]; + +static inline pid_t gettid(void) +{ + return (pid_t)syscall(__NR_gettid); +} + +static int get_e_machine(struct jitheader *hdr) +{ + ssize_t sret; + char id[16]; + int fd, ret = -1; + int m = -1; + struct { + uint16_t e_type; + uint16_t e_machine; + } info; + + fd = open("/proc/self/exe", O_RDONLY); + if (fd == -1) + return -1; + + sret = read(fd, id, sizeof(id)); + if (sret != sizeof(id)) + goto error; + + /* check ELF signature */ + if (id[0] != 0x7f || id[1] != 'E' || id[2] != 'L' || id[3] != 'F') + goto error; + + sret = read(fd, &info, sizeof(info)); + if (sret != sizeof(info)) + goto error; + + m = info.e_machine; + if (m < 0) + m = 0; /* ELF EM_NONE */ + + hdr->elf_mach = m; + ret = 0; +error: + close(fd); + return ret; +} + +#define NSEC_PER_SEC 1000000000 +static int perf_clk_id = CLOCK_MONOTONIC; + +static inline uint64_t +timespec_to_ns(const struct timespec *ts) +{ + return ((uint64_t) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; +} + +static inline uint64_t +perf_get_timestamp(void) +{ + struct timespec ts; + int ret; + + ret = clock_gettime(perf_clk_id, &ts); + if (ret) + return 0; + + return timespec_to_ns(&ts); +} + +static int +debug_cache_init(void) +{ + char str[32]; + char *base, *p; + struct tm tm; + time_t t; + int ret; + + time(&t); + localtime_r(&t, &tm); + + base = getenv("JITDUMPDIR"); + if (!base) + base = getenv("HOME"); + if (!base) + base = "."; + + strftime(str, sizeof(str), JIT_LANG"-jit-%Y%m%d", &tm); + + snprintf(jit_path, PATH_MAX - 1, "%s/.debug/", base); + + ret = mkdir(jit_path, 0755); + if (ret == -1) { + if (errno != EEXIST) { + warn("jvmti: cannot create jit cache dir %s", jit_path); + return -1; + } + } + + snprintf(jit_path, PATH_MAX - 1, "%s/.debug/jit", base); + ret = mkdir(jit_path, 0755); + if (ret == -1) { + if (errno != EEXIST) { + warn("cannot create jit cache dir %s", jit_path); + return -1; + } + } + + snprintf(jit_path, PATH_MAX - 1, "%s/.debug/jit/%s.XXXXXXXX", base, str); + + p = mkdtemp(jit_path); + if (p != jit_path) { + warn("cannot create jit cache dir %s", jit_path); + return -1; + } + + return 0; +} + +static int +perf_open_marker_file(int fd) +{ + long pgsz; + + pgsz = sysconf(_SC_PAGESIZE); + if (pgsz == -1) + return -1; + + /* + * we mmap the jitdump to create an MMAP RECORD in perf.data file. + * The mmap is captured either live (perf record running when we mmap) + * or in deferred mode, via /proc/PID/maps + * the MMAP record is used as a marker of a jitdump file for more meta + * data info about the jitted code. Perf report/annotate detect this + * special filename and process the jitdump file. + * + * mapping must be PROT_EXEC to ensure it is captured by perf record + * even when not using -d option + */ + marker_addr = mmap(NULL, pgsz, PROT_READ|PROT_EXEC, MAP_PRIVATE, fd, 0); + return (marker_addr == MAP_FAILED) ? 
-1 : 0; +} + +static void +perf_close_marker_file(void) +{ + long pgsz; + + if (!marker_addr) + return; + + pgsz = sysconf(_SC_PAGESIZE); + if (pgsz == -1) + return; + + munmap(marker_addr, pgsz); +} + +void *jvmti_open(void) +{ + int pad_cnt; + char dump_path[PATH_MAX]; + struct jitheader header; + int fd; + FILE *fp; + + /* + * check if clockid is supported + */ + if (!perf_get_timestamp()) + warnx("jvmti: kernel does not support %d clock id", perf_clk_id); + + memset(&header, 0, sizeof(header)); + + debug_cache_init(); + + /* + * jitdump file name + */ + snprintf(dump_path, PATH_MAX, "%s/jit-%i.dump", jit_path, getpid()); + + fd = open(dump_path, O_CREAT|O_TRUNC|O_RDWR, 0666); + if (fd == -1) + return NULL; + + /* + * create perf.data maker for the jitdump file + */ + if (perf_open_marker_file(fd)) { + warnx("jvmti: failed to create marker file"); + return NULL; + } + + fp = fdopen(fd, "w+"); + if (!fp) { + warn("jvmti: cannot create %s", dump_path); + close(fd); + goto error; + } + + warnx("jvmti: jitdump in %s", dump_path); + + if (get_e_machine(&header)) { + warn("get_e_machine failed\n"); + goto error; + } + + header.magic = JITHEADER_MAGIC; + header.version = JITHEADER_VERSION; + header.total_size = sizeof(header); + header.pid = getpid(); + + /* calculate amount of padding '\0' */ + pad_cnt = PADDING_8ALIGNED(header.total_size); + header.total_size += pad_cnt; + + header.timestamp = perf_get_timestamp(); + + if (!fwrite(&header, sizeof(header), 1, fp)) { + warn("jvmti: cannot write dumpfile header"); + goto error; + } + + /* write padding '\0' if necessary */ + if (pad_cnt && !fwrite(pad_bytes, pad_cnt, 1, fp)) { + warn("jvmti: cannot write dumpfile header padding"); + goto error; + } + + return fp; +error: + fclose(fp); + return NULL; +} + +int +jvmti_close(void *agent) +{ + struct jr_code_close rec; + FILE *fp = agent; + + if (!fp) { + warnx("jvmti: incalid fd in close_agent"); + return -1; + } + + rec.p.id = JIT_CODE_CLOSE; + rec.p.total_size = sizeof(rec); + + rec.p.timestamp = perf_get_timestamp(); + + if (!fwrite(&rec, sizeof(rec), 1, fp)) + return -1; + + fclose(fp); + + fp = NULL; + + perf_close_marker_file(); + + return 0; +} + +int +jvmti_write_code(void *agent, char const *sym, + uint64_t vma, void const *code, unsigned int const size) +{ + static int code_generation = 1; + struct jr_code_load rec; + size_t sym_len; + size_t padding_count; + FILE *fp = agent; + int ret = -1; + + /* don't care about 0 length function, no samples */ + if (size == 0) + return 0; + + if (!fp) { + warnx("jvmti: invalid fd in write_native_code"); + return -1; + } + + sym_len = strlen(sym) + 1; + + rec.p.id = JIT_CODE_LOAD; + rec.p.total_size = sizeof(rec) + sym_len; + padding_count = PADDING_8ALIGNED(rec.p.total_size); + rec.p. 
total_size += padding_count; + rec.p.timestamp = perf_get_timestamp(); + + rec.code_size = size; + rec.vma = vma; + rec.code_addr = vma; + rec.pid = getpid(); + rec.tid = gettid(); + + if (code) + rec.p.total_size += size; + + /* + * If JVM is multi-threaded, nultiple concurrent calls to agent + * may be possible, so protect file writes + */ + flockfile(fp); + + /* + * get code index inside lock to avoid race condition + */ + rec.code_index = code_generation++; + + ret = fwrite_unlocked(&rec, sizeof(rec), 1, fp); + fwrite_unlocked(sym, sym_len, 1, fp); + + if (padding_count) + fwrite_unlocked(pad_bytes, padding_count, 1, fp); + + if (code) + fwrite_unlocked(code, size, 1, fp); + + funlockfile(fp); + + ret = 0; + + return ret; +} + +int +jvmti_write_debug_info(void *agent, uint64_t code, const char *file, + jvmtiAddrLocationMap const *map, + jvmtiLineNumberEntry *li, jint num) +{ + static const char *prev_str = "\xff"; + struct jr_code_debug_info rec; + size_t sret, len, size, flen; + size_t padding_count; + FILE *fp = agent; + int i; + + /* + * no entry to write + */ + if (!num) + return 0; + + if (!fp) { + warnx("jvmti: invalid fd in write_debug_info"); + return -1; + } + + flen = strlen(file) + 1; + + rec.p.id = JIT_CODE_DEBUG_INFO; + size = sizeof(rec); + rec.p.timestamp = perf_get_timestamp(); + rec.code_addr = (uint64_t)(uintptr_t)code; + rec.nr_entry = num; + + /* + * on disk source line info layout: + * uint64_t : addr + * int : line number + * file[] : source file name + * padding : pad to multiple of 8 bytes + */ + size += num * (sizeof(uint64_t) + sizeof(int)); + size += flen + (num - 1) * 2; + /* + * pad to 8 bytes + */ + padding_count = PADDING_8ALIGNED(size); + + rec.p.total_size = size + padding_count; + + /* + * If JVM is multi-threaded, nultiple concurrent calls to agent + * may be possible, so protect file writes + */ + flockfile(fp); + + sret = fwrite_unlocked(&rec, sizeof(rec), 1, fp); + if (sret != 1) + goto error; + + for (i = 0; i < num; i++) { + uint64_t addr; + + addr = (uint64_t)map[i].start_address; + len = sizeof(addr); + sret = fwrite_unlocked(&addr, len, 1, fp); + if (sret != 1) + goto error; + + len = sizeof(int); + sret = fwrite_unlocked(&li[i].line_number, len, 1, fp); + if (sret != 1) + goto error; + + if (i == 0) { + sret = fwrite_unlocked(file, flen, 1, fp); + } else { + sret = fwrite_unlocked(prev_str, 2, 1, fp); + } + if (sret != 1) + goto error; + + } + if (padding_count) + sret = fwrite_unlocked(pad_bytes, padding_count, 1, fp); + if (sret != 1) + goto error; + + funlockfile(fp); + return 0; +error: + funlockfile(fp); + return -1; +} diff --git a/tools/perf/jvmti/jvmti_agent.h b/tools/perf/jvmti/jvmti_agent.h new file mode 100644 index 0000000..8251a1c --- /dev/null +++ b/tools/perf/jvmti/jvmti_agent.h @@ -0,0 +1,29 @@ +#ifndef __JVMTI_AGENT_H__ +#define __JVMTI_AGENT_H__ + +#include +#include +#include + +#define __unused __attribute__((unused)) + +#if defined(__cplusplus) +extern "C" { +#endif + +void *jvmti_open(void); +int jvmti_close(void *agent); +int jvmti_write_code(void *agent, char const *symbol_name, + uint64_t vma, void const *code, + const unsigned int code_size); +int jvmti_write_debug_info(void *agent, + uint64_t code, + const char *file, + jvmtiAddrLocationMap const *map, + jvmtiLineNumberEntry *tab, jint nr); + +#if defined(__cplusplus) +} + +#endif +#endif /* __JVMTI_H__ */ diff --git a/tools/perf/jvmti/libjvmti.c b/tools/perf/jvmti/libjvmti.c new file mode 100644 index 0000000..92ffbe4 --- /dev/null +++ b/tools/perf/jvmti/libjvmti.c 
@@ -0,0 +1,208 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "jvmti_agent.h" + +static int has_line_numbers; +void *jvmti_agent; + +static void JNICALL +compiled_method_load_cb(jvmtiEnv *jvmti, + jmethodID method, + jint code_size, + void const *code_addr, + jint map_length, + jvmtiAddrLocationMap const *map, + void const *compile_info __unused) +{ + jvmtiLineNumberEntry *tab = NULL; + jclass decl_class; + char *class_sign = NULL; + char *func_name = NULL; + char *func_sign = NULL; + char *file_name= NULL; + char fn[PATH_MAX]; + uint64_t addr = (uint64_t)(uintptr_t)code_addr; + jvmtiError ret; + jint nr_lines = 0; + size_t len; + + ret = (*jvmti)->GetMethodDeclaringClass(jvmti, method, + &decl_class); + if (ret != JVMTI_ERROR_NONE) { + warnx("jvmti: cannot get declaring class"); + return; + } + + if (has_line_numbers && map && map_length) { + + ret = (*jvmti)->GetLineNumberTable(jvmti, method, &nr_lines, &tab); + if (ret != JVMTI_ERROR_NONE) { + warnx("jvmti: cannot get line table for method"); + } else { + ret = (*jvmti)->GetSourceFileName(jvmti, decl_class, &file_name); + if (ret != JVMTI_ERROR_NONE) { + warnx("jvmti: cannot get source filename ret=%d", ret); + nr_lines = 0; + } + } + } + + ret = (*jvmti)->GetClassSignature(jvmti, decl_class, + &class_sign, NULL); + if (ret != JVMTI_ERROR_NONE) { + warnx("jvmti: getclassignature failed"); + goto error; + } + + ret = (*jvmti)->GetMethodName(jvmti, method, &func_name, + &func_sign, NULL); + if (ret != JVMTI_ERROR_NONE) { + warnx("jvmti: failed getmethodname"); + goto error; + } + + /* + * Assume path name is class hierarchy, this is a common practice with Java programs + */ + if (*class_sign == 'L') { + int j, i = 0; + char *p = strrchr(class_sign, '/'); + if (p) { + /* drop the 'L' prefix and copy up to the final '/' */ + for (i = 0; i < (p - class_sign); i++) + fn[i] = class_sign[i+1]; + } + /* + * append file name, we use loops and not string ops to avoid modifying + * class_sign which is used later for the symbol name + */ + for (j = 0; i < (PATH_MAX - 1) && file_name && j < strlen(file_name); j++, i++) + fn[i] = file_name[j]; + fn[i] = '\0'; + } else { + /* fallback case */ + strcpy(fn, file_name); + } + /* + * write source line info record if we have it + */ + if (jvmti_write_debug_info(jvmti_agent, addr, fn, map, tab, nr_lines)) + warnx("jvmti: write_debug_info() failed"); + + len = strlen(func_name) + strlen(class_sign) + strlen(func_sign) + 2; + { + char str[len]; + snprintf(str, len, "%s%s%s", class_sign, func_name, func_sign); + if (jvmti_write_code(jvmti_agent, str, addr, code_addr, code_size)) + warnx("jvmti: write_code() failed"); + } +error: + (*jvmti)->Deallocate(jvmti, (unsigned char *)func_name); + (*jvmti)->Deallocate(jvmti, (unsigned char *)func_sign); + (*jvmti)->Deallocate(jvmti, (unsigned char *)class_sign); + (*jvmti)->Deallocate(jvmti, (unsigned char *)tab); + (*jvmti)->Deallocate(jvmti, (unsigned char *)file_name); +} + +static void JNICALL +code_generated_cb(jvmtiEnv *jvmti, + char const *name, + void const *code_addr, + jint code_size) +{ + uint64_t addr = (uint64_t)(unsigned long)code_addr; + int ret; + + ret = jvmti_write_code(jvmti_agent, name, addr, code_addr, code_size); + if (ret) + warnx("jvmti: write_code() failed for code_generated"); +} + +JNIEXPORT jint JNICALL +Agent_OnLoad(JavaVM *jvm, char *options, void *reserved __unused) +{ + jvmtiEventCallbacks cb; + jvmtiCapabilities caps1; + jvmtiJlocationFormat format; + jvmtiEnv *jvmti = NULL; + jint ret; + + 
jvmti_agent = jvmti_open(); + if (!jvmti_agent) { + warnx("jvmti: open_agent failed"); + return -1; + } + + /* + * Request a JVMTI interface version 1 environment + */ + ret = (*jvm)->GetEnv(jvm, (void *)&jvmti, JVMTI_VERSION_1); + if (ret != JNI_OK) { + warnx("jvmti: jvmti version 1 not supported"); + return -1; + } + + /* + * acquire method_load capability, we require it + * request line numbers (optional) + */ + memset(&caps1, 0, sizeof(caps1)); + caps1.can_generate_compiled_method_load_events = 1; + + ret = (*jvmti)->AddCapabilities(jvmti, &caps1); + if (ret != JVMTI_ERROR_NONE) { + warnx("jvmti: acquire compiled_method capability failed"); + return -1; + } + ret = (*jvmti)->GetJLocationFormat(jvmti, &format); + if (ret == JVMTI_ERROR_NONE && format == JVMTI_JLOCATION_JVMBCI) { + memset(&caps1, 0, sizeof(caps1)); + caps1.can_get_line_numbers = 1; + caps1.can_get_source_file_name = 1; + ret = (*jvmti)->AddCapabilities(jvmti, &caps1); + if (ret == JVMTI_ERROR_NONE) + has_line_numbers = 1; + } + + memset(&cb, 0, sizeof(cb)); + + cb.CompiledMethodLoad = compiled_method_load_cb; + cb.DynamicCodeGenerated = code_generated_cb; + + ret = (*jvmti)->SetEventCallbacks(jvmti, &cb, sizeof(cb)); + if (ret != JVMTI_ERROR_NONE) { + warnx("jvmti: cannot set event callbacks"); + return -1; + } + + ret = (*jvmti)->SetEventNotificationMode(jvmti, JVMTI_ENABLE, + JVMTI_EVENT_COMPILED_METHOD_LOAD, NULL); + if (ret != JVMTI_ERROR_NONE) { + warnx("jvmti: setnotification failed for method_load"); + return -1; + } + + ret = (*jvmti)->SetEventNotificationMode(jvmti, JVMTI_ENABLE, + JVMTI_EVENT_DYNAMIC_CODE_GENERATED, NULL); + if (ret != JVMTI_ERROR_NONE) { + warnx("jvmti: setnotification failed on code_generated"); + return -1; + } + return 0; +} + +JNIEXPORT void JNICALL +Agent_OnUnload(JavaVM *jvm __unused) +{ + int ret; + + ret = jvmti_close(jvmti_agent); + if (ret) + errx(1, "Error: op_close_agent()"); +} -- cgit v0.10.2 From 598b7c6919c7bbcc1243009721a01bc12275ff3e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 30 Nov 2015 10:02:23 +0100 Subject: perf jit: add source line info support This patch adds source line information support to perf for jitted code. The source line info must be emitted by the runtime, such as JVMTI. Perf injects extract the source line info from the jitdump file and adds the corresponding .debug_lines section in the ELF image generated for each jitted function. The source line enables matching any address in the profile with a source file and line number. The improvement is visible in perf annotate with the source code displayed alongside the assembly code. The dwarf code leverages the support from OProfile which is also released under GPLv2. Copyright 2007 OProfile authors. 
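
To make the on-disk layout concrete, here is an illustrative sketch, not part of the patch, of how a runtime could emit one JIT_CODE_DEBUG_INFO record; such a record must precede the JIT_CODE_LOAD record of the same function in the jitdump file. The helper name and the flat addrs[]/lines[] arrays are made up for this sketch; it reuses the structures from tools/perf/util/jitdump.h and writes the same per-entry layout (address, line number, discriminator, file name) that the reworked jvmti_write_debug_info() produces. Error handling is omitted.

#include <stdint.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include "jitdump.h"	/* jr_code_debug_info, debug_entry, PADDING_8ALIGNED */

/* Illustrative only: one debug info record for the function at @code_addr,
 * with all entries sharing a single source @file. */
static void emit_debug_info(int fd, uint64_t code_addr, const char *file,
			    const uint64_t *addrs, const int *lines, int nr)
{
	static const char zeroes[7];
	struct jr_code_debug_info rec;
	struct timespec ts;
	size_t flen = strlen(file) + 1;
	size_t size = sizeof(rec) + nr * (sizeof(struct debug_entry) + flen);
	size_t pad  = PADDING_8ALIGNED(size);
	int i, discrim = 0;		/* column discriminator, 0 for now */

	clock_gettime(CLOCK_MONOTONIC, &ts);

	memset(&rec, 0, sizeof(rec));
	rec.p.id         = JIT_CODE_DEBUG_INFO;
	rec.p.total_size = size + pad;
	rec.p.timestamp  = (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
	rec.code_addr    = code_addr;
	rec.nr_entry     = nr;

	write(fd, &rec, sizeof(rec));
	for (i = 0; i < nr; i++) {
		/* on-disk entry: addr, line number, discriminator, file name */
		write(fd, &addrs[i], sizeof(addrs[i]));
		write(fd, &lines[i], sizeof(lines[i]));
		write(fd, &discrim, sizeof(discrim));
		write(fd, file, flen);
	}
	write(fd, zeroes, pad);		/* pad the record to 8 bytes */
}
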
Signed-off-by: Stephane Eranian Cc: Adrian Hunter Cc: Andi Kleen Cc: Carl Love Cc: David Ahern Cc: Jiri Olsa Cc: John McCutchan Cc: Namhyung Kim Cc: Pawel Moll Cc: Peter Zijlstra Cc: Sonny Rao Cc: Sukadev Bhattiprolu Link: http://lkml.kernel.org/r/1448874143-7269-5-git-send-email-eranian@google.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/jvmti/jvmti_agent.c b/tools/perf/jvmti/jvmti_agent.c index cbab139..6461e02 100644 --- a/tools/perf/jvmti/jvmti_agent.c +++ b/tools/perf/jvmti/jvmti_agent.c @@ -374,20 +374,20 @@ jvmti_write_code(void *agent, char const *sym, int jvmti_write_debug_info(void *agent, uint64_t code, const char *file, - jvmtiAddrLocationMap const *map, - jvmtiLineNumberEntry *li, jint num) + jvmti_line_info_t *li, int nr_lines) { - static const char *prev_str = "\xff"; struct jr_code_debug_info rec; size_t sret, len, size, flen; size_t padding_count; + uint64_t addr; + const char *fn = file; FILE *fp = agent; int i; /* * no entry to write */ - if (!num) + if (!nr_lines) return 0; if (!fp) { @@ -401,17 +401,18 @@ jvmti_write_debug_info(void *agent, uint64_t code, const char *file, size = sizeof(rec); rec.p.timestamp = perf_get_timestamp(); rec.code_addr = (uint64_t)(uintptr_t)code; - rec.nr_entry = num; + rec.nr_entry = nr_lines; /* * on disk source line info layout: * uint64_t : addr * int : line number + * int : column discriminator * file[] : source file name * padding : pad to multiple of 8 bytes */ - size += num * (sizeof(uint64_t) + sizeof(int)); - size += flen + (num - 1) * 2; + size += nr_lines * sizeof(struct debug_entry); + size += flen * nr_lines; /* * pad to 8 bytes */ @@ -429,28 +430,27 @@ jvmti_write_debug_info(void *agent, uint64_t code, const char *file, if (sret != 1) goto error; - for (i = 0; i < num; i++) { - uint64_t addr; + for (i = 0; i < nr_lines; i++) { - addr = (uint64_t)map[i].start_address; + addr = (uint64_t)li[i].pc; len = sizeof(addr); sret = fwrite_unlocked(&addr, len, 1, fp); if (sret != 1) goto error; - len = sizeof(int); + len = sizeof(li[0].line_number); sret = fwrite_unlocked(&li[i].line_number, len, 1, fp); if (sret != 1) goto error; - if (i == 0) { - sret = fwrite_unlocked(file, flen, 1, fp); - } else { - sret = fwrite_unlocked(prev_str, 2, 1, fp); - } + len = sizeof(li[0].discrim); + sret = fwrite_unlocked(&li[i].discrim, len, 1, fp); if (sret != 1) goto error; + sret = fwrite_unlocked(fn, flen, 1, fp); + if (sret != 1) + goto error; } if (padding_count) sret = fwrite_unlocked(pad_bytes, padding_count, 1, fp); diff --git a/tools/perf/jvmti/jvmti_agent.h b/tools/perf/jvmti/jvmti_agent.h index 8251a1c..bedf5d0 100644 --- a/tools/perf/jvmti/jvmti_agent.h +++ b/tools/perf/jvmti/jvmti_agent.h @@ -11,16 +11,23 @@ extern "C" { #endif +typedef struct { + unsigned long pc; + int line_number; + int discrim; /* discriminator -- 0 for now */ +} jvmti_line_info_t; + void *jvmti_open(void); int jvmti_close(void *agent); int jvmti_write_code(void *agent, char const *symbol_name, uint64_t vma, void const *code, const unsigned int code_size); + int jvmti_write_debug_info(void *agent, uint64_t code, const char *file, - jvmtiAddrLocationMap const *map, - jvmtiLineNumberEntry *tab, jint nr); + jvmti_line_info_t *li, + int nr_lines); #if defined(__cplusplus) } diff --git a/tools/perf/jvmti/libjvmti.c b/tools/perf/jvmti/libjvmti.c index 92ffbe4..ac12e4b 100644 --- a/tools/perf/jvmti/libjvmti.c +++ b/tools/perf/jvmti/libjvmti.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include "jvmti_agent.h" @@ -11,6 +12,100 @@ 
static int has_line_numbers; void *jvmti_agent; +static jvmtiError +do_get_line_numbers(jvmtiEnv *jvmti, void *pc, jmethodID m, jint bci, + jvmti_line_info_t *tab, jint *nr) +{ + jint i, lines = 0; + jint nr_lines = 0; + jvmtiLineNumberEntry *loc_tab = NULL; + jvmtiError ret; + + ret = (*jvmti)->GetLineNumberTable(jvmti, m, &nr_lines, &loc_tab); + if (ret != JVMTI_ERROR_NONE) + return ret; + + for (i = 0; i < nr_lines; i++) { + if (loc_tab[i].start_location < bci) { + tab[lines].pc = (unsigned long)pc; + tab[lines].line_number = loc_tab[i].line_number; + tab[lines].discrim = 0; /* not yet used */ + lines++; + } else { + break; + } + } + (*jvmti)->Deallocate(jvmti, (unsigned char *)loc_tab); + *nr = lines; + return JVMTI_ERROR_NONE; +} + +static jvmtiError +get_line_numbers(jvmtiEnv *jvmti, const void *compile_info, jvmti_line_info_t **tab, int *nr_lines) +{ + const jvmtiCompiledMethodLoadRecordHeader *hdr; + jvmtiCompiledMethodLoadInlineRecord *rec; + jvmtiLineNumberEntry *lne = NULL; + PCStackInfo *c; + jint nr, ret; + int nr_total = 0; + int i, lines_total = 0; + + if (!(tab && nr_lines)) + return JVMTI_ERROR_NULL_POINTER; + + /* + * Phase 1 -- get the number of lines necessary + */ + for (hdr = compile_info; hdr != NULL; hdr = hdr->next) { + if (hdr->kind == JVMTI_CMLR_INLINE_INFO) { + rec = (jvmtiCompiledMethodLoadInlineRecord *)hdr; + for (i = 0; i < rec->numpcs; i++) { + c = rec->pcinfo + i; + nr = 0; + /* + * unfortunately, need a tab to get the number of lines! + */ + ret = (*jvmti)->GetLineNumberTable(jvmti, c->methods[0], &nr, &lne); + if (ret == JVMTI_ERROR_NONE) { + /* free what was allocated for nothing */ + (*jvmti)->Deallocate(jvmti, (unsigned char *)lne); + nr_total += (int)nr; + } + } + } + } + + if (nr_total == 0) + return JVMTI_ERROR_NOT_FOUND; + + /* + * Phase 2 -- allocate big enough line table + */ + *tab = malloc(nr_total * sizeof(**tab)); + if (!*tab) + return JVMTI_ERROR_OUT_OF_MEMORY; + + for (hdr = compile_info; hdr != NULL; hdr = hdr->next) { + if (hdr->kind == JVMTI_CMLR_INLINE_INFO) { + rec = (jvmtiCompiledMethodLoadInlineRecord *)hdr; + for (i = 0; i < rec->numpcs; i++) { + c = rec->pcinfo + i; + nr = 0; + ret = do_get_line_numbers(jvmti, c->pc, + c->methods[0], + c->bcis[0], + *tab + lines_total, + &nr); + if (ret == JVMTI_ERROR_NONE) + lines_total += nr; + } + } + } + *nr_lines = lines_total; + return JVMTI_ERROR_NONE; +} + static void JNICALL compiled_method_load_cb(jvmtiEnv *jvmti, jmethodID method, @@ -18,9 +113,9 @@ compiled_method_load_cb(jvmtiEnv *jvmti, void const *code_addr, jint map_length, jvmtiAddrLocationMap const *map, - void const *compile_info __unused) + const void *compile_info) { - jvmtiLineNumberEntry *tab = NULL; + jvmti_line_info_t *line_tab = NULL; jclass decl_class; char *class_sign = NULL; char *func_name = NULL; @@ -29,7 +124,7 @@ compiled_method_load_cb(jvmtiEnv *jvmti, char fn[PATH_MAX]; uint64_t addr = (uint64_t)(uintptr_t)code_addr; jvmtiError ret; - jint nr_lines = 0; + int nr_lines = 0; /* in line_tab[] */ size_t len; ret = (*jvmti)->GetMethodDeclaringClass(jvmti, method, @@ -40,19 +135,19 @@ compiled_method_load_cb(jvmtiEnv *jvmti, } if (has_line_numbers && map && map_length) { - - ret = (*jvmti)->GetLineNumberTable(jvmti, method, &nr_lines, &tab); + ret = get_line_numbers(jvmti, compile_info, &line_tab, &nr_lines); if (ret != JVMTI_ERROR_NONE) { warnx("jvmti: cannot get line table for method"); - } else { - ret = (*jvmti)->GetSourceFileName(jvmti, decl_class, &file_name); - if (ret != JVMTI_ERROR_NONE) { - warnx("jvmti: 
cannot get source filename ret=%d", ret); - nr_lines = 0; - } + nr_lines = 0; } } + ret = (*jvmti)->GetSourceFileName(jvmti, decl_class, &file_name); + if (ret != JVMTI_ERROR_NONE) { + warnx("jvmti: cannot get source filename ret=%d", ret); + goto error; + } + ret = (*jvmti)->GetClassSignature(jvmti, decl_class, &class_sign, NULL); if (ret != JVMTI_ERROR_NONE) { @@ -92,13 +187,14 @@ compiled_method_load_cb(jvmtiEnv *jvmti, /* * write source line info record if we have it */ - if (jvmti_write_debug_info(jvmti_agent, addr, fn, map, tab, nr_lines)) + if (jvmti_write_debug_info(jvmti_agent, addr, fn, line_tab, nr_lines)) warnx("jvmti: write_debug_info() failed"); len = strlen(func_name) + strlen(class_sign) + strlen(func_sign) + 2; { char str[len]; snprintf(str, len, "%s%s%s", class_sign, func_name, func_sign); + if (jvmti_write_code(jvmti_agent, str, addr, code_addr, code_size)) warnx("jvmti: write_code() failed"); } @@ -106,8 +202,8 @@ error: (*jvmti)->Deallocate(jvmti, (unsigned char *)func_name); (*jvmti)->Deallocate(jvmti, (unsigned char *)func_sign); (*jvmti)->Deallocate(jvmti, (unsigned char *)class_sign); - (*jvmti)->Deallocate(jvmti, (unsigned char *)tab); (*jvmti)->Deallocate(jvmti, (unsigned char *)file_name); + free(line_tab); } static void JNICALL diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 52a4a80..a34752d 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -108,8 +108,11 @@ libperf-$(CONFIG_LZMA) += lzma.o libperf-y += demangle-java.o libperf-$(CONFIG_LIBELF) += jitdump.o libperf-$(CONFIG_LIBELF) += genelf.o +libperf-$(CONFIG_LIBELF) += genelf_debug.o CFLAGS_config.o += -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" +# avoid compiler warnings in 32-bit mode +CFLAGS_genelf_debug.o += -Wno-packed $(OUTPUT)util/parse-events-flex.c: util/parse-events.l $(OUTPUT)util/parse-events-bison.c $(call rule_mkdir) diff --git a/tools/perf/util/genelf.c b/tools/perf/util/genelf.c index 145f811..c1ef805 100644 --- a/tools/perf/util/genelf.c +++ b/tools/perf/util/genelf.c @@ -156,7 +156,8 @@ gen_build_id(struct buildid_note *note, unsigned long load_addr, const void *cod */ int jit_write_elf(int fd, uint64_t load_addr, const char *sym, - const void *code, int csize) + const void *code, int csize, + void *debug, int nr_debug_entries) { Elf *e; Elf_Data *d; @@ -385,9 +386,15 @@ jit_write_elf(int fd, uint64_t load_addr, const char *sym, shdr->sh_size = sizeof(bnote); shdr->sh_entsize = 0; - if (elf_update(e, ELF_C_WRITE) < 0) { - warnx("elf_update 4 failed"); - goto error; + if (debug && nr_debug_entries) { + retval = jit_add_debug_info(e, load_addr, debug, nr_debug_entries); + if (retval) + goto error; + } else { + if (elf_update(e, ELF_C_WRITE) < 0) { + warnx("elf_update 4 failed"); + goto error; + } } retval = 0; diff --git a/tools/perf/util/genelf.h b/tools/perf/util/genelf.h index d8e9ece..45bf9c6 100644 --- a/tools/perf/util/genelf.h +++ b/tools/perf/util/genelf.h @@ -3,7 +3,11 @@ /* genelf.c */ extern int jit_write_elf(int fd, uint64_t code_addr, const char *sym, - const void *code, int csize); + const void *code, int csize, + void *debug, int nr_debug_entries); +/* genelf_debug.c */ +extern int jit_add_debug_info(Elf *e, uint64_t code_addr, + void *debug, int nr_debug_entries); #if defined(__arm__) #define GEN_ELF_ARCH EM_ARM diff --git a/tools/perf/util/genelf_debug.c b/tools/perf/util/genelf_debug.c new file mode 100644 index 0000000..5980f7d --- /dev/null +++ b/tools/perf/util/genelf_debug.c @@ -0,0 +1,610 @@ +/* + * genelf_debug.c + * Copyright (C) 
2015, Google, Inc + * + * Contributed by: + * Stephane Eranian + * + * Released under the GPL v2. + * + * based on GPLv2 source code from Oprofile + * @remark Copyright 2007 OProfile authors + * @author Philippe Elie + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "perf.h" +#include "genelf.h" +#include "../util/jitdump.h" + +#define BUFFER_EXT_DFL_SIZE (4 * 1024) + +typedef uint32_t uword; +typedef uint16_t uhalf; +typedef int32_t sword; +typedef int16_t shalf; +typedef uint8_t ubyte; +typedef int8_t sbyte; + +struct buffer_ext { + size_t cur_pos; + size_t max_sz; + void *data; +}; + +static void +buffer_ext_dump(struct buffer_ext *be, const char *msg) +{ + size_t i; + warnx("DUMP for %s", msg); + for (i = 0 ; i < be->cur_pos; i++) + warnx("%4zu 0x%02x", i, (((char *)be->data)[i]) & 0xff); +} + +static inline int +buffer_ext_add(struct buffer_ext *be, void *addr, size_t sz) +{ + void *tmp; + size_t be_sz = be->max_sz; + +retry: + if ((be->cur_pos + sz) < be_sz) { + memcpy(be->data + be->cur_pos, addr, sz); + be->cur_pos += sz; + return 0; + } + + if (!be_sz) + be_sz = BUFFER_EXT_DFL_SIZE; + else + be_sz <<= 1; + + tmp = realloc(be->data, be_sz); + if (!tmp) + return -1; + + be->data = tmp; + be->max_sz = be_sz; + + goto retry; +} + +static void +buffer_ext_init(struct buffer_ext *be) +{ + be->data = NULL; + be->cur_pos = 0; + be->max_sz = 0; +} + +static inline size_t +buffer_ext_size(struct buffer_ext *be) +{ + return be->cur_pos; +} + +static inline void * +buffer_ext_addr(struct buffer_ext *be) +{ + return be->data; +} + +struct debug_line_header { + // Not counting this field + uword total_length; + // version number (2 currently) + uhalf version; + // relative offset from next field to + // program statement + uword prolog_length; + ubyte minimum_instruction_length; + ubyte default_is_stmt; + // line_base - see DWARF 2 specs + sbyte line_base; + // line_range - see DWARF 2 specs + ubyte line_range; + // number of opcode + 1 + ubyte opcode_base; + /* follow the array of opcode args nr: ubytes [nr_opcode_base] */ + /* follow the search directories index, zero terminated string + * terminated by an empty string. + */ + /* follow an array of { filename, LEB128, LEB128, LEB128 }, first is + * the directory index entry, 0 means current directory, then mtime + * and filesize, last entry is followed by en empty string. + */ + /* follow the first program statement */ +} __attribute__((packed)); + +/* DWARF 2 spec talk only about one possible compilation unit header while + * binutils can handle two flavours of dwarf 2, 32 and 64 bits, this is not + * related to the used arch, an ELF 32 can hold more than 4 Go of debug + * information. For now we handle only DWARF 2 32 bits comp unit. It'll only + * become a problem if we generate more than 4GB of debug information. + */ +struct compilation_unit_header { + uword total_length; + uhalf version; + uword debug_abbrev_offset; + ubyte pointer_size; +} __attribute__((packed)); + +#define DW_LNS_num_opcode (DW_LNS_set_isa + 1) + +/* field filled at run time are marked with -1 */ +static struct debug_line_header const default_debug_line_header = { + .total_length = -1, + .version = 2, + .prolog_length = -1, + .minimum_instruction_length = 1, /* could be better when min instruction size != 1 */ + .default_is_stmt = 1, /* we don't take care about basic block */ + .line_base = -5, /* sensible value for line base ... */ + .line_range = -14, /* ... 
and line range are guessed statically */ + .opcode_base = DW_LNS_num_opcode +}; + +static ubyte standard_opcode_length[] = +{ + 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 +}; +#if 0 +{ + [DW_LNS_advance_pc] = 1, + [DW_LNS_advance_line] = 1, + [DW_LNS_set_file] = 1, + [DW_LNS_set_column] = 1, + [DW_LNS_fixed_advance_pc] = 1, + [DW_LNS_set_isa] = 1, +}; +#endif + +/* field filled at run time are marked with -1 */ +static struct compilation_unit_header default_comp_unit_header = { + .total_length = -1, + .version = 2, + .debug_abbrev_offset = 0, /* we reuse the same abbrev entries for all comp unit */ + .pointer_size = sizeof(void *) +}; + +static void emit_uword(struct buffer_ext *be, uword data) +{ + buffer_ext_add(be, &data, sizeof(uword)); +} + +static void emit_string(struct buffer_ext *be, const char *s) +{ + buffer_ext_add(be, (void *)s, strlen(s) + 1); +} + +static void emit_unsigned_LEB128(struct buffer_ext *be, + unsigned long data) +{ + do { + ubyte cur = data & 0x7F; + data >>= 7; + if (data) + cur |= 0x80; + buffer_ext_add(be, &cur, 1); + } while (data); +} + +static void emit_signed_LEB128(struct buffer_ext *be, long data) +{ + int more = 1; + int negative = data < 0; + int size = sizeof(long) * CHAR_BIT; + while (more) { + ubyte cur = data & 0x7F; + data >>= 7; + if (negative) + data |= - (1 << (size - 7)); + if ((data == 0 && !(cur & 0x40)) || + (data == -1l && (cur & 0x40))) + more = 0; + else + cur |= 0x80; + buffer_ext_add(be, &cur, 1); + } +} + +static void emit_extended_opcode(struct buffer_ext *be, ubyte opcode, + void *data, size_t data_len) +{ + buffer_ext_add(be, (char *)"", 1); + + emit_unsigned_LEB128(be, data_len + 1); + + buffer_ext_add(be, &opcode, 1); + buffer_ext_add(be, data, data_len); +} + +static void emit_opcode(struct buffer_ext *be, ubyte opcode) +{ + buffer_ext_add(be, &opcode, 1); +} + +static void emit_opcode_signed(struct buffer_ext *be, + ubyte opcode, long data) +{ + buffer_ext_add(be, &opcode, 1); + emit_signed_LEB128(be, data); +} + +static void emit_opcode_unsigned(struct buffer_ext *be, ubyte opcode, + unsigned long data) +{ + buffer_ext_add(be, &opcode, 1); + emit_unsigned_LEB128(be, data); +} + +static void emit_advance_pc(struct buffer_ext *be, unsigned long delta_pc) +{ + emit_opcode_unsigned(be, DW_LNS_advance_pc, delta_pc); +} + +static void emit_advance_lineno(struct buffer_ext *be, long delta_lineno) +{ + emit_opcode_signed(be, DW_LNS_advance_line, delta_lineno); +} + +static void emit_lne_end_of_sequence(struct buffer_ext *be) +{ + emit_extended_opcode(be, DW_LNE_end_sequence, NULL, 0); +} + +static void emit_set_file(struct buffer_ext *be, unsigned long idx) +{ + emit_opcode_unsigned(be, DW_LNS_set_file, idx); +} + +static void emit_lne_define_filename(struct buffer_ext *be, + const char *filename) +{ + buffer_ext_add(be, (void *)"", 1); + + /* LNE field, strlen(filename) + zero termination, 3 bytes for: the dir entry, timestamp, filesize */ + emit_unsigned_LEB128(be, strlen(filename) + 5); + emit_opcode(be, DW_LNE_define_file); + emit_string(be, filename); + /* directory index 0=do not know */ + emit_unsigned_LEB128(be, 0); + /* last modification date on file 0=do not know */ + emit_unsigned_LEB128(be, 0); + /* filesize 0=do not know */ + emit_unsigned_LEB128(be, 0); +} + +static void emit_lne_set_address(struct buffer_ext *be, + void *address) +{ + emit_extended_opcode(be, DW_LNE_set_address, &address, sizeof(unsigned long)); +} + +static ubyte get_special_opcode(struct debug_entry *ent, + unsigned int last_line, + unsigned long last_vma) 
+{ + unsigned int temp; + unsigned long delta_addr; + + /* + * delta from line_base + */ + temp = (ent->lineno - last_line) - default_debug_line_header.line_base; + + if (temp >= default_debug_line_header.line_range) + return 0; + + /* + * delta of addresses + */ + delta_addr = (ent->addr - last_vma) / default_debug_line_header.minimum_instruction_length; + + /* This is not sufficient to ensure opcode will be in [0-256] but + * sufficient to ensure when summing with the delta lineno we will + * not overflow the unsigned long opcode */ + + if (delta_addr <= 256 / default_debug_line_header.line_range) { + unsigned long opcode = temp + + (delta_addr * default_debug_line_header.line_range) + + default_debug_line_header.opcode_base; + + return opcode <= 255 ? opcode : 0; + } + return 0; +} + +static void emit_lineno_info(struct buffer_ext *be, + struct debug_entry *ent, size_t nr_entry, + unsigned long code_addr) +{ + size_t i; + + /* + * Machine state at start of a statement program + * address = 0 + * file = 1 + * line = 1 + * column = 0 + * is_stmt = default_is_stmt as given in the debug_line_header + * basic block = 0 + * end sequence = 0 + */ + + /* start state of the state machine we take care of */ + unsigned long last_vma = code_addr; + char const *cur_filename = NULL; + unsigned long cur_file_idx = 0; + int last_line = 1; + + emit_lne_set_address(be, (void *)code_addr); + + for (i = 0; i < nr_entry; i++, ent = debug_entry_next(ent)) { + int need_copy = 0; + ubyte special_opcode; + + /* + * check if filename changed, if so add it + */ + if (!cur_filename || strcmp(cur_filename, ent->name)) { + emit_lne_define_filename(be, ent->name); + cur_filename = ent->name; + emit_set_file(be, ++cur_file_idx); + need_copy = 1; + } + + special_opcode = get_special_opcode(ent, last_line, last_vma); + if (special_opcode != 0) { + last_line = ent->lineno; + last_vma = ent->addr; + emit_opcode(be, special_opcode); + } else { + /* + * lines differ, emit line delta + */ + if (last_line != ent->lineno) { + emit_advance_lineno(be, ent->lineno - last_line); + last_line = ent->lineno; + need_copy = 1; + } + /* + * addresses differ, emit address delta + */ + if (last_vma != ent->addr) { + emit_advance_pc(be, ent->addr - last_vma); + last_vma = ent->addr; + need_copy = 1; + } + /* + * add new row to matrix + */ + if (need_copy) + emit_opcode(be, DW_LNS_copy); + } + } +} + +static void add_debug_line(struct buffer_ext *be, + struct debug_entry *ent, size_t nr_entry, + unsigned long code_addr) +{ + struct debug_line_header * dbg_header; + size_t old_size; + + old_size = buffer_ext_size(be); + + buffer_ext_add(be, (void *)&default_debug_line_header, + sizeof(default_debug_line_header)); + + buffer_ext_add(be, &standard_opcode_length, sizeof(standard_opcode_length)); + + // empty directory entry + buffer_ext_add(be, (void *)"", 1); + + // empty filename directory + buffer_ext_add(be, (void *)"", 1); + + dbg_header = buffer_ext_addr(be) + old_size; + dbg_header->prolog_length = (buffer_ext_size(be) - old_size) - + offsetof(struct debug_line_header, minimum_instruction_length); + + emit_lineno_info(be, ent, nr_entry, code_addr); + + emit_lne_end_of_sequence(be); + + dbg_header = buffer_ext_addr(be) + old_size; + dbg_header->total_length = (buffer_ext_size(be) - old_size) - + offsetof(struct debug_line_header, version); +} + +static void +add_debug_abbrev(struct buffer_ext *be) +{ + emit_unsigned_LEB128(be, 1); + emit_unsigned_LEB128(be, DW_TAG_compile_unit); + emit_unsigned_LEB128(be, DW_CHILDREN_yes); + 
emit_unsigned_LEB128(be, DW_AT_stmt_list); + emit_unsigned_LEB128(be, DW_FORM_data4); + emit_unsigned_LEB128(be, 0); + emit_unsigned_LEB128(be, 0); + emit_unsigned_LEB128(be, 0); +} + +static void +add_compilation_unit(struct buffer_ext *be, + size_t offset_debug_line) +{ + struct compilation_unit_header *comp_unit_header; + size_t old_size = buffer_ext_size(be); + + buffer_ext_add(be, &default_comp_unit_header, + sizeof(default_comp_unit_header)); + + emit_unsigned_LEB128(be, 1); + emit_uword(be, offset_debug_line); + + comp_unit_header = buffer_ext_addr(be) + old_size; + comp_unit_header->total_length = (buffer_ext_size(be) - old_size) - + offsetof(struct compilation_unit_header, version); +} + +static int +jit_process_debug_info(uint64_t code_addr, + void *debug, int nr_debug_entries, + struct buffer_ext *dl, + struct buffer_ext *da, + struct buffer_ext *di) +{ + struct debug_entry *ent = debug; + int i; + + for (i = 0; i < nr_debug_entries; i++) { + ent->addr = ent->addr - code_addr; + ent = debug_entry_next(ent); + } + add_compilation_unit(di, buffer_ext_size(dl)); + add_debug_line(dl, debug, nr_debug_entries, 0); + add_debug_abbrev(da); + if (0) buffer_ext_dump(da, "abbrev"); + + return 0; +} + +int +jit_add_debug_info(Elf *e, uint64_t code_addr, void *debug, int nr_debug_entries) +{ + Elf_Data *d; + Elf_Scn *scn; + Elf_Shdr *shdr; + struct buffer_ext dl, di, da; + int ret; + + buffer_ext_init(&dl); + buffer_ext_init(&di); + buffer_ext_init(&da); + + ret = jit_process_debug_info(code_addr, debug, nr_debug_entries, &dl, &da, &di); + if (ret) + return -1; + /* + * setup .debug_line section + */ + scn = elf_newscn(e); + if (!scn) { + warnx("cannot create section"); + return -1; + } + + d = elf_newdata(scn); + if (!d) { + warnx("cannot get new data"); + return -1; + } + + d->d_align = 1; + d->d_off = 0LL; + d->d_buf = buffer_ext_addr(&dl); + d->d_type = ELF_T_BYTE; + d->d_size = buffer_ext_size(&dl); + d->d_version = EV_CURRENT; + + shdr = elf_getshdr(scn); + if (!shdr) { + warnx("cannot get section header"); + return -1; + } + + shdr->sh_name = 52; /* .debug_line */ + shdr->sh_type = SHT_PROGBITS; + shdr->sh_addr = 0; /* must be zero or == sh_offset -> dynamic object */ + shdr->sh_flags = 0; + shdr->sh_entsize = 0; + + /* + * setup .debug_info section + */ + scn = elf_newscn(e); + if (!scn) { + warnx("cannot create section"); + return -1; + } + + d = elf_newdata(scn); + if (!d) { + warnx("cannot get new data"); + return -1; + } + + d->d_align = 1; + d->d_off = 0LL; + d->d_buf = buffer_ext_addr(&di); + d->d_type = ELF_T_BYTE; + d->d_size = buffer_ext_size(&di); + d->d_version = EV_CURRENT; + + shdr = elf_getshdr(scn); + if (!shdr) { + warnx("cannot get section header"); + return -1; + } + + shdr->sh_name = 64; /* .debug_info */ + shdr->sh_type = SHT_PROGBITS; + shdr->sh_addr = 0; /* must be zero or == sh_offset -> dynamic object */ + shdr->sh_flags = 0; + shdr->sh_entsize = 0; + + /* + * setup .debug_abbrev section + */ + scn = elf_newscn(e); + if (!scn) { + warnx("cannot create section"); + return -1; + } + + d = elf_newdata(scn); + if (!d) { + warnx("cannot get new data"); + return -1; + } + + d->d_align = 1; + d->d_off = 0LL; + d->d_buf = buffer_ext_addr(&da); + d->d_type = ELF_T_BYTE; + d->d_size = buffer_ext_size(&da); + d->d_version = EV_CURRENT; + + shdr = elf_getshdr(scn); + if (!shdr) { + warnx("cannot get section header"); + return -1; + } + + shdr->sh_name = 76; /* .debug_info */ + shdr->sh_type = SHT_PROGBITS; + shdr->sh_addr = 0; /* must be zero or == sh_offset -> dynamic 
object */ + shdr->sh_flags = 0; + shdr->sh_entsize = 0; + + /* + * now we update the ELF image with all the sections + */ + if (elf_update(e, ELF_C_WRITE) < 0) { + warnx("elf_update debug failed"); + return -1; + } + return 0; +} diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index 9f7a012..99fa5ee 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -63,7 +63,9 @@ jit_emit_elf(char *filename, const char *sym, uint64_t code_addr, const void *code, - int csize) + int csize, + void *debug, + int nr_debug_entries) { int ret, fd; @@ -76,7 +78,7 @@ jit_emit_elf(char *filename, return -1; } - ret = jit_write_elf(fd, code_addr, sym, (const void *)code, csize); + ret = jit_write_elf(fd, code_addr, sym, (const void *)code, csize, debug, nr_debug_entries); close(fd); @@ -347,7 +349,7 @@ static int jit_repipe_code_load(struct jit_buf_desc *jd, union jr_entry *jr) size = PERF_ALIGN(size, sizeof(u64)); uaddr = (uintptr_t)code; - ret = jit_emit_elf(filename, sym, addr, (const void *)uaddr, csize); + ret = jit_emit_elf(filename, sym, addr, (const void *)uaddr, csize, jd->debug_data, jd->nr_debug_entries); if (jd->debug_data && jd->nr_debug_entries) { free(jd->debug_data); -- cgit v0.10.2 From c1a0bf347c40dd4b0a5bb10fdf4de76a1fbbbe8c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Feb 2016 09:57:34 +0100 Subject: x86/mm/numa: Clean up numa_clear_kernel_node_hotplug() So we fixed an overflow bug in numa_clear_kernel_node_hotplug(): 2b54ab3c66d4 ("x86/mm/numa: Fix memory corruption on 32-bit NUMA kernels") ... and the bug was indirectly caused by poor coding style, such as using start/end local variables unnecessarily, which lost the physaddr_t type. So make the code more readable and try to fully comment all the thinking behind the logic. No change in functionality. Cc: Andrew Morton Cc: Brad Spengler Cc: Chen Tang Cc: "H. Peter Anvin" Cc: Lai Jiangshan Cc: Linus Torvalds Cc: PaX Team Cc: Taku Izumi Cc: Tang Chen Cc: Thomas Gleixner Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: y14sg1 Cc: Zhang Yanfei Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index d04f809..ede4506 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -465,46 +465,65 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) return true; } +/* + * Mark all currently memblock-reserved physical memory (which covers the + * kernel's own memory ranges) as hot-unswappable. + */ static void __init numa_clear_kernel_node_hotplug(void) { - int i, nid; - nodemask_t numa_kernel_nodes = NODE_MASK_NONE; - phys_addr_t start, end; - struct memblock_region *r; + nodemask_t reserved_nodemask = NODE_MASK_NONE; + struct memblock_region *mb_region; + int i; /* + * We have to do some preprocessing of memblock regions, to + * make them suitable for reservation. + * * At this time, all memory regions reserved by memblock are - * used by the kernel. Set the nid in memblock.reserved will - * mark out all the nodes the kernel resides in. + * used by the kernel, but those regions are not split up + * along node boundaries yet, and don't necessarily have their + * node ID set yet either. + * + * So iterate over all memory known to the x86 architecture, + * and use those ranges to set the nid in memblock.reserved. + * This will split up the memblock regions along node + * boundaries and will set the node IDs as well. 
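	 * (To do that, memblock_set_node() may split existing
	 * memblock.reserved regions at the node boundaries, which can in
	 * turn require growing the memblock array.)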
*/ for (i = 0; i < numa_meminfo.nr_blks; i++) { - struct numa_memblk *mb = &numa_meminfo.blk[i]; + struct numa_memblk *mb = numa_meminfo.blk + i; - memblock_set_node(mb->start, mb->end - mb->start, - &memblock.reserved, mb->nid); + memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); } /* - * Mark all kernel nodes. + * Now go over all reserved memblock regions, to construct a + * node mask of all kernel reserved memory areas. * - * When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo - * may not include all the memblock.reserved memory ranges because - * trim_snb_memory() reserves specific pages for Sandy Bridge graphics. + * [ Note, when booting with mem=nn[kMG] or in a kdump kernel, + * numa_meminfo might not include all memblock.reserved + * memory ranges, because quirks such as trim_snb_memory() + * reserve specific pages for Sandy Bridge graphics. ] */ - for_each_memblock(reserved, r) - if (r->nid != MAX_NUMNODES) - node_set(r->nid, numa_kernel_nodes); + for_each_memblock(reserved, mb_region) { + if (mb_region->nid != MAX_NUMNODES) + node_set(mb_region->nid, reserved_nodemask); + } - /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ + /* + * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory + * belonging to the reserved node mask. + * + * Note that this will include memory regions that reside + * on nodes that contain kernel memory - entire nodes + * become hot-unpluggable: + */ for (i = 0; i < numa_meminfo.nr_blks; i++) { - nid = numa_meminfo.blk[i].nid; - if (!node_isset(nid, numa_kernel_nodes)) - continue; + struct numa_memblk *mb = numa_meminfo.blk + i; - start = numa_meminfo.blk[i].start; - end = numa_meminfo.blk[i].end; + if (!node_isset(mb->nid, reserved_nodemask)) + continue; - memblock_clear_hotplug(start, end - start); + memblock_clear_hotplug(mb->start, mb->end - mb->start); } } -- cgit v0.10.2 From 5f7ee246850ba18a6a7bcb3d5eddb6db68354688 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Feb 2016 12:14:55 +0100 Subject: x86/mm/numa: Check for failures in numa_clear_kernel_node_hotplug() numa_clear_kernel_node_hotplug() uses memblock_set_node() without checking for failures. memblock_set_node() is a complex function that might extend the memblock array - which extension might fail - so check for this possibility. It's not supposed to happen (because realistically if we have so little memory that this fails then we likely won't be able to boot anyway), but do the check nevertheless. Cc: Andrew Morton Cc: Brad Spengler Cc: Chen Tang Cc: "H. 
Peter Anvin" Cc: Lai Jiangshan Cc: Linus Torvalds Cc: PaX Team Cc: Taku Izumi Cc: Tang Chen Cc: Thomas Gleixner Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: y14sg1 Cc: Zhang Yanfei Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index ede4506..f70c1ff 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -491,8 +491,10 @@ static void __init numa_clear_kernel_node_hotplug(void) */ for (i = 0; i < numa_meminfo.nr_blks; i++) { struct numa_memblk *mb = numa_meminfo.blk + i; + int ret; - memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); + ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid); + WARN_ON_ONCE(ret); } /* -- cgit v0.10.2 From dd85c79150079339b3ded62dda5f6985d192900a Mon Sep 17 00:00:00 2001 From: Milo Kim Date: Wed, 13 Jan 2016 16:19:49 +0900 Subject: irqchip/atmel-aic: Handle aic_common_irq_fixup in aic_common_of_init AIC IRQ fixup is handled in each IRQ chip driver. It can be moved into aic_common_of_init() before returning the result. Then, aic_common_irq_fixup() can be changed to static type. Signed-off-by: Milo Kim Acked-by: Boris Brezillon Cc: Jason Cooper Cc: Marc Zyngier Cc: Ludovic Desroches Cc: Nicholas Ferre Link: http://lkml.kernel.org/r/1452669592-3401-1-git-send-email-milo.kim@ti.com Signed-off-by: Thomas Gleixner diff --git a/drivers/irqchip/irq-atmel-aic-common.c b/drivers/irqchip/irq-atmel-aic-common.c index 37199b9..661840b 100644 --- a/drivers/irqchip/irq-atmel-aic-common.c +++ b/drivers/irqchip/irq-atmel-aic-common.c @@ -193,7 +193,7 @@ void __init aic_common_rtt_irq_fixup(struct device_node *root) } } -void __init aic_common_irq_fixup(const struct of_device_id *matches) +static void __init aic_common_irq_fixup(const struct of_device_id *matches) { struct device_node *root = of_find_node_by_path("/"); const struct of_device_id *match; @@ -214,7 +214,8 @@ void __init aic_common_irq_fixup(const struct of_device_id *matches) struct irq_domain *__init aic_common_of_init(struct device_node *node, const struct irq_domain_ops *ops, - const char *name, int nirqs) + const char *name, int nirqs, + const struct of_device_id *matches) { struct irq_chip_generic *gc; struct irq_domain *domain; @@ -264,6 +265,7 @@ struct irq_domain *__init aic_common_of_init(struct device_node *node, } aic_common_ext_irq_of_init(domain); + aic_common_irq_fixup(matches); return domain; diff --git a/drivers/irqchip/irq-atmel-aic-common.h b/drivers/irqchip/irq-atmel-aic-common.h index 603f0a9..046bcc8 100644 --- a/drivers/irqchip/irq-atmel-aic-common.h +++ b/drivers/irqchip/irq-atmel-aic-common.h @@ -30,12 +30,11 @@ int aic_common_irq_domain_xlate(struct irq_domain *d, struct irq_domain *__init aic_common_of_init(struct device_node *node, const struct irq_domain_ops *ops, - const char *name, int nirqs); + const char *name, int nirqs, + const struct of_device_id *matches); void __init aic_common_rtc_irq_fixup(struct device_node *root); void __init aic_common_rtt_irq_fixup(struct device_node *root); -void __init aic_common_irq_fixup(const struct of_device_id *matches); - #endif /* __IRQ_ATMEL_AIC_COMMON_H */ diff --git a/drivers/irqchip/irq-atmel-aic.c b/drivers/irqchip/irq-atmel-aic.c index 8a0c7f2..799834d 100644 --- a/drivers/irqchip/irq-atmel-aic.c +++ b/drivers/irqchip/irq-atmel-aic.c @@ -248,12 +248,10 @@ static int __init aic_of_init(struct device_node *node, return -EEXIST; domain = aic_common_of_init(node, &aic_irq_ops, "atmel-aic", - NR_AIC_IRQS); + 
NR_AIC_IRQS, aic_irq_fixups); if (IS_ERR(domain)) return PTR_ERR(domain); - aic_common_irq_fixup(aic_irq_fixups); - aic_domain = domain; gc = irq_get_domain_generic_chip(domain, 0); diff --git a/drivers/irqchip/irq-atmel-aic5.c b/drivers/irqchip/irq-atmel-aic5.c index 62bb840..a7e8fc8 100644 --- a/drivers/irqchip/irq-atmel-aic5.c +++ b/drivers/irqchip/irq-atmel-aic5.c @@ -312,12 +312,10 @@ static int __init aic5_of_init(struct device_node *node, return -EEXIST; domain = aic_common_of_init(node, &aic5_irq_ops, "atmel-aic5", - nirqs); + nirqs, aic5_irq_fixups); if (IS_ERR(domain)) return PTR_ERR(domain); - aic_common_irq_fixup(aic5_irq_fixups); - aic5_domain = domain; nchips = aic5_domain->revmap_size / 32; for (i = 0; i < nchips; i++) { -- cgit v0.10.2 From 5fd26a0bb1e479014adf024df779172d33defdd5 Mon Sep 17 00:00:00 2001 From: Milo Kim Date: Wed, 13 Jan 2016 16:19:51 +0900 Subject: irqchip/atmel-aic: Change return type of aic_common_set_priority() Priority validation is not necessary because aic_common_irq_domain_xlate() already handles it. With this removal, return type can be changed to void. Signed-off-by: Milo Kim Acked-by: Boris Brezillon Cc: Jason Cooper Cc: Marc Zyngier Cc: Ludovic Desroches Cc: Nicholas Ferre Link: http://lkml.kernel.org/r/1452669592-3401-3-git-send-email-milo.kim@ti.com Signed-off-by: Thomas Gleixner diff --git a/drivers/irqchip/irq-atmel-aic-common.c b/drivers/irqchip/irq-atmel-aic-common.c index 661840b..28b26c8 100644 --- a/drivers/irqchip/irq-atmel-aic-common.c +++ b/drivers/irqchip/irq-atmel-aic-common.c @@ -80,16 +80,10 @@ int aic_common_set_type(struct irq_data *d, unsigned type, unsigned *val) return 0; } -int aic_common_set_priority(int priority, unsigned *val) +void aic_common_set_priority(int priority, unsigned *val) { - if (priority < AT91_AIC_IRQ_MIN_PRIORITY || - priority > AT91_AIC_IRQ_MAX_PRIORITY) - return -EINVAL; - *val &= ~AT91_AIC_PRIOR; *val |= priority; - - return 0; } int aic_common_irq_domain_xlate(struct irq_domain *d, diff --git a/drivers/irqchip/irq-atmel-aic-common.h b/drivers/irqchip/irq-atmel-aic-common.h index 046bcc8..af60376 100644 --- a/drivers/irqchip/irq-atmel-aic-common.h +++ b/drivers/irqchip/irq-atmel-aic-common.h @@ -19,7 +19,7 @@ int aic_common_set_type(struct irq_data *d, unsigned type, unsigned *val); -int aic_common_set_priority(int priority, unsigned *val); +void aic_common_set_priority(int priority, unsigned *val); int aic_common_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, diff --git a/drivers/irqchip/irq-atmel-aic.c b/drivers/irqchip/irq-atmel-aic.c index 799834d..112e17c 100644 --- a/drivers/irqchip/irq-atmel-aic.c +++ b/drivers/irqchip/irq-atmel-aic.c @@ -196,9 +196,8 @@ static int aic_irq_domain_xlate(struct irq_domain *d, irq_gc_lock(gc); smr = irq_reg_readl(gc, AT91_AIC_SMR(*out_hwirq)); - ret = aic_common_set_priority(intspec[2], &smr); - if (!ret) - irq_reg_writel(gc, smr, AT91_AIC_SMR(*out_hwirq)); + aic_common_set_priority(intspec[2], &smr); + irq_reg_writel(gc, smr, AT91_AIC_SMR(*out_hwirq)); irq_gc_unlock(gc); return ret; diff --git a/drivers/irqchip/irq-atmel-aic5.c b/drivers/irqchip/irq-atmel-aic5.c index a7e8fc8..f36f426 100644 --- a/drivers/irqchip/irq-atmel-aic5.c +++ b/drivers/irqchip/irq-atmel-aic5.c @@ -272,9 +272,8 @@ static int aic5_irq_domain_xlate(struct irq_domain *d, irq_gc_lock(bgc); irq_reg_writel(bgc, *out_hwirq, AT91_AIC5_SSR); smr = irq_reg_readl(bgc, AT91_AIC5_SMR); - ret = aic_common_set_priority(intspec[2], &smr); - if (!ret) - irq_reg_writel(bgc, intspec[2] | 
smr, AT91_AIC5_SMR); + aic_common_set_priority(intspec[2], &smr); + irq_reg_writel(bgc, intspec[2] | smr, AT91_AIC5_SMR); irq_gc_unlock(bgc); return ret; -- cgit v0.10.2 From 4b5ce20b5429eb08ae0776962a4aff7f00017800 Mon Sep 17 00:00:00 2001 From: Milo Kim Date: Wed, 13 Jan 2016 16:19:52 +0900 Subject: irqchip/atmel-aic: Remove duplicate bit operation AIC5 priority value is updated twice - in aic_common_set_priority() and when updating AT91_AIC5_SMR. Variable, 'smr' has updated priority value (intspec[2]) in the first step, so no need to update it again in the second step. Signed-off-by: Milo Kim Acked-by: Boris Brezillon Cc: Jason Cooper Cc: Marc Zyngier Cc: Ludovic Desroches Cc: Nicholas Ferre Link: http://lkml.kernel.org/r/1452669592-3401-4-git-send-email-milo.kim@ti.com Signed-off-by: Thomas Gleixner diff --git a/drivers/irqchip/irq-atmel-aic5.c b/drivers/irqchip/irq-atmel-aic5.c index f36f426..4f0d068 100644 --- a/drivers/irqchip/irq-atmel-aic5.c +++ b/drivers/irqchip/irq-atmel-aic5.c @@ -273,7 +273,7 @@ static int aic5_irq_domain_xlate(struct irq_domain *d, irq_reg_writel(bgc, *out_hwirq, AT91_AIC5_SSR); smr = irq_reg_readl(bgc, AT91_AIC5_SMR); aic_common_set_priority(intspec[2], &smr); - irq_reg_writel(bgc, intspec[2] | smr, AT91_AIC5_SMR); + irq_reg_writel(bgc, smr, AT91_AIC5_SMR); irq_gc_unlock(bgc); return ret; -- cgit v0.10.2 From c7c42ec2baa1de7ab3965e4f1bf5073bee6065e4 Mon Sep 17 00:00:00 2001 From: Simon Arlott Date: Sun, 22 Nov 2015 14:30:14 +0000 Subject: irqchips/bmips: Add bcm6345-l1 interrupt controller Add the BCM6345 interrupt controller based on the SMP-capable BCM7038 and the BCM3380 but with packed interrupt registers. Add the BCM6345 interrupt controller to a list with the existing BCM7038 so that interrupts on CPU1 are not ignored. Update the maintainers file list for BMIPS to include this driver. 
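[ Illustration, not part of the patch: the "packed" register layout means
  each CPU gets one contiguous block of n_words enable words followed by
  n_words status words.  A minimal sketch of the offset calculation, using
  a hypothetical helper name; the driver's own reg_enable()/reg_status()
  helpers below do the same and additionally reverse the word index on
  big-endian kernels so that IRQ numbering follows CPU native order: ]

#include <stdint.h>

/* Byte offset of hardware word 'word' within one CPU's register block. */
static unsigned int word_offset(unsigned int n_words, unsigned int word,
				int status)
{
	return ((status ? n_words : 0u) + word) * sizeof(uint32_t);
}

/*
 * Example, BCM6368 (n_words = 2, CPU0 block at 0x1000_0020):
 *   word_offset(2, 1, 0) == 0x04  ->  CPU0_W1_ENABLE at 0x1000_0024
 *   word_offset(2, 1, 1) == 0x0c  ->  CPU0_W1_STATUS at 0x1000_002c
 */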
Signed-off-by: Simon Arlott Cc: Mark Rutland Cc: devicetree@vger.kernel.org Cc: Ian Campbell Cc: Florian Fainelli Cc: Jason Cooper Cc: Pawel Moll Cc: linux-mips@linux-mips.org Cc: Marc Zyngier Cc: Kevin Cernekee Cc: Ralf Baechle Cc: Jonas Gorski Cc: Kumar Gala Cc: Rob Herring Link: http://lkml.kernel.org/r/5651D176.6030908@simon.arlott.org.uk Signed-off-by: Thomas Gleixner diff --git a/MAINTAINERS b/MAINTAINERS index 7f1fa4f..b5fab14f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2423,6 +2423,7 @@ F: arch/mips/bmips/* F: arch/mips/include/asm/mach-bmips/* F: arch/mips/kernel/*bmips* F: arch/mips/boot/dts/brcm/bcm*.dts* +F: drivers/irqchip/irq-bcm63* F: drivers/irqchip/irq-bcm7* F: drivers/irqchip/irq-brcmstb* F: include/linux/bcm963xx_nvram.h diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 57a945e..eb079ac 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -151,6 +151,7 @@ config BMIPS_GENERIC select CSRC_R4K select SYNC_R4K select COMMON_CLK + select BCM6345_L1_IRQ select BCM7038_L1_IRQ select BCM7120_L2_IRQ select BRCMSTB_L2_IRQ diff --git a/arch/mips/bmips/irq.c b/arch/mips/bmips/irq.c index e7fc6f934..7efefcf 100644 --- a/arch/mips/bmips/irq.c +++ b/arch/mips/bmips/irq.c @@ -15,6 +15,12 @@ #include #include +static const struct of_device_id smp_intc_dt_match[] = { + { .compatible = "brcm,bcm7038-l1-intc" }, + { .compatible = "brcm,bcm6345-l1-intc" }, + {} +}; + unsigned int get_c0_compare_int(void) { return CP0_LEGACY_COMPARE_IRQ; @@ -24,8 +30,8 @@ void __init arch_init_irq(void) { struct device_node *dn; - /* Only the STB (bcm7038) controller supports SMP IRQ affinity */ - dn = of_find_compatible_node(NULL, NULL, "brcm,bcm7038-l1-intc"); + /* Only these controllers support SMP IRQ affinity */ + dn = of_find_matching_node(NULL, smp_intc_dt_match); if (dn) of_node_put(dn); else diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index fb50911..abe7a57 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -78,6 +78,11 @@ config I8259 bool select IRQ_DOMAIN +config BCM6345_L1_IRQ + bool + select GENERIC_IRQ_CHIP + select IRQ_DOMAIN + config BCM7038_L1_IRQ bool select GENERIC_IRQ_CHIP diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 18caacb..d91d99d 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_XTENSA) += irq-xtensa-pic.o obj-$(CONFIG_XTENSA_MX) += irq-xtensa-mx.o obj-$(CONFIG_IRQ_CROSSBAR) += irq-crossbar.o obj-$(CONFIG_SOC_VF610) += irq-vf610-mscm-ir.o +obj-$(CONFIG_BCM6345_L1_IRQ) += irq-bcm6345-l1.o obj-$(CONFIG_BCM7038_L1_IRQ) += irq-bcm7038-l1.o obj-$(CONFIG_BCM7120_L2_IRQ) += irq-bcm7120-l2.o obj-$(CONFIG_BRCMSTB_L2_IRQ) += irq-brcmstb-l2.o diff --git a/drivers/irqchip/irq-bcm6345-l1.c b/drivers/irqchip/irq-bcm6345-l1.c new file mode 100644 index 0000000..b844c89 --- /dev/null +++ b/drivers/irqchip/irq-bcm6345-l1.c @@ -0,0 +1,364 @@ +/* + * Broadcom BCM6345 style Level 1 interrupt controller driver + * + * Copyright (C) 2014 Broadcom Corporation + * Copyright 2015 Simon Arlott + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This is based on the BCM7038 (which supports SMP) but with a single + * enable register instead of separate mask/set/clear registers. + * + * The BCM3380 has a similar mask/status register layout, but each pair + * of words is at separate locations (and SMP is not supported). 
+ * + * ENABLE/STATUS words are packed next to each other for each CPU: + * + * BCM6368: + * 0x1000_0020: CPU0_W0_ENABLE + * 0x1000_0024: CPU0_W1_ENABLE + * 0x1000_0028: CPU0_W0_STATUS IRQs 31-63 + * 0x1000_002c: CPU0_W1_STATUS IRQs 0-31 + * 0x1000_0030: CPU1_W0_ENABLE + * 0x1000_0034: CPU1_W1_ENABLE + * 0x1000_0038: CPU1_W0_STATUS IRQs 31-63 + * 0x1000_003c: CPU1_W1_STATUS IRQs 0-31 + * + * BCM63168: + * 0x1000_0020: CPU0_W0_ENABLE + * 0x1000_0024: CPU0_W1_ENABLE + * 0x1000_0028: CPU0_W2_ENABLE + * 0x1000_002c: CPU0_W3_ENABLE + * 0x1000_0030: CPU0_W0_STATUS IRQs 96-127 + * 0x1000_0034: CPU0_W1_STATUS IRQs 64-95 + * 0x1000_0038: CPU0_W2_STATUS IRQs 32-63 + * 0x1000_003c: CPU0_W3_STATUS IRQs 0-31 + * 0x1000_0040: CPU1_W0_ENABLE + * 0x1000_0044: CPU1_W1_ENABLE + * 0x1000_0048: CPU1_W2_ENABLE + * 0x1000_004c: CPU1_W3_ENABLE + * 0x1000_0050: CPU1_W0_STATUS IRQs 96-127 + * 0x1000_0054: CPU1_W1_STATUS IRQs 64-95 + * 0x1000_0058: CPU1_W2_STATUS IRQs 32-63 + * 0x1000_005c: CPU1_W3_STATUS IRQs 0-31 + * + * IRQs are numbered in CPU native endian order + * (which is big-endian in these examples) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IRQS_PER_WORD 32 +#define REG_BYTES_PER_IRQ_WORD (sizeof(u32) * 2) + +struct bcm6345_l1_cpu; + +struct bcm6345_l1_chip { + raw_spinlock_t lock; + unsigned int n_words; + struct irq_domain *domain; + struct cpumask cpumask; + struct bcm6345_l1_cpu *cpus[NR_CPUS]; +}; + +struct bcm6345_l1_cpu { + void __iomem *map_base; + unsigned int parent_irq; + u32 enable_cache[]; +}; + +static inline unsigned int reg_enable(struct bcm6345_l1_chip *intc, + unsigned int word) +{ +#ifdef __BIG_ENDIAN + return (1 * intc->n_words - word - 1) * sizeof(u32); +#else + return (0 * intc->n_words + word) * sizeof(u32); +#endif +} + +static inline unsigned int reg_status(struct bcm6345_l1_chip *intc, + unsigned int word) +{ +#ifdef __BIG_ENDIAN + return (2 * intc->n_words - word - 1) * sizeof(u32); +#else + return (1 * intc->n_words + word) * sizeof(u32); +#endif +} + +static inline unsigned int cpu_for_irq(struct bcm6345_l1_chip *intc, + struct irq_data *d) +{ + return cpumask_first_and(&intc->cpumask, irq_data_get_affinity_mask(d)); +} + +static void bcm6345_l1_irq_handle(struct irq_desc *desc) +{ + struct bcm6345_l1_chip *intc = irq_desc_get_handler_data(desc); + struct bcm6345_l1_cpu *cpu; + struct irq_chip *chip = irq_desc_get_chip(desc); + unsigned int idx; + +#ifdef CONFIG_SMP + cpu = intc->cpus[cpu_logical_map(smp_processor_id())]; +#else + cpu = intc->cpus[0]; +#endif + + chained_irq_enter(chip, desc); + + for (idx = 0; idx < intc->n_words; idx++) { + int base = idx * IRQS_PER_WORD; + unsigned long pending; + irq_hw_number_t hwirq; + unsigned int irq; + + pending = __raw_readl(cpu->map_base + reg_status(intc, idx)); + pending &= __raw_readl(cpu->map_base + reg_enable(intc, idx)); + + for_each_set_bit(hwirq, &pending, IRQS_PER_WORD) { + irq = irq_linear_revmap(intc->domain, base + hwirq); + if (irq) + do_IRQ(irq); + else + spurious_interrupt(); + } + } + + chained_irq_exit(chip, desc); +} + +static inline void __bcm6345_l1_unmask(struct irq_data *d) +{ + struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d); + u32 word = d->hwirq / IRQS_PER_WORD; + u32 mask = BIT(d->hwirq % IRQS_PER_WORD); + unsigned int cpu_idx = cpu_for_irq(intc, d); + + 
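	/*
	 * There is no separate mask-set/mask-clear register pair: update
	 * the cached enable word for the CPU this IRQ is routed to and
	 * write the whole word back.  Callers hold intc->lock, so the
	 * cache and the hardware register stay in sync.
	 */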
intc->cpus[cpu_idx]->enable_cache[word] |= mask; + __raw_writel(intc->cpus[cpu_idx]->enable_cache[word], + intc->cpus[cpu_idx]->map_base + reg_enable(intc, word)); +} + +static inline void __bcm6345_l1_mask(struct irq_data *d) +{ + struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d); + u32 word = d->hwirq / IRQS_PER_WORD; + u32 mask = BIT(d->hwirq % IRQS_PER_WORD); + unsigned int cpu_idx = cpu_for_irq(intc, d); + + intc->cpus[cpu_idx]->enable_cache[word] &= ~mask; + __raw_writel(intc->cpus[cpu_idx]->enable_cache[word], + intc->cpus[cpu_idx]->map_base + reg_enable(intc, word)); +} + +static void bcm6345_l1_unmask(struct irq_data *d) +{ + struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d); + unsigned long flags; + + raw_spin_lock_irqsave(&intc->lock, flags); + __bcm6345_l1_unmask(d); + raw_spin_unlock_irqrestore(&intc->lock, flags); +} + +static void bcm6345_l1_mask(struct irq_data *d) +{ + struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d); + unsigned long flags; + + raw_spin_lock_irqsave(&intc->lock, flags); + __bcm6345_l1_mask(d); + raw_spin_unlock_irqrestore(&intc->lock, flags); +} + +static int bcm6345_l1_set_affinity(struct irq_data *d, + const struct cpumask *dest, + bool force) +{ + struct bcm6345_l1_chip *intc = irq_data_get_irq_chip_data(d); + u32 word = d->hwirq / IRQS_PER_WORD; + u32 mask = BIT(d->hwirq % IRQS_PER_WORD); + unsigned int old_cpu = cpu_for_irq(intc, d); + unsigned int new_cpu; + struct cpumask valid; + unsigned long flags; + bool enabled; + + if (!cpumask_and(&valid, &intc->cpumask, dest)) + return -EINVAL; + + new_cpu = cpumask_any_and(&valid, cpu_online_mask); + if (new_cpu >= nr_cpu_ids) + return -EINVAL; + + dest = cpumask_of(new_cpu); + + raw_spin_lock_irqsave(&intc->lock, flags); + if (old_cpu != new_cpu) { + enabled = intc->cpus[old_cpu]->enable_cache[word] & mask; + if (enabled) + __bcm6345_l1_mask(d); + cpumask_copy(irq_data_get_affinity_mask(d), dest); + if (enabled) + __bcm6345_l1_unmask(d); + } else { + cpumask_copy(irq_data_get_affinity_mask(d), dest); + } + raw_spin_unlock_irqrestore(&intc->lock, flags); + + return IRQ_SET_MASK_OK_NOCOPY; +} + +static int __init bcm6345_l1_init_one(struct device_node *dn, + unsigned int idx, + struct bcm6345_l1_chip *intc) +{ + struct resource res; + resource_size_t sz; + struct bcm6345_l1_cpu *cpu; + unsigned int i, n_words; + + if (of_address_to_resource(dn, idx, &res)) + return -EINVAL; + sz = resource_size(&res); + n_words = sz / REG_BYTES_PER_IRQ_WORD; + + if (!intc->n_words) + intc->n_words = n_words; + else if (intc->n_words != n_words) + return -EINVAL; + + cpu = intc->cpus[idx] = kzalloc(sizeof(*cpu) + n_words * sizeof(u32), + GFP_KERNEL); + if (!cpu) + return -ENOMEM; + + cpu->map_base = ioremap(res.start, sz); + if (!cpu->map_base) + return -ENOMEM; + + for (i = 0; i < n_words; i++) { + cpu->enable_cache[i] = 0; + __raw_writel(0, cpu->map_base + reg_enable(intc, i)); + } + + cpu->parent_irq = irq_of_parse_and_map(dn, idx); + if (!cpu->parent_irq) { + pr_err("failed to map parent interrupt %d\n", cpu->parent_irq); + return -EINVAL; + } + irq_set_chained_handler_and_data(cpu->parent_irq, + bcm6345_l1_irq_handle, intc); + + return 0; +} + +static struct irq_chip bcm6345_l1_irq_chip = { + .name = "bcm6345-l1", + .irq_mask = bcm6345_l1_mask, + .irq_unmask = bcm6345_l1_unmask, + .irq_set_affinity = bcm6345_l1_set_affinity, +}; + +static int bcm6345_l1_map(struct irq_domain *d, unsigned int virq, + irq_hw_number_t hw_irq) +{ + irq_set_chip_and_handler(virq, + &bcm6345_l1_irq_chip, 
handle_percpu_irq); + irq_set_chip_data(virq, d->host_data); + return 0; +} + +static const struct irq_domain_ops bcm6345_l1_domain_ops = { + .xlate = irq_domain_xlate_onecell, + .map = bcm6345_l1_map, +}; + +static int __init bcm6345_l1_of_init(struct device_node *dn, + struct device_node *parent) +{ + struct bcm6345_l1_chip *intc; + unsigned int idx; + int ret; + + intc = kzalloc(sizeof(*intc), GFP_KERNEL); + if (!intc) + return -ENOMEM; + + for_each_possible_cpu(idx) { + ret = bcm6345_l1_init_one(dn, idx, intc); + if (ret) + pr_err("failed to init intc L1 for cpu %d: %d\n", + idx, ret); + else + cpumask_set_cpu(idx, &intc->cpumask); + } + + if (!cpumask_weight(&intc->cpumask)) { + ret = -ENODEV; + goto out_free; + } + + raw_spin_lock_init(&intc->lock); + + intc->domain = irq_domain_add_linear(dn, IRQS_PER_WORD * intc->n_words, + &bcm6345_l1_domain_ops, + intc); + if (!intc->domain) { + ret = -ENOMEM; + goto out_unmap; + } + + pr_info("registered BCM6345 L1 intc (IRQs: %d)\n", + IRQS_PER_WORD * intc->n_words); + for_each_cpu(idx, &intc->cpumask) { + struct bcm6345_l1_cpu *cpu = intc->cpus[idx]; + + pr_info(" CPU%u at MMIO 0x%p (irq = %d)\n", idx, + cpu->map_base, cpu->parent_irq); + } + + return 0; + +out_unmap: + for_each_possible_cpu(idx) { + struct bcm6345_l1_cpu *cpu = intc->cpus[idx]; + + if (cpu) { + if (cpu->map_base) + iounmap(cpu->map_base); + kfree(cpu); + } + } +out_free: + kfree(intc); + return ret; +} + +IRQCHIP_DECLARE(bcm6345_l1, "brcm,bcm6345-l1-intc", bcm6345_l1_of_init); -- cgit v0.10.2 From fbf198030e0b027538c290300cfaf9e2efdce122 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Feb 2016 19:52:23 +0100 Subject: genirq: Add default affinity mask command line option If we isolate CPUs, then we don't want random device interrupts on them. Even w/o the user space irq balancer enabled we can end up with irqs on non boot cpus and chasing newly requested interrupts is a tedious task. Allow to restrict the default irq affinity mask. Signed-off-by: Thomas Gleixner Cc: Rik van Riel Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Sebastian Siewior Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1602031948190.25254@nanos Signed-off-by: Thomas Gleixner diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 551ecf0..87298f8 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1687,6 +1687,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. ip= [IP_PNP] See Documentation/filesystems/nfs/nfsroot.txt. + irqaffinity= [SMP] Set the default irq affinity mask + Format: + ,..., + or + - + (must be a positive range in ascending order) + or a mixture + ,...,- + irqfixup [HW] When an interrupt is not handled search all handlers for it. Intended to get systems with badly broken diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 0409da0..0ccd028 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -24,10 +24,27 @@ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) +static int __init irq_affinity_setup(char *str) +{ + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + cpulist_parse(str, irq_default_affinity); + /* + * Set at least the boot cpu. 
We don't want to end up with + * bugreports caused by random comandline masks + */ + cpumask_set_cpu(smp_processor_id(), irq_default_affinity); + return 1; +} +__setup("irqaffinity=", irq_affinity_setup); + static void __init init_irq_default_affinity(void) { - alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); - cpumask_setall(irq_default_affinity); +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!irq_default_affinity) + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); +#endif + if (cpumask_empty(irq_default_affinity)) + cpumask_setall(irq_default_affinity); } #else static void __init init_irq_default_affinity(void) -- cgit v0.10.2 From fa9cbf320e996eaa3d219344b6f7013b096cafd9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 8 Feb 2016 17:09:04 +0100 Subject: perf/x86: Move perf_event.c ............... => x86/events/core.c Also, keep the churn at minimum by adjusting the include "perf_event.h" when each file gets moved. Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1454947748-28629-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 1538562..eb3abf8 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -1,6 +1,7 @@ - obj-y += entry/ +obj-$(CONFIG_PERF_EVENTS) += events/ + obj-$(CONFIG_KVM) += kvm/ # Xen paravirtualization support diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile new file mode 100644 index 0000000..3fad3ce --- /dev/null +++ b/arch/x86/events/Makefile @@ -0,0 +1 @@ +obj-y += core.o diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c new file mode 100644 index 0000000..90ca601 --- /dev/null +++ b/arch/x86/events/core.c @@ -0,0 +1,2429 @@ +/* + * Performance events x86 architecture code + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2009 Intel Corporation, + * Copyright (C) 2009 Google, Inc., Stephane Eranian + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kernel/cpu/perf_event.h" + +struct x86_pmu x86_pmu __read_mostly; + +DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { + .enabled = 1, +}; + +struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE; + +u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; +u64 __read_mostly hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +/* + * Propagate event elapsed time into the generic event. + * Can only be executed on the CPU where the event is active. + * Returns the delta events processed. + */ +u64 x86_perf_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int shift = 64 - x86_pmu.cntval_bits; + u64 prev_raw_count, new_raw_count; + int idx = hwc->idx; + s64 delta; + + if (idx == INTEL_PMC_IDX_FIXED_BTS) + return 0; + + /* + * Careful: an NMI might modify the previous event value. 
+ * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic event atomically: + */ +again: + prev_raw_count = local64_read(&hwc->prev_count); + rdpmcl(hwc->event_base_rdpmc, new_raw_count); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); + + return new_raw_count; +} + +/* + * Find and validate any extra registers to set up. + */ +static int x86_pmu_extra_regs(u64 config, struct perf_event *event) +{ + struct hw_perf_event_extra *reg; + struct extra_reg *er; + + reg = &event->hw.extra_reg; + + if (!x86_pmu.extra_regs) + return 0; + + for (er = x86_pmu.extra_regs; er->msr; er++) { + if (er->event != (config & er->config_mask)) + continue; + if (event->attr.config1 & ~er->valid_mask) + return -EINVAL; + /* Check if the extra msrs can be safely accessed*/ + if (!er->extra_msr_access) + return -ENXIO; + + reg->idx = er->idx; + reg->config = event->attr.config1; + reg->reg = er->msr; + break; + } + return 0; +} + +static atomic_t active_events; +static atomic_t pmc_refcount; +static DEFINE_MUTEX(pmc_reserve_mutex); + +#ifdef CONFIG_X86_LOCAL_APIC + +static bool reserve_pmc_hardware(void) +{ + int i; + + for (i = 0; i < x86_pmu.num_counters; i++) { + if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) + goto perfctr_fail; + } + + for (i = 0; i < x86_pmu.num_counters; i++) { + if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) + goto eventsel_fail; + } + + return true; + +eventsel_fail: + for (i--; i >= 0; i--) + release_evntsel_nmi(x86_pmu_config_addr(i)); + + i = x86_pmu.num_counters; + +perfctr_fail: + for (i--; i >= 0; i--) + release_perfctr_nmi(x86_pmu_event_addr(i)); + + return false; +} + +static void release_pmc_hardware(void) +{ + int i; + + for (i = 0; i < x86_pmu.num_counters; i++) { + release_perfctr_nmi(x86_pmu_event_addr(i)); + release_evntsel_nmi(x86_pmu_config_addr(i)); + } +} + +#else + +static bool reserve_pmc_hardware(void) { return true; } +static void release_pmc_hardware(void) {} + +#endif + +static bool check_hw_exists(void) +{ + u64 val, val_fail, val_new= ~0; + int i, reg, reg_fail, ret = 0; + int bios_fail = 0; + int reg_safe = -1; + + /* + * Check to see if the BIOS enabled any of the counters, if so + * complain and bail. + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + reg = x86_pmu_config_addr(i); + ret = rdmsrl_safe(reg, &val); + if (ret) + goto msr_fail; + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) { + bios_fail = 1; + val_fail = val; + reg_fail = reg; + } else { + reg_safe = i; + } + } + + if (x86_pmu.num_counters_fixed) { + reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + ret = rdmsrl_safe(reg, &val); + if (ret) + goto msr_fail; + for (i = 0; i < x86_pmu.num_counters_fixed; i++) { + if (val & (0x03 << i*4)) { + bios_fail = 1; + val_fail = val; + reg_fail = reg; + } + } + } + + /* + * If all the counters are enabled, the below test will always + * fail. The tools will also become useless in this scenario. + * Just fail and disable the hardware counters. 
+ */ + + if (reg_safe == -1) { + reg = reg_safe; + goto msr_fail; + } + + /* + * Read the current value, change it and read it back to see if it + * matches, this is needed to detect certain hardware emulators + * (qemu/kvm) that don't trap on the MSR access and always return 0s. + */ + reg = x86_pmu_event_addr(reg_safe); + if (rdmsrl_safe(reg, &val)) + goto msr_fail; + val ^= 0xffffUL; + ret = wrmsrl_safe(reg, val); + ret |= rdmsrl_safe(reg, &val_new); + if (ret || val != val_new) + goto msr_fail; + + /* + * We still allow the PMU driver to operate: + */ + if (bios_fail) { + pr_cont("Broken BIOS detected, complain to your hardware vendor.\n"); + pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", + reg_fail, val_fail); + } + + return true; + +msr_fail: + pr_cont("Broken PMU hardware detected, using software events only.\n"); + pr_info("%sFailed to access perfctr msr (MSR %x is %Lx)\n", + boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR, + reg, val_new); + + return false; +} + +static void hw_perf_event_destroy(struct perf_event *event) +{ + x86_release_hardware(); + atomic_dec(&active_events); +} + +void hw_perf_lbr_event_destroy(struct perf_event *event) +{ + hw_perf_event_destroy(event); + + /* undo the lbr/bts event accounting */ + x86_del_exclusive(x86_lbr_exclusive_lbr); +} + +static inline int x86_pmu_initialized(void) +{ + return x86_pmu.handle_irq != NULL; +} + +static inline int +set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + unsigned int cache_type, cache_op, cache_result; + u64 config, val; + + config = attr->config; + + cache_type = (config >> 0) & 0xff; + if (cache_type >= PERF_COUNT_HW_CACHE_MAX) + return -EINVAL; + + cache_op = (config >> 8) & 0xff; + if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) + return -EINVAL; + + cache_result = (config >> 16) & 0xff; + if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + val = hw_cache_event_ids[cache_type][cache_op][cache_result]; + + if (val == 0) + return -ENOENT; + + if (val == -1) + return -EINVAL; + + hwc->config |= val; + attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result]; + return x86_pmu_extra_regs(val, event); +} + +int x86_reserve_hardware(void) +{ + int err = 0; + + if (!atomic_inc_not_zero(&pmc_refcount)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&pmc_refcount) == 0) { + if (!reserve_pmc_hardware()) + err = -EBUSY; + else + reserve_ds_buffers(); + } + if (!err) + atomic_inc(&pmc_refcount); + mutex_unlock(&pmc_reserve_mutex); + } + + return err; +} + +void x86_release_hardware(void) +{ + if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { + release_pmc_hardware(); + release_ds_buffers(); + mutex_unlock(&pmc_reserve_mutex); + } +} + +/* + * Check if we can create event of a certain type (that no conflicting events + * are present). 
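 *
 * The x86_pmu.lbr_exclusive[] array keeps a reference count per
 * exclusive event class; taking a class succeeds only while no
 * conflicting class has active users, and x86_del_exclusive()
 * drops the reference again.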
+ */ +int x86_add_exclusive(unsigned int what) +{ + int i; + + if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { + mutex_lock(&pmc_reserve_mutex); + for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { + if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) + goto fail_unlock; + } + atomic_inc(&x86_pmu.lbr_exclusive[what]); + mutex_unlock(&pmc_reserve_mutex); + } + + atomic_inc(&active_events); + return 0; + +fail_unlock: + mutex_unlock(&pmc_reserve_mutex); + return -EBUSY; +} + +void x86_del_exclusive(unsigned int what) +{ + atomic_dec(&x86_pmu.lbr_exclusive[what]); + atomic_dec(&active_events); +} + +int x86_setup_perfctr(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + u64 config; + + if (!is_sampling_event(event)) { + hwc->sample_period = x86_pmu.max_period; + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + } + + if (attr->type == PERF_TYPE_RAW) + return x86_pmu_extra_regs(event->attr.config, event); + + if (attr->type == PERF_TYPE_HW_CACHE) + return set_ext_hw_attr(hwc, event); + + if (attr->config >= x86_pmu.max_events) + return -EINVAL; + + /* + * The generic map: + */ + config = x86_pmu.event_map(attr->config); + + if (config == 0) + return -ENOENT; + + if (config == -1LL) + return -EINVAL; + + /* + * Branch tracing: + */ + if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && + !attr->freq && hwc->sample_period == 1) { + /* BTS is not supported by this architecture. */ + if (!x86_pmu.bts_active) + return -EOPNOTSUPP; + + /* BTS is currently only allowed for user-mode. */ + if (!attr->exclude_kernel) + return -EOPNOTSUPP; + + /* disallow bts if conflicting events are present */ + if (x86_add_exclusive(x86_lbr_exclusive_lbr)) + return -EBUSY; + + event->destroy = hw_perf_lbr_event_destroy; + } + + hwc->config |= config; + + return 0; +} + +/* + * check that branch_sample_type is compatible with + * settings needed for precise_ip > 1 which implies + * using the LBR to capture ALL taken branches at the + * priv levels of the measurement + */ +static inline int precise_br_compat(struct perf_event *event) +{ + u64 m = event->attr.branch_sample_type; + u64 b = 0; + + /* must capture all branches */ + if (!(m & PERF_SAMPLE_BRANCH_ANY)) + return 0; + + m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_user) + b |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + b |= PERF_SAMPLE_BRANCH_KERNEL; + + /* + * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 + */ + + return m == b; +} + +int x86_pmu_hw_config(struct perf_event *event) +{ + if (event->attr.precise_ip) { + int precise = 0; + + /* Support for constant skid */ + if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { + precise++; + + /* Support for IP fixup */ + if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) + precise++; + + if (x86_pmu.pebs_prec_dist) + precise++; + } + + if (event->attr.precise_ip > precise) + return -EOPNOTSUPP; + } + /* + * check that PEBS LBR correction does not conflict with + * whatever the user is asking with attr->branch_sample_type + */ + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) { + u64 *br_type = &event->attr.branch_sample_type; + + if (has_branch_stack(event)) { + if (!precise_br_compat(event)) + return -EOPNOTSUPP; + + /* branch_sample_type is compatible */ + + } else { + /* + * user did not specify branch_sample_type + * + * For PEBS fixups, we capture all + * the branches at the 
priv level of the + * event. + */ + *br_type = PERF_SAMPLE_BRANCH_ANY; + + if (!event->attr.exclude_user) + *br_type |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + *br_type |= PERF_SAMPLE_BRANCH_KERNEL; + } + } + + if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK) + event->attach_state |= PERF_ATTACH_TASK_DATA; + + /* + * Generate PMC IRQs: + * (keep 'enabled' bit clear for now) + */ + event->hw.config = ARCH_PERFMON_EVENTSEL_INT; + + /* + * Count user and OS events unless requested not to + */ + if (!event->attr.exclude_user) + event->hw.config |= ARCH_PERFMON_EVENTSEL_USR; + if (!event->attr.exclude_kernel) + event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; + + if (event->attr.type == PERF_TYPE_RAW) + event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; + + if (event->attr.sample_period && x86_pmu.limit_period) { + if (x86_pmu.limit_period(event, event->attr.sample_period) > + event->attr.sample_period) + return -EINVAL; + } + + return x86_setup_perfctr(event); +} + +/* + * Setup the hardware configuration for a given attr_type + */ +static int __x86_pmu_event_init(struct perf_event *event) +{ + int err; + + if (!x86_pmu_initialized()) + return -ENODEV; + + err = x86_reserve_hardware(); + if (err) + return err; + + atomic_inc(&active_events); + event->destroy = hw_perf_event_destroy; + + event->hw.idx = -1; + event->hw.last_cpu = -1; + event->hw.last_tag = ~0ULL; + + /* mark unused */ + event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; + + return x86_pmu.hw_config(event); +} + +void x86_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + u64 val; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + rdmsrl(x86_pmu_config_addr(idx), val); + if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) + continue; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(x86_pmu_config_addr(idx), val); + } +} + +static void x86_pmu_disable(struct pmu *pmu) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!x86_pmu_initialized()) + return; + + if (!cpuc->enabled) + return; + + cpuc->n_added = 0; + cpuc->enabled = 0; + barrier(); + + x86_pmu.disable_all(); +} + +void x86_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct hw_perf_event *hwc = &cpuc->events[idx]->hw; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); + } +} + +static struct pmu pmu; + +static inline int is_x86_event(struct perf_event *event) +{ + return event->pmu == &pmu; +} + +/* + * Event scheduler state: + * + * Assign events iterating over all events and counters, beginning + * with events with least weights first. Keep the current iterator + * state in struct sched_state. + */ +struct sched_state { + int weight; + int event; /* event index */ + int counter; /* counter index */ + int unassigned; /* number of events to be assigned left */ + int nr_gp; /* number of GP counters used */ + unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; +}; + +/* Total max is X86_PMC_IDX_MAX, but we are O(n!) 
limited */ +#define SCHED_STATES_MAX 2 + +struct perf_sched { + int max_weight; + int max_events; + int max_gp; + int saved_states; + struct event_constraint **constraints; + struct sched_state state; + struct sched_state saved[SCHED_STATES_MAX]; +}; + +/* + * Initialize interator that runs through all events and counters. + */ +static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, + int num, int wmin, int wmax, int gpmax) +{ + int idx; + + memset(sched, 0, sizeof(*sched)); + sched->max_events = num; + sched->max_weight = wmax; + sched->max_gp = gpmax; + sched->constraints = constraints; + + for (idx = 0; idx < num; idx++) { + if (constraints[idx]->weight == wmin) + break; + } + + sched->state.event = idx; /* start with min weight */ + sched->state.weight = wmin; + sched->state.unassigned = num; +} + +static void perf_sched_save_state(struct perf_sched *sched) +{ + if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) + return; + + sched->saved[sched->saved_states] = sched->state; + sched->saved_states++; +} + +static bool perf_sched_restore_state(struct perf_sched *sched) +{ + if (!sched->saved_states) + return false; + + sched->saved_states--; + sched->state = sched->saved[sched->saved_states]; + + /* continue with next counter: */ + clear_bit(sched->state.counter++, sched->state.used); + + return true; +} + +/* + * Select a counter for the current event to schedule. Return true on + * success. + */ +static bool __perf_sched_find_counter(struct perf_sched *sched) +{ + struct event_constraint *c; + int idx; + + if (!sched->state.unassigned) + return false; + + if (sched->state.event >= sched->max_events) + return false; + + c = sched->constraints[sched->state.event]; + /* Prefer fixed purpose counters */ + if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { + idx = INTEL_PMC_IDX_FIXED; + for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { + if (!__test_and_set_bit(idx, sched->state.used)) + goto done; + } + } + + /* Grab the first unused counter starting with idx */ + idx = sched->state.counter; + for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { + if (!__test_and_set_bit(idx, sched->state.used)) { + if (sched->state.nr_gp++ >= sched->max_gp) + return false; + + goto done; + } + } + + return false; + +done: + sched->state.counter = idx; + + if (c->overlap) + perf_sched_save_state(sched); + + return true; +} + +static bool perf_sched_find_counter(struct perf_sched *sched) +{ + while (!__perf_sched_find_counter(sched)) { + if (!perf_sched_restore_state(sched)) + return false; + } + + return true; +} + +/* + * Go through all unassigned events and find the next one to schedule. + * Take events with the least weight first. Return true on success. + */ +static bool perf_sched_next_event(struct perf_sched *sched) +{ + struct event_constraint *c; + + if (!sched->state.unassigned || !--sched->state.unassigned) + return false; + + do { + /* next event */ + sched->state.event++; + if (sched->state.event >= sched->max_events) { + /* next weight */ + sched->state.event = 0; + sched->state.weight++; + if (sched->state.weight > sched->max_weight) + return false; + } + c = sched->constraints[sched->state.event]; + } while (c->weight != sched->state.weight); + + sched->state.counter = 0; /* start with first counter */ + + return true; +} + +/* + * Assign a counter for each event. 
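 *
 * If 'assign' is non-NULL it is filled with the counter index chosen
 * for each event.  The return value is the number of events that could
 * not be assigned, i.e. 0 on complete success.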
+ */ +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int gpmax, int *assign) +{ + struct perf_sched sched; + + perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax); + + do { + if (!perf_sched_find_counter(&sched)) + break; /* failed */ + if (assign) + assign[sched.state.event] = sched.state.counter; + } while (perf_sched_next_event(&sched)); + + return sched.state.unassigned; +} +EXPORT_SYMBOL_GPL(perf_assign_events); + +int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) +{ + struct event_constraint *c; + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + struct perf_event *e; + int i, wmin, wmax, unsched = 0; + struct hw_perf_event *hwc; + + bitmap_zero(used_mask, X86_PMC_IDX_MAX); + + if (x86_pmu.start_scheduling) + x86_pmu.start_scheduling(cpuc); + + for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { + cpuc->event_constraint[i] = NULL; + c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); + cpuc->event_constraint[i] = c; + + wmin = min(wmin, c->weight); + wmax = max(wmax, c->weight); + } + + /* + * fastpath, try to reuse previous register + */ + for (i = 0; i < n; i++) { + hwc = &cpuc->event_list[i]->hw; + c = cpuc->event_constraint[i]; + + /* never assigned */ + if (hwc->idx == -1) + break; + + /* constraint still honored */ + if (!test_bit(hwc->idx, c->idxmsk)) + break; + + /* not already used */ + if (test_bit(hwc->idx, used_mask)) + break; + + __set_bit(hwc->idx, used_mask); + if (assign) + assign[i] = hwc->idx; + } + + /* slow path */ + if (i != n) { + int gpmax = x86_pmu.num_counters; + + /* + * Do not allow scheduling of more than half the available + * generic counters. + * + * This helps avoid counter starvation of sibling thread by + * ensuring at most half the counters cannot be in exclusive + * mode. There is no designated counters for the limits. Any + * N/2 counters can be used. This helps with events with + * specific counter constraints. + */ + if (is_ht_workaround_enabled() && !cpuc->is_fake && + READ_ONCE(cpuc->excl_cntrs->exclusive_present)) + gpmax /= 2; + + unsched = perf_assign_events(cpuc->event_constraint, n, wmin, + wmax, gpmax, assign); + } + + /* + * In case of success (unsched = 0), mark events as committed, + * so we do not put_constraint() in case new events are added + * and fail to be scheduled + * + * We invoke the lower level commit callback to lock the resource + * + * We do not need to do all of this in case we are called to + * validate an event group (assign == NULL) + */ + if (!unsched && assign) { + for (i = 0; i < n; i++) { + e = cpuc->event_list[i]; + e->hw.flags |= PERF_X86_EVENT_COMMITTED; + if (x86_pmu.commit_scheduling) + x86_pmu.commit_scheduling(cpuc, i, assign[i]); + } + } else { + for (i = 0; i < n; i++) { + e = cpuc->event_list[i]; + /* + * do not put_constraint() on comitted events, + * because they are good to go + */ + if ((e->hw.flags & PERF_X86_EVENT_COMMITTED)) + continue; + + /* + * release events that failed scheduling + */ + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, e); + } + } + + if (x86_pmu.stop_scheduling) + x86_pmu.stop_scheduling(cpuc); + + return unsched ? 
-EINVAL : 0; +} + +/* + * dogrp: true if must collect siblings events (group) + * returns total number of events and error code + */ +static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) +{ + struct perf_event *event; + int n, max_count; + + max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed; + + /* current number of events already accepted */ + n = cpuc->n_events; + + if (is_x86_event(leader)) { + if (n >= max_count) + return -EINVAL; + cpuc->event_list[n] = leader; + n++; + } + if (!dogrp) + return n; + + list_for_each_entry(event, &leader->sibling_list, group_entry) { + if (!is_x86_event(event) || + event->state <= PERF_EVENT_STATE_OFF) + continue; + + if (n >= max_count) + return -EINVAL; + + cpuc->event_list[n] = event; + n++; + } + return n; +} + +static inline void x86_assign_hw_event(struct perf_event *event, + struct cpu_hw_events *cpuc, int i) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = cpuc->assign[i]; + hwc->last_cpu = smp_processor_id(); + hwc->last_tag = ++cpuc->tags[i]; + + if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) { + hwc->config_base = 0; + hwc->event_base = 0; + } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) { + hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); + hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30; + } else { + hwc->config_base = x86_pmu_config_addr(hwc->idx); + hwc->event_base = x86_pmu_event_addr(hwc->idx); + hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); + } +} + +static inline int match_prev_assignment(struct hw_perf_event *hwc, + struct cpu_hw_events *cpuc, + int i) +{ + return hwc->idx == cpuc->assign[i] && + hwc->last_cpu == smp_processor_id() && + hwc->last_tag == cpuc->tags[i]; +} + +static void x86_pmu_start(struct perf_event *event, int flags); + +static void x86_pmu_enable(struct pmu *pmu) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event; + struct hw_perf_event *hwc; + int i, added = cpuc->n_added; + + if (!x86_pmu_initialized()) + return; + + if (cpuc->enabled) + return; + + if (cpuc->n_added) { + int n_running = cpuc->n_events - cpuc->n_added; + /* + * apply assignment obtained either from + * hw_perf_group_sched_in() or x86_pmu_enable() + * + * step1: save events moving to new counters + */ + for (i = 0; i < n_running; i++) { + event = cpuc->event_list[i]; + hwc = &event->hw; + + /* + * we can avoid reprogramming counter if: + * - assigned same counter as last time + * - running on same CPU as last time + * - no other event has used the counter since + */ + if (hwc->idx == -1 || + match_prev_assignment(hwc, cpuc, i)) + continue; + + /* + * Ensure we don't accidentally enable a stopped + * counter simply because we rescheduled. 
+ */ + if (hwc->state & PERF_HES_STOPPED) + hwc->state |= PERF_HES_ARCH; + + x86_pmu_stop(event, PERF_EF_UPDATE); + } + + /* + * step2: reprogram moved events into new counters + */ + for (i = 0; i < cpuc->n_events; i++) { + event = cpuc->event_list[i]; + hwc = &event->hw; + + if (!match_prev_assignment(hwc, cpuc, i)) + x86_assign_hw_event(event, cpuc, i); + else if (i < n_running) + continue; + + if (hwc->state & PERF_HES_ARCH) + continue; + + x86_pmu_start(event, PERF_EF_RELOAD); + } + cpuc->n_added = 0; + perf_events_lapic_init(); + } + + cpuc->enabled = 1; + barrier(); + + x86_pmu.enable_all(added); +} + +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); + +/* + * Set the next IRQ period, based on the hwc->period_left value. + * To be called with the event disabled in hw: + */ +int x86_perf_event_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + s64 left = local64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int ret = 0, idx = hwc->idx; + + if (idx == INTEL_PMC_IDX_FIXED_BTS) + return 0; + + /* + * If we are way outside a reasonable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + + if (unlikely(left <= 0)) { + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + /* + * Quirk: certain CPUs dont like it if just 1 hw_event is left: + */ + if (unlikely(left < 2)) + left = 2; + + if (left > x86_pmu.max_period) + left = x86_pmu.max_period; + + if (x86_pmu.limit_period) + left = x86_pmu.limit_period(event, left); + + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; + + if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) || + local64_read(&hwc->prev_count) != (u64)-left) { + /* + * The hw event starts counting from this event offset, + * mark it to be able to extra future deltas: + */ + local64_set(&hwc->prev_count, (u64)-left); + + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); + } + + /* + * Due to erratum on certan cpu we need + * a second write to be sure the register + * is updated properly + */ + if (x86_pmu.perfctr_second_write) { + wrmsrl(hwc->event_base, + (u64)(-left) & x86_pmu.cntval_mask); + } + + perf_event_update_userpage(event); + + return ret; +} + +void x86_pmu_enable_event(struct perf_event *event) +{ + if (__this_cpu_read(cpu_hw_events.enabled)) + __x86_pmu_enable_event(&event->hw, + ARCH_PERFMON_EVENTSEL_ENABLE); +} + +/* + * Add a single event to the PMU. + * + * The event is added to the group of enabled events + * but only if it can be scehduled with existing events. + */ +static int x86_pmu_add(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc; + int assign[X86_PMC_IDX_MAX]; + int n, n0, ret; + + hwc = &event->hw; + + n0 = cpuc->n_events; + ret = n = collect_events(cpuc, event, false); + if (ret < 0) + goto out; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + /* + * If group events scheduling transaction was started, + * skip the schedulability test here, it will be performed + * at commit time (->commit_txn) as a whole. 
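A small worked example of the (-left) programming used by x86_perf_event_set_period() above: the counter is loaded with the two's complement of the remaining period, truncated to the counter width, so that it overflows (and raises a PMI) after exactly 'left' increments. Standalone C with made-up values; the actual MSR write is of course kernel-only.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int cntval_bits = 48;				/* typical counter width */
	uint64_t cntval_mask = (1ULL << cntval_bits) - 1;
	int64_t left = 100000;				/* remaining sample period */

	/* what wrmsrl(hwc->event_base, ...) programs into the counter */
	uint64_t programmed = (uint64_t)(-left) & cntval_mask;

	/* after 'left' increments the counter wraps to zero and overflows */
	uint64_t after = (programmed + (uint64_t)left) & cntval_mask;

	printf("programmed=%#llx after=%#llx\n",
	       (unsigned long long)programmed, (unsigned long long)after);
	return 0;
}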
+ */ + if (cpuc->txn_flags & PERF_PMU_TXN_ADD) + goto done_collect; + + ret = x86_pmu.schedule_events(cpuc, n, assign); + if (ret) + goto out; + /* + * copy new assignment, now we know it is possible + * will be used by hw_perf_enable() + */ + memcpy(cpuc->assign, assign, n*sizeof(int)); + +done_collect: + /* + * Commit the collect_events() state. See x86_pmu_del() and + * x86_pmu_*_txn(). + */ + cpuc->n_events = n; + cpuc->n_added += n - n0; + cpuc->n_txn += n - n0; + + ret = 0; +out: + return ret; +} + +static void x86_pmu_start(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx = event->hw.idx; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(idx == -1)) + return; + + if (flags & PERF_EF_RELOAD) { + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + x86_perf_event_set_period(event); + } + + event->hw.state = 0; + + cpuc->events[idx] = event; + __set_bit(idx, cpuc->active_mask); + __set_bit(idx, cpuc->running); + x86_pmu.enable(event); + perf_event_update_userpage(event); +} + +void perf_event_print_debug(void) +{ + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; + u64 pebs, debugctl; + struct cpu_hw_events *cpuc; + unsigned long flags; + int cpu, idx; + + if (!x86_pmu.num_counters) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_events, cpu); + + if (x86_pmu.version >= 2) { + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); + + pr_info("\n"); + pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); + pr_info("CPU#%d: status: %016llx\n", cpu, status); + pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); + pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); + if (x86_pmu.pebs_constraints) { + rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); + pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); + } + if (x86_pmu.lbr_nr) { + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); + } + } + pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); + rdmsrl(x86_pmu_event_addr(idx), pmc_count); + + prev_left = per_cpu(pmc_prev_left[idx], cpu); + + pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", + cpu, idx, pmc_ctrl); + pr_info("CPU#%d: gen-PMC%d count: %016llx\n", + cpu, idx, pmc_count); + pr_info("CPU#%d: gen-PMC%d left: %016llx\n", + cpu, idx, prev_left); + } + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); + + pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", + cpu, idx, pmc_count); + } + local_irq_restore(flags); +} + +void x86_pmu_stop(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) { + x86_pmu.disable(event); + cpuc->events[hwc->idx] = NULL; + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + x86_perf_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static void x86_pmu_del(struct perf_event *event, int flags) +{ + struct cpu_hw_events 
*cpuc = this_cpu_ptr(&cpu_hw_events); + int i; + + /* + * event is descheduled + */ + event->hw.flags &= ~PERF_X86_EVENT_COMMITTED; + + /* + * If we're called during a txn, we don't need to do anything. + * The events never got scheduled and ->cancel_txn will truncate + * the event_list. + * + * XXX assumes any ->del() called during a TXN will only be on + * an event added during that same TXN. + */ + if (cpuc->txn_flags & PERF_PMU_TXN_ADD) + return; + + /* + * Not a TXN, therefore cleanup properly. + */ + x86_pmu_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < cpuc->n_events; i++) { + if (event == cpuc->event_list[i]) + break; + } + + if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */ + return; + + /* If we have a newly added event; make sure to decrease n_added. */ + if (i >= cpuc->n_events - cpuc->n_added) + --cpuc->n_added; + + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, event); + + /* Delete the array entry. */ + while (++i < cpuc->n_events) { + cpuc->event_list[i-1] = cpuc->event_list[i]; + cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; + } + --cpuc->n_events; + + perf_event_update_userpage(event); +} + +int x86_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + struct perf_event *event; + int idx, handled = 0; + u64 val; + + cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * Some chipsets need to unmask the LVTPC in a particular spot + * inside the nmi handler. As a result, the unmasking was pushed + * into all the nmi handlers. + * + * This generic handler doesn't seem to have any issues where the + * unmasking occurs so it was left at the top. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) { + /* + * Though we deactivated the counter some cpus + * might still deliver spurious interrupts still + * in flight. Catch them: + */ + if (__test_and_clear_bit(idx, cpuc->running)) + handled++; + continue; + } + + event = cpuc->events[idx]; + + val = x86_perf_event_update(event); + if (val & (1ULL << (x86_pmu.cntval_bits - 1))) + continue; + + /* + * event overflow + */ + handled++; + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (!x86_perf_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} + +void perf_events_lapic_init(void) +{ + if (!x86_pmu.apic || !x86_pmu_initialized()) + return; + + /* + * Always use NMI for PMU + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); +} + +static int +perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) +{ + u64 start_clock; + u64 finish_clock; + int ret; + + /* + * All PMUs/events that share this PMI handler should make sure to + * increment active_events for their events. 
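The '1ULL << (cntval_bits - 1)' test in the interrupt handler above can be read as follows: a counter still counting up from its negative start value has its top implemented bit set, while one that has wrapped past zero reads back as a small number with that bit clear. A tiny user-space model with invented values:

#include <stdint.h>
#include <stdio.h>

static int overflowed(uint64_t raw, int cntval_bits)
{
	/* mirrors: if (val & (1ULL << (cntval_bits - 1))) -> no overflow yet */
	return !(raw & (1ULL << (cntval_bits - 1)));
}

int main(void)
{
	int bits = 48;
	uint64_t mask = (1ULL << bits) - 1;
	int64_t left = 100000;
	uint64_t start = (uint64_t)(-left) & mask;

	printf("halfway:  overflowed=%d\n", overflowed((start + left / 2) & mask, bits));
	printf("wrapped:  overflowed=%d\n", overflowed((start + left + 5) & mask, bits));
	return 0;
}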
+ */ + if (!atomic_read(&active_events)) + return NMI_DONE; + + start_clock = sched_clock(); + ret = x86_pmu.handle_irq(regs); + finish_clock = sched_clock(); + + perf_sample_event_took(finish_clock - start_clock); + + return ret; +} +NOKPROBE_SYMBOL(perf_event_nmi_handler); + +struct event_constraint emptyconstraint; +struct event_constraint unconstrained; + +static int +x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + int i, ret = NOTIFY_OK; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) + cpuc->kfree_on_online[i] = NULL; + if (x86_pmu.cpu_prepare) + ret = x86_pmu.cpu_prepare(cpu); + break; + + case CPU_STARTING: + if (x86_pmu.cpu_starting) + x86_pmu.cpu_starting(cpu); + break; + + case CPU_ONLINE: + for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) { + kfree(cpuc->kfree_on_online[i]); + cpuc->kfree_on_online[i] = NULL; + } + break; + + case CPU_DYING: + if (x86_pmu.cpu_dying) + x86_pmu.cpu_dying(cpu); + break; + + case CPU_UP_CANCELED: + case CPU_DEAD: + if (x86_pmu.cpu_dead) + x86_pmu.cpu_dead(cpu); + break; + + default: + break; + } + + return ret; +} + +static void __init pmu_check_apic(void) +{ + if (cpu_has_apic) + return; + + x86_pmu.apic = 0; + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); + pr_info("no hardware sampling interrupt available.\n"); + + /* + * If we have a PMU initialized but no APIC + * interrupts, we cannot sample hardware + * events (user-space has to fall back and + * sample via a hrtimer based software event): + */ + pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; + +} + +static struct attribute_group x86_pmu_format_group = { + .name = "format", + .attrs = NULL, +}; + +/* + * Remove all undefined events (x86_pmu.event_map(id) == 0) + * out of events_attr attributes. + */ +static void __init filter_events(struct attribute **attrs) +{ + struct device_attribute *d; + struct perf_pmu_events_attr *pmu_attr; + int offset = 0; + int i, j; + + for (i = 0; attrs[i]; i++) { + d = (struct device_attribute *)attrs[i]; + pmu_attr = container_of(d, struct perf_pmu_events_attr, attr); + /* str trumps id */ + if (pmu_attr->event_str) + continue; + if (x86_pmu.event_map(i + offset)) + continue; + + for (j = i; attrs[j]; j++) + attrs[j] = attrs[j + 1]; + + /* Check the shifted attr. */ + i--; + + /* + * event_map() is index based, the attrs array is organized + * by increasing event index. 
If we shift the events, then + * we need to compensate for the event_map(), otherwise + * we are looking up the wrong event in the map + */ + offset++; + } +} + +/* Merge two pointer arrays */ +__init struct attribute **merge_attr(struct attribute **a, struct attribute **b) +{ + struct attribute **new; + int j, i; + + for (j = 0; a[j]; j++) + ; + for (i = 0; b[i]; i++) + j++; + j++; + + new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL); + if (!new) + return NULL; + + j = 0; + for (i = 0; a[i]; i++) + new[j++] = a[i]; + for (i = 0; b[i]; i++) + new[j++] = b[i]; + new[j] = NULL; + + return new; +} + +ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = \ + container_of(attr, struct perf_pmu_events_attr, attr); + u64 config = x86_pmu.event_map(pmu_attr->id); + + /* string trumps id */ + if (pmu_attr->event_str) + return sprintf(page, "%s", pmu_attr->event_str); + + return x86_pmu.events_sysfs_show(page, config); +} + +EVENT_ATTR(cpu-cycles, CPU_CYCLES ); +EVENT_ATTR(instructions, INSTRUCTIONS ); +EVENT_ATTR(cache-references, CACHE_REFERENCES ); +EVENT_ATTR(cache-misses, CACHE_MISSES ); +EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); +EVENT_ATTR(branch-misses, BRANCH_MISSES ); +EVENT_ATTR(bus-cycles, BUS_CYCLES ); +EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); +EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); +EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); + +static struct attribute *empty_attrs; + +static struct attribute *events_attr[] = { + EVENT_PTR(CPU_CYCLES), + EVENT_PTR(INSTRUCTIONS), + EVENT_PTR(CACHE_REFERENCES), + EVENT_PTR(CACHE_MISSES), + EVENT_PTR(BRANCH_INSTRUCTIONS), + EVENT_PTR(BRANCH_MISSES), + EVENT_PTR(BUS_CYCLES), + EVENT_PTR(STALLED_CYCLES_FRONTEND), + EVENT_PTR(STALLED_CYCLES_BACKEND), + EVENT_PTR(REF_CPU_CYCLES), + NULL, +}; + +static struct attribute_group x86_pmu_events_group = { + .name = "events", + .attrs = events_attr, +}; + +ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) +{ + u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; + u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; + bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); + bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); + bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); + bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); + ssize_t ret; + + /* + * We have whole page size to spend and just little data + * to write, so we can safely use sprintf. 
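The i--/offset++ dance in filter_events() above is easier to see on a plain NULL-terminated string array: undefined entries are shifted out in place, while 'offset' keeps later positions indexed against their original, map-relative ids. A simplified user-space sketch with a hypothetical event map (the "str trumps id" case is left out):

#include <stdio.h>

/* hypothetical index-based event map: 0 means "not defined on this PMU" */
static const unsigned long event_map[] = { 0x3c, 0, 0x2e, 0, 0xc4 };

int main(void)
{
	const char *attrs[] = { "cpu-cycles", "cache-references", "cache-misses",
				"branch-instructions", "branch-misses", NULL };
	int offset = 0;

	for (int i = 0; attrs[i]; i++) {
		if (event_map[i + offset])
			continue;

		/* shift the rest of the array down, including the NULL */
		for (int j = i; attrs[j]; j++)
			attrs[j] = attrs[j + 1];

		i--;		/* re-check the entry that slid into slot i ... */
		offset++;	/* ... but keep indexing event_map by original id */
	}

	for (int i = 0; attrs[i]; i++)
		printf("%s\n", attrs[i]);
	return 0;
}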
+ */ + ret = sprintf(page, "event=0x%02llx", event); + + if (umask) + ret += sprintf(page + ret, ",umask=0x%02llx", umask); + + if (edge) + ret += sprintf(page + ret, ",edge"); + + if (pc) + ret += sprintf(page + ret, ",pc"); + + if (any) + ret += sprintf(page + ret, ",any"); + + if (inv) + ret += sprintf(page + ret, ",inv"); + + if (cmask) + ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); + + ret += sprintf(page + ret, "\n"); + + return ret; +} + +static int __init init_hw_perf_events(void) +{ + struct x86_pmu_quirk *quirk; + int err; + + pr_info("Performance Events: "); + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + err = intel_pmu_init(); + break; + case X86_VENDOR_AMD: + err = amd_pmu_init(); + break; + default: + err = -ENOTSUPP; + } + if (err != 0) { + pr_cont("no PMU driver, software events only.\n"); + return 0; + } + + pmu_check_apic(); + + /* sanity check that the hardware exists or is emulated */ + if (!check_hw_exists()) + return 0; + + pr_cont("%s PMU driver.\n", x86_pmu.name); + + x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ + + for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) + quirk->func(); + + if (!x86_pmu.intel_ctrl) + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + + perf_events_lapic_init(); + register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); + + unconstrained = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, + 0, x86_pmu.num_counters, 0, 0); + + x86_pmu_format_group.attrs = x86_pmu.format_attrs; + + if (x86_pmu.event_attrs) + x86_pmu_events_group.attrs = x86_pmu.event_attrs; + + if (!x86_pmu.events_sysfs_show) + x86_pmu_events_group.attrs = &empty_attrs; + else + filter_events(x86_pmu_events_group.attrs); + + if (x86_pmu.cpu_events) { + struct attribute **tmp; + + tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events); + if (!WARN_ON(!tmp)) + x86_pmu_events_group.attrs = tmp; + } + + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.cntval_bits); + pr_info("... generic registers: %d\n", x86_pmu.num_counters); + pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); + pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); + + perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); + perf_cpu_notifier(x86_pmu_notifier); + + return 0; +} +early_initcall(init_hw_perf_events); + +static inline void x86_pmu_read(struct perf_event *event) +{ + x86_perf_event_update(event); +} + +/* + * Start group events scheduling transaction + * Set the flag to make pmu::enable() not perform the + * schedulability test, it will be performed at commit time + * + * We only support PERF_PMU_TXN_ADD transactions. Save the + * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD + * transactions. + */ +static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */ + + cpuc->txn_flags = txn_flags; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + + perf_pmu_disable(pmu); + __this_cpu_write(cpu_hw_events.n_txn, 0); +} + +/* + * Stop group events scheduling transaction + * Clear the flag and pmu::enable() will perform the + * schedulability test. 
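For a concrete idea of what x86_event_sysfs_show() above emits, the following standalone snippet decodes a raw event-select value the same way, using the architectural field layout (event select bits 0-7, umask 8-15, edge bit 18, inv bit 23, cmask 24-31); the example config value is made up:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t config = 0x5301d0;	/* made-up: event 0xd0, umask 0x01, USR|OS|INT|EN */
	uint64_t event = config & 0xff;
	uint64_t umask = (config >> 8) & 0xff;
	uint64_t cmask = (config >> 24) & 0xff;
	int edge = !!(config & (1ULL << 18));
	int inv  = !!(config & (1ULL << 23));

	printf("event=0x%02llx", (unsigned long long)event);
	if (umask)
		printf(",umask=0x%02llx", (unsigned long long)umask);
	if (edge)
		printf(",edge");
	if (inv)
		printf(",inv");
	if (cmask)
		printf(",cmask=0x%02llx", (unsigned long long)cmask);
	printf("\n");
	return 0;
}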
+ */ +static void x86_pmu_cancel_txn(struct pmu *pmu) +{ + unsigned int txn_flags; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ + + txn_flags = cpuc->txn_flags; + cpuc->txn_flags = 0; + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + + /* + * Truncate collected array by the number of events added in this + * transaction. See x86_pmu_add() and x86_pmu_*_txn(). + */ + __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); + __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); + perf_pmu_enable(pmu); +} + +/* + * Commit group events scheduling transaction + * Perform the group schedulability test as a whole + * Return 0 if success + * + * Does not cancel the transaction on failure; expects the caller to do this. + */ +static int x86_pmu_commit_txn(struct pmu *pmu) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int assign[X86_PMC_IDX_MAX]; + int n, ret; + + WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ + + if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) { + cpuc->txn_flags = 0; + return 0; + } + + n = cpuc->n_events; + + if (!x86_pmu_initialized()) + return -EAGAIN; + + ret = x86_pmu.schedule_events(cpuc, n, assign); + if (ret) + return ret; + + /* + * copy new assignment, now we know it is possible + * will be used by hw_perf_enable() + */ + memcpy(cpuc->assign, assign, n*sizeof(int)); + + cpuc->txn_flags = 0; + perf_pmu_enable(pmu); + return 0; +} +/* + * a fake_cpuc is used to validate event groups. Due to + * the extra reg logic, we need to also allocate a fake + * per_core and per_cpu structure. Otherwise, group events + * using extra reg may conflict without the kernel being + * able to catch this when the last event gets added to + * the group. + */ +static void free_fake_cpuc(struct cpu_hw_events *cpuc) +{ + kfree(cpuc->shared_regs); + kfree(cpuc); +} + +static struct cpu_hw_events *allocate_fake_cpuc(void) +{ + struct cpu_hw_events *cpuc; + int cpu = raw_smp_processor_id(); + + cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL); + if (!cpuc) + return ERR_PTR(-ENOMEM); + + /* only needed, if we have extra_regs */ + if (x86_pmu.extra_regs) { + cpuc->shared_regs = allocate_shared_regs(cpu); + if (!cpuc->shared_regs) + goto error; + } + cpuc->is_fake = 1; + return cpuc; +error: + free_fake_cpuc(cpuc); + return ERR_PTR(-ENOMEM); +} + +/* + * validate that we can schedule this event + */ +static int validate_event(struct perf_event *event) +{ + struct cpu_hw_events *fake_cpuc; + struct event_constraint *c; + int ret = 0; + + fake_cpuc = allocate_fake_cpuc(); + if (IS_ERR(fake_cpuc)) + return PTR_ERR(fake_cpuc); + + c = x86_pmu.get_event_constraints(fake_cpuc, -1, event); + + if (!c || !c->weight) + ret = -EINVAL; + + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(fake_cpuc, event); + + free_fake_cpuc(fake_cpuc); + + return ret; +} + +/* + * validate a single event group + * + * validation include: + * - check events are compatible which each other + * - events do not compete for the same counter + * - number of events <= number of counters + * + * validation ensures the group can be loaded onto the + * PMU if it was the only group available. 
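The start/commit/cancel hooks above are driven by the generic perf core roughly as sketched below when a whole group is scheduled in: open a transaction, add every member, then commit; any failure makes the caller cancel. This is a schematic model with invented stand-ins, not the real perf core.

#include <stdio.h>

/* Invented minimal stand-ins for the callbacks used when scheduling a group. */
static int scheduled;			/* events accepted so far           */
static int capacity = 2;		/* pretend the PMU has two counters */

static void start_txn(void)  { printf("start txn\n"); }
static void cancel_txn(void) { printf("cancel txn\n"); scheduled = 0; }
static int  add(int ev)      { printf("add event %d\n", ev); scheduled++; return 0; }
static int  commit_txn(void) { return scheduled > capacity ? -1 : 0; }

/* Roughly how the core drives the pmu txn hooks for a whole group. */
static int group_sched_in(const int *events, int n)
{
	start_txn();
	for (int i = 0; i < n; i++) {
		if (add(events[i])) {
			cancel_txn();
			return -1;
		}
	}
	if (commit_txn()) {		/* schedulability checked as a whole */
		cancel_txn();
		return -1;
	}
	scheduled = 0;
	return 0;
}

int main(void)
{
	int small[] = { 1, 2 }, big[] = { 1, 2, 3 };

	printf("group of 2: %s\n", group_sched_in(small, 2) ? "rejected" : "ok");
	printf("group of 3: %s\n", group_sched_in(big, 3) ? "rejected" : "ok");
	return 0;
}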
+ */ +static int validate_group(struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct cpu_hw_events *fake_cpuc; + int ret = -EINVAL, n; + + fake_cpuc = allocate_fake_cpuc(); + if (IS_ERR(fake_cpuc)) + return PTR_ERR(fake_cpuc); + /* + * the event is not yet connected with its + * siblings therefore we must first collect + * existing siblings, then add the new event + * before we can simulate the scheduling + */ + n = collect_events(fake_cpuc, leader, true); + if (n < 0) + goto out; + + fake_cpuc->n_events = n; + n = collect_events(fake_cpuc, event, false); + if (n < 0) + goto out; + + fake_cpuc->n_events = n; + + ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); + +out: + free_fake_cpuc(fake_cpuc); + return ret; +} + +static int x86_pmu_event_init(struct perf_event *event) +{ + struct pmu *tmp; + int err; + + switch (event->attr.type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + break; + + default: + return -ENOENT; + } + + err = __x86_pmu_event_init(event); + if (!err) { + /* + * we temporarily connect event to its pmu + * such that validate_group() can classify + * it as an x86 event using is_x86_event() + */ + tmp = event->pmu; + event->pmu = &pmu; + + if (event->group_leader != event) + err = validate_group(event); + else + err = validate_event(event); + + event->pmu = tmp; + } + if (err) { + if (event->destroy) + event->destroy(event); + } + + if (ACCESS_ONCE(x86_pmu.attr_rdpmc)) + event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; + + return err; +} + +static void refresh_pce(void *ignored) +{ + if (current->mm) + load_mm_cr4(current->mm); +} + +static void x86_pmu_event_mapped(struct perf_event *event) +{ + if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + return; + + if (atomic_inc_return(¤t->mm->context.perf_rdpmc_allowed) == 1) + on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); +} + +static void x86_pmu_event_unmapped(struct perf_event *event) +{ + if (!current->mm) + return; + + if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + return; + + if (atomic_dec_and_test(¤t->mm->context.perf_rdpmc_allowed)) + on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); +} + +static int x86_pmu_event_idx(struct perf_event *event) +{ + int idx = event->hw.idx; + + if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + return 0; + + if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { + idx -= INTEL_PMC_IDX_FIXED; + idx |= 1 << 30; + } + + return idx + 1; +} + +static ssize_t get_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); +} + +static ssize_t set_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + if (val > 2) + return -EINVAL; + + if (x86_pmu.attr_rdpmc_broken) + return -ENOTSUPP; + + if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) { + /* + * Changing into or out of always available, aka + * perf-event-bypassing mode. This path is extremely slow, + * but only root can trigger it, so it's okay. 
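The +1 / bit-30 encoding above is what user space sees in the mmap'ed perf_event_mmap_page::index field: zero means RDPMC is not permitted, anything else is the hardware counter number plus one. A self-monitoring read then looks roughly like the sketch below (error handling is minimal and the sign-extension to pmc_width done by the full userpage protocol is omitted for brevity):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static inline uint64_t rdpmc(uint32_t counter)
{
	uint32_t lo, hi;

	asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
	return (uint64_t)hi << 32 | lo;
}

/* Read a self-monitored counter via the mmap'ed page, cf. x86_pmu_event_idx(). */
static uint64_t read_self(volatile struct perf_event_mmap_page *pc)
{
	uint64_t count;
	uint32_t seq, idx;

	do {
		seq = pc->lock;
		__sync_synchronize();

		idx = pc->index;		/* 0 => rdpmc not permitted */
		count = pc->offset;
		if (idx)
			count += rdpmc(idx - 1);

		__sync_synchronize();
	} while (pc->lock != seq);

	return count;
}

int main(void)
{
	struct perf_event_attr attr;
	volatile struct perf_event_mmap_page *pc;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_HARDWARE;
	attr.size = sizeof(attr);
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.exclude_kernel = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	pc = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ, MAP_SHARED, fd, 0);
	if (pc == MAP_FAILED)
		return 1;

	uint64_t before = read_self(pc);
	for (volatile int i = 0; i < 1000000; i++)
		;
	uint64_t after = read_self(pc);

	printf("index=%u instructions=%llu\n", pc->index,
	       (unsigned long long)(after - before));
	return 0;
}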
+ */ + if (val == 2) + static_key_slow_inc(&rdpmc_always_available); + else + static_key_slow_dec(&rdpmc_always_available); + on_each_cpu(refresh_pce, NULL, 1); + } + + x86_pmu.attr_rdpmc = val; + + return count; +} + +static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); + +static struct attribute *x86_pmu_attrs[] = { + &dev_attr_rdpmc.attr, + NULL, +}; + +static struct attribute_group x86_pmu_attr_group = { + .attrs = x86_pmu_attrs, +}; + +static const struct attribute_group *x86_pmu_attr_groups[] = { + &x86_pmu_attr_group, + &x86_pmu_format_group, + &x86_pmu_events_group, + NULL, +}; + +static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + if (x86_pmu.sched_task) + x86_pmu.sched_task(ctx, sched_in); +} + +void perf_check_microcode(void) +{ + if (x86_pmu.check_microcode) + x86_pmu.check_microcode(); +} +EXPORT_SYMBOL_GPL(perf_check_microcode); + +static struct pmu pmu = { + .pmu_enable = x86_pmu_enable, + .pmu_disable = x86_pmu_disable, + + .attr_groups = x86_pmu_attr_groups, + + .event_init = x86_pmu_event_init, + + .event_mapped = x86_pmu_event_mapped, + .event_unmapped = x86_pmu_event_unmapped, + + .add = x86_pmu_add, + .del = x86_pmu_del, + .start = x86_pmu_start, + .stop = x86_pmu_stop, + .read = x86_pmu_read, + + .start_txn = x86_pmu_start_txn, + .cancel_txn = x86_pmu_cancel_txn, + .commit_txn = x86_pmu_commit_txn, + + .event_idx = x86_pmu_event_idx, + .sched_task = x86_pmu_sched_task, + .task_ctx_size = sizeof(struct x86_perf_task_context), +}; + +void arch_perf_update_userpage(struct perf_event *event, + struct perf_event_mmap_page *userpg, u64 now) +{ + struct cyc2ns_data *data; + + userpg->cap_user_time = 0; + userpg->cap_user_time_zero = 0; + userpg->cap_user_rdpmc = + !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED); + userpg->pmc_width = x86_pmu.cntval_bits; + + if (!sched_clock_stable()) + return; + + data = cyc2ns_read_begin(); + + /* + * Internal timekeeping for enabled/running/stopped times + * is always in the local_clock domain. + */ + userpg->cap_user_time = 1; + userpg->time_mult = data->cyc2ns_mul; + userpg->time_shift = data->cyc2ns_shift; + userpg->time_offset = data->cyc2ns_offset - now; + + /* + * cap_user_time_zero doesn't make sense when we're using a different + * time base for the records. 
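When cap_user_time_zero is set as above, user space can convert a raw TSC value to the perf clock entirely from the scale factors published in the mmap page; splitting the cycle count into quotient and remainder keeps the 64-bit multiply from overflowing. A sketch following that userpage convention (assumes the page is already mapped, e.g. as in the previous snippet, and that the capability bits have been checked):

#include <stdint.h>
#include <linux/perf_event.h>

static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;

	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
	return (uint64_t)hi << 32 | lo;
}

/* TSC -> perf time, using time_zero/time_mult/time_shift from the userpage. */
static uint64_t tsc_to_perf_time(const volatile struct perf_event_mmap_page *pc,
				 uint64_t cyc)
{
	uint64_t quot = cyc >> pc->time_shift;
	uint64_t rem  = cyc & (((uint64_t)1 << pc->time_shift) - 1);

	return pc->time_zero + quot * pc->time_mult +
	       ((rem * pc->time_mult) >> pc->time_shift);
}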
+ */ + if (event->clock == &local_clock) { + userpg->cap_user_time_zero = 1; + userpg->time_zero = data->cyc2ns_offset; + } + + cyc2ns_read_end(data); +} + +/* + * callchain support + */ + +static int backtrace_stack(void *data, char *name) +{ + return 0; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ + struct perf_callchain_entry *entry = data; + + perf_callchain_store(entry, addr); +} + +static const struct stacktrace_ops backtrace_ops = { + .stack = backtrace_stack, + .address = backtrace_address, + .walk_stack = print_context_stack_bp, +}; + +void +perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) +{ + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return; + } + + perf_callchain_store(entry, regs->ip); + + dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); +} + +static inline int +valid_user_frame(const void __user *fp, unsigned long size) +{ + return (__range_not_ok(fp, size, TASK_SIZE) == 0); +} + +static unsigned long get_segment_base(unsigned int segment) +{ + struct desc_struct *desc; + int idx = segment >> 3; + + if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { +#ifdef CONFIG_MODIFY_LDT_SYSCALL + struct ldt_struct *ldt; + + if (idx > LDT_ENTRIES) + return 0; + + /* IRQs are off, so this synchronizes with smp_store_release */ + ldt = lockless_dereference(current->active_mm->context.ldt); + if (!ldt || idx > ldt->size) + return 0; + + desc = &ldt->entries[idx]; +#else + return 0; +#endif + } else { + if (idx > GDT_ENTRIES) + return 0; + + desc = raw_cpu_ptr(gdt_page.gdt) + idx; + } + + return get_desc_base(desc); +} + +#ifdef CONFIG_IA32_EMULATION + +#include + +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + /* 32-bit process in 64-bit kernel. */ + unsigned long ss_base, cs_base; + struct stack_frame_ia32 frame; + const void __user *fp; + + if (!test_thread_flag(TIF_IA32)) + return 0; + + cs_base = get_segment_base(regs->cs); + ss_base = get_segment_base(regs->ss); + + fp = compat_ptr(ss_base + regs->bp); + pagefault_disable(); + while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; + frame.next_frame = 0; + frame.return_address = 0; + + if (!access_ok(VERIFY_READ, fp, 8)) + break; + + bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4); + if (bytes != 0) + break; + bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4); + if (bytes != 0) + break; + + if (!valid_user_frame(fp, sizeof(frame))) + break; + + perf_callchain_store(entry, cs_base + frame.return_address); + fp = compat_ptr(ss_base + frame.next_frame); + } + pagefault_enable(); + return 1; +} +#else +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + return 0; +} +#endif + +void +perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) +{ + struct stack_frame frame; + const void __user *fp; + + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return; + } + + /* + * We don't know what to do with VM86 stacks.. ignore them for now. 
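The compat unwinder above simply chases saved frame pointers: each frame begins with the caller's frame pointer followed by the return address. The same walk can be demonstrated on a program's own stack when it is built with frame pointers (gcc -O0 or -fno-omit-frame-pointer); this is purely illustrative and makes the usual no-tail-call assumptions:

#include <stdint.h>
#include <stdio.h>

struct stack_frame {
	struct stack_frame *next_frame;
	uint64_t return_address;
};

static void __attribute__((noinline)) show_callchain(void)
{
	struct stack_frame *fp = __builtin_frame_address(0);
	int depth = 0;

	while (fp && depth < 16) {
		printf("#%d  %p\n", depth++, (void *)fp->return_address);
		/* caller frames live at higher addresses; stop on anything else */
		if (fp->next_frame <= fp)
			break;
		fp = fp->next_frame;
	}
}

static void __attribute__((noinline)) b(void) { show_callchain(); }
static void __attribute__((noinline)) a(void) { b(); }

int main(void)
{
	a();
	return 0;
}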
+ */ + if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) + return; + + fp = (void __user *)regs->bp; + + perf_callchain_store(entry, regs->ip); + + if (!current->mm) + return; + + if (perf_callchain_user32(regs, entry)) + return; + + pagefault_disable(); + while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; + frame.next_frame = NULL; + frame.return_address = 0; + + if (!access_ok(VERIFY_READ, fp, 16)) + break; + + bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8); + if (bytes != 0) + break; + bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8); + if (bytes != 0) + break; + + if (!valid_user_frame(fp, sizeof(frame))) + break; + + perf_callchain_store(entry, frame.return_address); + fp = (void __user *)frame.next_frame; + } + pagefault_enable(); +} + +/* + * Deal with code segment offsets for the various execution modes: + * + * VM86 - the good olde 16 bit days, where the linear address is + * 20 bits and we use regs->ip + 0x10 * regs->cs. + * + * IA32 - Where we need to look at GDT/LDT segment descriptor tables + * to figure out what the 32bit base address is. + * + * X32 - has TIF_X32 set, but is running in x86_64 + * + * X86_64 - CS,DS,SS,ES are all zero based. + */ +static unsigned long code_segment_base(struct pt_regs *regs) +{ + /* + * For IA32 we look at the GDT/LDT segment base to convert the + * effective IP to a linear address. + */ + +#ifdef CONFIG_X86_32 + /* + * If we are in VM86 mode, add the segment offset to convert to a + * linear address. + */ + if (regs->flags & X86_VM_MASK) + return 0x10 * regs->cs; + + if (user_mode(regs) && regs->cs != __USER_CS) + return get_segment_base(regs->cs); +#else + if (user_mode(regs) && !user_64bit_mode(regs) && + regs->cs != __USER32_CS) + return get_segment_base(regs->cs); +#endif + return 0; +} + +unsigned long perf_instruction_pointer(struct pt_regs *regs) +{ + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) + return perf_guest_cbs->get_guest_ip(); + + return regs->ip + code_segment_base(regs); +} + +unsigned long perf_misc_flags(struct pt_regs *regs) +{ + int misc = 0; + + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (perf_guest_cbs->is_user_mode()) + misc |= PERF_RECORD_MISC_GUEST_USER; + else + misc |= PERF_RECORD_MISC_GUEST_KERNEL; + } else { + if (user_mode(regs)) + misc |= PERF_RECORD_MISC_USER; + else + misc |= PERF_RECORD_MISC_KERNEL; + } + + if (regs->flags & PERF_EFLAGS_EXACT) + misc |= PERF_RECORD_MISC_EXACT_IP; + + return misc; +} + +void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) +{ + cap->version = x86_pmu.version; + cap->num_counters_gp = x86_pmu.num_counters; + cap->num_counters_fixed = x86_pmu.num_counters_fixed; + cap->bit_width_gp = x86_pmu.cntval_bits; + cap->bit_width_fixed = x86_pmu.cntval_bits; + cap->events_mask = (unsigned int)x86_pmu.events_maskl; + cap->events_mask_len = x86_pmu.events_mask_len; +} +EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 5803130..77000d5 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -30,8 +30,6 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_PERF_EVENTS) += perf_event.o - ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o perf_event_amd_uncore.o ifdef CONFIG_AMD_IOMMU diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c deleted file mode 100644 index 
7402c818..0000000 --- a/arch/x86/kernel/cpu/perf_event.c +++ /dev/null @@ -1,2429 +0,0 @@ -/* - * Performance events x86 architecture code - * - * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2009 Jaswinder Singh Rajput - * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra - * Copyright (C) 2009 Intel Corporation, - * Copyright (C) 2009 Google, Inc., Stephane Eranian - * - * For licencing details see kernel-base/COPYING - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "perf_event.h" - -struct x86_pmu x86_pmu __read_mostly; - -DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { - .enabled = 1, -}; - -struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE; - -u64 __read_mostly hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; -u64 __read_mostly hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; - -/* - * Propagate event elapsed time into the generic event. - * Can only be executed on the CPU where the event is active. - * Returns the delta events processed. - */ -u64 x86_perf_event_update(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - int shift = 64 - x86_pmu.cntval_bits; - u64 prev_raw_count, new_raw_count; - int idx = hwc->idx; - s64 delta; - - if (idx == INTEL_PMC_IDX_FIXED_BTS) - return 0; - - /* - * Careful: an NMI might modify the previous event value. - * - * Our tactic to handle this is to first atomically read and - * exchange a new raw count - then add that new-prev delta - * count to the generic event atomically: - */ -again: - prev_raw_count = local64_read(&hwc->prev_count); - rdpmcl(hwc->event_base_rdpmc, new_raw_count); - - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) - goto again; - - /* - * Now we have the new raw value and have updated the prev - * timestamp already. We can now calculate the elapsed delta - * (event-)time and add that to the generic event. - * - * Careful, not all hw sign-extends above the physical width - * of the count. - */ - delta = (new_raw_count << shift) - (prev_raw_count << shift); - delta >>= shift; - - local64_add(delta, &event->count); - local64_sub(delta, &hwc->period_left); - - return new_raw_count; -} - -/* - * Find and validate any extra registers to set up. 
- */ -static int x86_pmu_extra_regs(u64 config, struct perf_event *event) -{ - struct hw_perf_event_extra *reg; - struct extra_reg *er; - - reg = &event->hw.extra_reg; - - if (!x86_pmu.extra_regs) - return 0; - - for (er = x86_pmu.extra_regs; er->msr; er++) { - if (er->event != (config & er->config_mask)) - continue; - if (event->attr.config1 & ~er->valid_mask) - return -EINVAL; - /* Check if the extra msrs can be safely accessed*/ - if (!er->extra_msr_access) - return -ENXIO; - - reg->idx = er->idx; - reg->config = event->attr.config1; - reg->reg = er->msr; - break; - } - return 0; -} - -static atomic_t active_events; -static atomic_t pmc_refcount; -static DEFINE_MUTEX(pmc_reserve_mutex); - -#ifdef CONFIG_X86_LOCAL_APIC - -static bool reserve_pmc_hardware(void) -{ - int i; - - for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) - goto perfctr_fail; - } - - for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) - goto eventsel_fail; - } - - return true; - -eventsel_fail: - for (i--; i >= 0; i--) - release_evntsel_nmi(x86_pmu_config_addr(i)); - - i = x86_pmu.num_counters; - -perfctr_fail: - for (i--; i >= 0; i--) - release_perfctr_nmi(x86_pmu_event_addr(i)); - - return false; -} - -static void release_pmc_hardware(void) -{ - int i; - - for (i = 0; i < x86_pmu.num_counters; i++) { - release_perfctr_nmi(x86_pmu_event_addr(i)); - release_evntsel_nmi(x86_pmu_config_addr(i)); - } -} - -#else - -static bool reserve_pmc_hardware(void) { return true; } -static void release_pmc_hardware(void) {} - -#endif - -static bool check_hw_exists(void) -{ - u64 val, val_fail, val_new= ~0; - int i, reg, reg_fail, ret = 0; - int bios_fail = 0; - int reg_safe = -1; - - /* - * Check to see if the BIOS enabled any of the counters, if so - * complain and bail. - */ - for (i = 0; i < x86_pmu.num_counters; i++) { - reg = x86_pmu_config_addr(i); - ret = rdmsrl_safe(reg, &val); - if (ret) - goto msr_fail; - if (val & ARCH_PERFMON_EVENTSEL_ENABLE) { - bios_fail = 1; - val_fail = val; - reg_fail = reg; - } else { - reg_safe = i; - } - } - - if (x86_pmu.num_counters_fixed) { - reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - ret = rdmsrl_safe(reg, &val); - if (ret) - goto msr_fail; - for (i = 0; i < x86_pmu.num_counters_fixed; i++) { - if (val & (0x03 << i*4)) { - bios_fail = 1; - val_fail = val; - reg_fail = reg; - } - } - } - - /* - * If all the counters are enabled, the below test will always - * fail. The tools will also become useless in this scenario. - * Just fail and disable the hardware counters. - */ - - if (reg_safe == -1) { - reg = reg_safe; - goto msr_fail; - } - - /* - * Read the current value, change it and read it back to see if it - * matches, this is needed to detect certain hardware emulators - * (qemu/kvm) that don't trap on the MSR access and always return 0s. 
- */ - reg = x86_pmu_event_addr(reg_safe); - if (rdmsrl_safe(reg, &val)) - goto msr_fail; - val ^= 0xffffUL; - ret = wrmsrl_safe(reg, val); - ret |= rdmsrl_safe(reg, &val_new); - if (ret || val != val_new) - goto msr_fail; - - /* - * We still allow the PMU driver to operate: - */ - if (bios_fail) { - pr_cont("Broken BIOS detected, complain to your hardware vendor.\n"); - pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", - reg_fail, val_fail); - } - - return true; - -msr_fail: - pr_cont("Broken PMU hardware detected, using software events only.\n"); - pr_info("%sFailed to access perfctr msr (MSR %x is %Lx)\n", - boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR, - reg, val_new); - - return false; -} - -static void hw_perf_event_destroy(struct perf_event *event) -{ - x86_release_hardware(); - atomic_dec(&active_events); -} - -void hw_perf_lbr_event_destroy(struct perf_event *event) -{ - hw_perf_event_destroy(event); - - /* undo the lbr/bts event accounting */ - x86_del_exclusive(x86_lbr_exclusive_lbr); -} - -static inline int x86_pmu_initialized(void) -{ - return x86_pmu.handle_irq != NULL; -} - -static inline int -set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) -{ - struct perf_event_attr *attr = &event->attr; - unsigned int cache_type, cache_op, cache_result; - u64 config, val; - - config = attr->config; - - cache_type = (config >> 0) & 0xff; - if (cache_type >= PERF_COUNT_HW_CACHE_MAX) - return -EINVAL; - - cache_op = (config >> 8) & 0xff; - if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) - return -EINVAL; - - cache_result = (config >> 16) & 0xff; - if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) - return -EINVAL; - - val = hw_cache_event_ids[cache_type][cache_op][cache_result]; - - if (val == 0) - return -ENOENT; - - if (val == -1) - return -EINVAL; - - hwc->config |= val; - attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result]; - return x86_pmu_extra_regs(val, event); -} - -int x86_reserve_hardware(void) -{ - int err = 0; - - if (!atomic_inc_not_zero(&pmc_refcount)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&pmc_refcount) == 0) { - if (!reserve_pmc_hardware()) - err = -EBUSY; - else - reserve_ds_buffers(); - } - if (!err) - atomic_inc(&pmc_refcount); - mutex_unlock(&pmc_reserve_mutex); - } - - return err; -} - -void x86_release_hardware(void) -{ - if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { - release_pmc_hardware(); - release_ds_buffers(); - mutex_unlock(&pmc_reserve_mutex); - } -} - -/* - * Check if we can create event of a certain type (that no conflicting events - * are present). 
- */ -int x86_add_exclusive(unsigned int what) -{ - int i; - - if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { - mutex_lock(&pmc_reserve_mutex); - for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) { - if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i])) - goto fail_unlock; - } - atomic_inc(&x86_pmu.lbr_exclusive[what]); - mutex_unlock(&pmc_reserve_mutex); - } - - atomic_inc(&active_events); - return 0; - -fail_unlock: - mutex_unlock(&pmc_reserve_mutex); - return -EBUSY; -} - -void x86_del_exclusive(unsigned int what) -{ - atomic_dec(&x86_pmu.lbr_exclusive[what]); - atomic_dec(&active_events); -} - -int x86_setup_perfctr(struct perf_event *event) -{ - struct perf_event_attr *attr = &event->attr; - struct hw_perf_event *hwc = &event->hw; - u64 config; - - if (!is_sampling_event(event)) { - hwc->sample_period = x86_pmu.max_period; - hwc->last_period = hwc->sample_period; - local64_set(&hwc->period_left, hwc->sample_period); - } - - if (attr->type == PERF_TYPE_RAW) - return x86_pmu_extra_regs(event->attr.config, event); - - if (attr->type == PERF_TYPE_HW_CACHE) - return set_ext_hw_attr(hwc, event); - - if (attr->config >= x86_pmu.max_events) - return -EINVAL; - - /* - * The generic map: - */ - config = x86_pmu.event_map(attr->config); - - if (config == 0) - return -ENOENT; - - if (config == -1LL) - return -EINVAL; - - /* - * Branch tracing: - */ - if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && - !attr->freq && hwc->sample_period == 1) { - /* BTS is not supported by this architecture. */ - if (!x86_pmu.bts_active) - return -EOPNOTSUPP; - - /* BTS is currently only allowed for user-mode. */ - if (!attr->exclude_kernel) - return -EOPNOTSUPP; - - /* disallow bts if conflicting events are present */ - if (x86_add_exclusive(x86_lbr_exclusive_lbr)) - return -EBUSY; - - event->destroy = hw_perf_lbr_event_destroy; - } - - hwc->config |= config; - - return 0; -} - -/* - * check that branch_sample_type is compatible with - * settings needed for precise_ip > 1 which implies - * using the LBR to capture ALL taken branches at the - * priv levels of the measurement - */ -static inline int precise_br_compat(struct perf_event *event) -{ - u64 m = event->attr.branch_sample_type; - u64 b = 0; - - /* must capture all branches */ - if (!(m & PERF_SAMPLE_BRANCH_ANY)) - return 0; - - m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; - - if (!event->attr.exclude_user) - b |= PERF_SAMPLE_BRANCH_USER; - - if (!event->attr.exclude_kernel) - b |= PERF_SAMPLE_BRANCH_KERNEL; - - /* - * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 - */ - - return m == b; -} - -int x86_pmu_hw_config(struct perf_event *event) -{ - if (event->attr.precise_ip) { - int precise = 0; - - /* Support for constant skid */ - if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { - precise++; - - /* Support for IP fixup */ - if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2) - precise++; - - if (x86_pmu.pebs_prec_dist) - precise++; - } - - if (event->attr.precise_ip > precise) - return -EOPNOTSUPP; - } - /* - * check that PEBS LBR correction does not conflict with - * whatever the user is asking with attr->branch_sample_type - */ - if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) { - u64 *br_type = &event->attr.branch_sample_type; - - if (has_branch_stack(event)) { - if (!precise_br_compat(event)) - return -EOPNOTSUPP; - - /* branch_sample_type is compatible */ - - } else { - /* - * user did not specify branch_sample_type - * - * For PEBS fixups, we capture all - * the branches at the 
priv level of the - * event. - */ - *br_type = PERF_SAMPLE_BRANCH_ANY; - - if (!event->attr.exclude_user) - *br_type |= PERF_SAMPLE_BRANCH_USER; - - if (!event->attr.exclude_kernel) - *br_type |= PERF_SAMPLE_BRANCH_KERNEL; - } - } - - if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK) - event->attach_state |= PERF_ATTACH_TASK_DATA; - - /* - * Generate PMC IRQs: - * (keep 'enabled' bit clear for now) - */ - event->hw.config = ARCH_PERFMON_EVENTSEL_INT; - - /* - * Count user and OS events unless requested not to - */ - if (!event->attr.exclude_user) - event->hw.config |= ARCH_PERFMON_EVENTSEL_USR; - if (!event->attr.exclude_kernel) - event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; - - if (event->attr.type == PERF_TYPE_RAW) - event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; - - if (event->attr.sample_period && x86_pmu.limit_period) { - if (x86_pmu.limit_period(event, event->attr.sample_period) > - event->attr.sample_period) - return -EINVAL; - } - - return x86_setup_perfctr(event); -} - -/* - * Setup the hardware configuration for a given attr_type - */ -static int __x86_pmu_event_init(struct perf_event *event) -{ - int err; - - if (!x86_pmu_initialized()) - return -ENODEV; - - err = x86_reserve_hardware(); - if (err) - return err; - - atomic_inc(&active_events); - event->destroy = hw_perf_event_destroy; - - event->hw.idx = -1; - event->hw.last_cpu = -1; - event->hw.last_tag = ~0ULL; - - /* mark unused */ - event->hw.extra_reg.idx = EXTRA_REG_NONE; - event->hw.branch_reg.idx = EXTRA_REG_NONE; - - return x86_pmu.hw_config(event); -} - -void x86_pmu_disable_all(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int idx; - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - u64 val; - - if (!test_bit(idx, cpuc->active_mask)) - continue; - rdmsrl(x86_pmu_config_addr(idx), val); - if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) - continue; - val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(x86_pmu_config_addr(idx), val); - } -} - -static void x86_pmu_disable(struct pmu *pmu) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (!x86_pmu_initialized()) - return; - - if (!cpuc->enabled) - return; - - cpuc->n_added = 0; - cpuc->enabled = 0; - barrier(); - - x86_pmu.disable_all(); -} - -void x86_pmu_enable_all(int added) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int idx; - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct hw_perf_event *hwc = &cpuc->events[idx]->hw; - - if (!test_bit(idx, cpuc->active_mask)) - continue; - - __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); - } -} - -static struct pmu pmu; - -static inline int is_x86_event(struct perf_event *event) -{ - return event->pmu == &pmu; -} - -/* - * Event scheduler state: - * - * Assign events iterating over all events and counters, beginning - * with events with least weights first. Keep the current iterator - * state in struct sched_state. - */ -struct sched_state { - int weight; - int event; /* event index */ - int counter; /* counter index */ - int unassigned; /* number of events to be assigned left */ - int nr_gp; /* number of GP counters used */ - unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; -}; - -/* Total max is X86_PMC_IDX_MAX, but we are O(n!) 
limited */ -#define SCHED_STATES_MAX 2 - -struct perf_sched { - int max_weight; - int max_events; - int max_gp; - int saved_states; - struct event_constraint **constraints; - struct sched_state state; - struct sched_state saved[SCHED_STATES_MAX]; -}; - -/* - * Initialize interator that runs through all events and counters. - */ -static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, - int num, int wmin, int wmax, int gpmax) -{ - int idx; - - memset(sched, 0, sizeof(*sched)); - sched->max_events = num; - sched->max_weight = wmax; - sched->max_gp = gpmax; - sched->constraints = constraints; - - for (idx = 0; idx < num; idx++) { - if (constraints[idx]->weight == wmin) - break; - } - - sched->state.event = idx; /* start with min weight */ - sched->state.weight = wmin; - sched->state.unassigned = num; -} - -static void perf_sched_save_state(struct perf_sched *sched) -{ - if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) - return; - - sched->saved[sched->saved_states] = sched->state; - sched->saved_states++; -} - -static bool perf_sched_restore_state(struct perf_sched *sched) -{ - if (!sched->saved_states) - return false; - - sched->saved_states--; - sched->state = sched->saved[sched->saved_states]; - - /* continue with next counter: */ - clear_bit(sched->state.counter++, sched->state.used); - - return true; -} - -/* - * Select a counter for the current event to schedule. Return true on - * success. - */ -static bool __perf_sched_find_counter(struct perf_sched *sched) -{ - struct event_constraint *c; - int idx; - - if (!sched->state.unassigned) - return false; - - if (sched->state.event >= sched->max_events) - return false; - - c = sched->constraints[sched->state.event]; - /* Prefer fixed purpose counters */ - if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { - idx = INTEL_PMC_IDX_FIXED; - for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { - if (!__test_and_set_bit(idx, sched->state.used)) - goto done; - } - } - - /* Grab the first unused counter starting with idx */ - idx = sched->state.counter; - for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { - if (!__test_and_set_bit(idx, sched->state.used)) { - if (sched->state.nr_gp++ >= sched->max_gp) - return false; - - goto done; - } - } - - return false; - -done: - sched->state.counter = idx; - - if (c->overlap) - perf_sched_save_state(sched); - - return true; -} - -static bool perf_sched_find_counter(struct perf_sched *sched) -{ - while (!__perf_sched_find_counter(sched)) { - if (!perf_sched_restore_state(sched)) - return false; - } - - return true; -} - -/* - * Go through all unassigned events and find the next one to schedule. - * Take events with the least weight first. Return true on success. - */ -static bool perf_sched_next_event(struct perf_sched *sched) -{ - struct event_constraint *c; - - if (!sched->state.unassigned || !--sched->state.unassigned) - return false; - - do { - /* next event */ - sched->state.event++; - if (sched->state.event >= sched->max_events) { - /* next weight */ - sched->state.event = 0; - sched->state.weight++; - if (sched->state.weight > sched->max_weight) - return false; - } - c = sched->constraints[sched->state.event]; - } while (c->weight != sched->state.weight); - - sched->state.counter = 0; /* start with first counter */ - - return true; -} - -/* - * Assign a counter for each event. 
- */ -int perf_assign_events(struct event_constraint **constraints, int n, - int wmin, int wmax, int gpmax, int *assign) -{ - struct perf_sched sched; - - perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax); - - do { - if (!perf_sched_find_counter(&sched)) - break; /* failed */ - if (assign) - assign[sched.state.event] = sched.state.counter; - } while (perf_sched_next_event(&sched)); - - return sched.state.unassigned; -} -EXPORT_SYMBOL_GPL(perf_assign_events); - -int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) -{ - struct event_constraint *c; - unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - struct perf_event *e; - int i, wmin, wmax, unsched = 0; - struct hw_perf_event *hwc; - - bitmap_zero(used_mask, X86_PMC_IDX_MAX); - - if (x86_pmu.start_scheduling) - x86_pmu.start_scheduling(cpuc); - - for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { - cpuc->event_constraint[i] = NULL; - c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); - cpuc->event_constraint[i] = c; - - wmin = min(wmin, c->weight); - wmax = max(wmax, c->weight); - } - - /* - * fastpath, try to reuse previous register - */ - for (i = 0; i < n; i++) { - hwc = &cpuc->event_list[i]->hw; - c = cpuc->event_constraint[i]; - - /* never assigned */ - if (hwc->idx == -1) - break; - - /* constraint still honored */ - if (!test_bit(hwc->idx, c->idxmsk)) - break; - - /* not already used */ - if (test_bit(hwc->idx, used_mask)) - break; - - __set_bit(hwc->idx, used_mask); - if (assign) - assign[i] = hwc->idx; - } - - /* slow path */ - if (i != n) { - int gpmax = x86_pmu.num_counters; - - /* - * Do not allow scheduling of more than half the available - * generic counters. - * - * This helps avoid counter starvation of sibling thread by - * ensuring at most half the counters cannot be in exclusive - * mode. There is no designated counters for the limits. Any - * N/2 counters can be used. This helps with events with - * specific counter constraints. - */ - if (is_ht_workaround_enabled() && !cpuc->is_fake && - READ_ONCE(cpuc->excl_cntrs->exclusive_present)) - gpmax /= 2; - - unsched = perf_assign_events(cpuc->event_constraint, n, wmin, - wmax, gpmax, assign); - } - - /* - * In case of success (unsched = 0), mark events as committed, - * so we do not put_constraint() in case new events are added - * and fail to be scheduled - * - * We invoke the lower level commit callback to lock the resource - * - * We do not need to do all of this in case we are called to - * validate an event group (assign == NULL) - */ - if (!unsched && assign) { - for (i = 0; i < n; i++) { - e = cpuc->event_list[i]; - e->hw.flags |= PERF_X86_EVENT_COMMITTED; - if (x86_pmu.commit_scheduling) - x86_pmu.commit_scheduling(cpuc, i, assign[i]); - } - } else { - for (i = 0; i < n; i++) { - e = cpuc->event_list[i]; - /* - * do not put_constraint() on comitted events, - * because they are good to go - */ - if ((e->hw.flags & PERF_X86_EVENT_COMMITTED)) - continue; - - /* - * release events that failed scheduling - */ - if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(cpuc, e); - } - } - - if (x86_pmu.stop_scheduling) - x86_pmu.stop_scheduling(cpuc); - - return unsched ? 
-EINVAL : 0; -} - -/* - * dogrp: true if must collect siblings events (group) - * returns total number of events and error code - */ -static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) -{ - struct perf_event *event; - int n, max_count; - - max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed; - - /* current number of events already accepted */ - n = cpuc->n_events; - - if (is_x86_event(leader)) { - if (n >= max_count) - return -EINVAL; - cpuc->event_list[n] = leader; - n++; - } - if (!dogrp) - return n; - - list_for_each_entry(event, &leader->sibling_list, group_entry) { - if (!is_x86_event(event) || - event->state <= PERF_EVENT_STATE_OFF) - continue; - - if (n >= max_count) - return -EINVAL; - - cpuc->event_list[n] = event; - n++; - } - return n; -} - -static inline void x86_assign_hw_event(struct perf_event *event, - struct cpu_hw_events *cpuc, int i) -{ - struct hw_perf_event *hwc = &event->hw; - - hwc->idx = cpuc->assign[i]; - hwc->last_cpu = smp_processor_id(); - hwc->last_tag = ++cpuc->tags[i]; - - if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) { - hwc->config_base = 0; - hwc->event_base = 0; - } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) { - hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); - hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30; - } else { - hwc->config_base = x86_pmu_config_addr(hwc->idx); - hwc->event_base = x86_pmu_event_addr(hwc->idx); - hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); - } -} - -static inline int match_prev_assignment(struct hw_perf_event *hwc, - struct cpu_hw_events *cpuc, - int i) -{ - return hwc->idx == cpuc->assign[i] && - hwc->last_cpu == smp_processor_id() && - hwc->last_tag == cpuc->tags[i]; -} - -static void x86_pmu_start(struct perf_event *event, int flags); - -static void x86_pmu_enable(struct pmu *pmu) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct perf_event *event; - struct hw_perf_event *hwc; - int i, added = cpuc->n_added; - - if (!x86_pmu_initialized()) - return; - - if (cpuc->enabled) - return; - - if (cpuc->n_added) { - int n_running = cpuc->n_events - cpuc->n_added; - /* - * apply assignment obtained either from - * hw_perf_group_sched_in() or x86_pmu_enable() - * - * step1: save events moving to new counters - */ - for (i = 0; i < n_running; i++) { - event = cpuc->event_list[i]; - hwc = &event->hw; - - /* - * we can avoid reprogramming counter if: - * - assigned same counter as last time - * - running on same CPU as last time - * - no other event has used the counter since - */ - if (hwc->idx == -1 || - match_prev_assignment(hwc, cpuc, i)) - continue; - - /* - * Ensure we don't accidentally enable a stopped - * counter simply because we rescheduled. 
- */ - if (hwc->state & PERF_HES_STOPPED) - hwc->state |= PERF_HES_ARCH; - - x86_pmu_stop(event, PERF_EF_UPDATE); - } - - /* - * step2: reprogram moved events into new counters - */ - for (i = 0; i < cpuc->n_events; i++) { - event = cpuc->event_list[i]; - hwc = &event->hw; - - if (!match_prev_assignment(hwc, cpuc, i)) - x86_assign_hw_event(event, cpuc, i); - else if (i < n_running) - continue; - - if (hwc->state & PERF_HES_ARCH) - continue; - - x86_pmu_start(event, PERF_EF_RELOAD); - } - cpuc->n_added = 0; - perf_events_lapic_init(); - } - - cpuc->enabled = 1; - barrier(); - - x86_pmu.enable_all(added); -} - -static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); - -/* - * Set the next IRQ period, based on the hwc->period_left value. - * To be called with the event disabled in hw: - */ -int x86_perf_event_set_period(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - s64 left = local64_read(&hwc->period_left); - s64 period = hwc->sample_period; - int ret = 0, idx = hwc->idx; - - if (idx == INTEL_PMC_IDX_FIXED_BTS) - return 0; - - /* - * If we are way outside a reasonable range then just skip forward: - */ - if (unlikely(left <= -period)) { - left = period; - local64_set(&hwc->period_left, left); - hwc->last_period = period; - ret = 1; - } - - if (unlikely(left <= 0)) { - left += period; - local64_set(&hwc->period_left, left); - hwc->last_period = period; - ret = 1; - } - /* - * Quirk: certain CPUs dont like it if just 1 hw_event is left: - */ - if (unlikely(left < 2)) - left = 2; - - if (left > x86_pmu.max_period) - left = x86_pmu.max_period; - - if (x86_pmu.limit_period) - left = x86_pmu.limit_period(event, left); - - per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; - - if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) || - local64_read(&hwc->prev_count) != (u64)-left) { - /* - * The hw event starts counting from this event offset, - * mark it to be able to extra future deltas: - */ - local64_set(&hwc->prev_count, (u64)-left); - - wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); - } - - /* - * Due to erratum on certan cpu we need - * a second write to be sure the register - * is updated properly - */ - if (x86_pmu.perfctr_second_write) { - wrmsrl(hwc->event_base, - (u64)(-left) & x86_pmu.cntval_mask); - } - - perf_event_update_userpage(event); - - return ret; -} - -void x86_pmu_enable_event(struct perf_event *event) -{ - if (__this_cpu_read(cpu_hw_events.enabled)) - __x86_pmu_enable_event(&event->hw, - ARCH_PERFMON_EVENTSEL_ENABLE); -} - -/* - * Add a single event to the PMU. - * - * The event is added to the group of enabled events - * but only if it can be scehduled with existing events. - */ -static int x86_pmu_add(struct perf_event *event, int flags) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc; - int assign[X86_PMC_IDX_MAX]; - int n, n0, ret; - - hwc = &event->hw; - - n0 = cpuc->n_events; - ret = n = collect_events(cpuc, event, false); - if (ret < 0) - goto out; - - hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - if (!(flags & PERF_EF_START)) - hwc->state |= PERF_HES_ARCH; - - /* - * If group events scheduling transaction was started, - * skip the schedulability test here, it will be performed - * at commit time (->commit_txn) as a whole. 
- */ - if (cpuc->txn_flags & PERF_PMU_TXN_ADD) - goto done_collect; - - ret = x86_pmu.schedule_events(cpuc, n, assign); - if (ret) - goto out; - /* - * copy new assignment, now we know it is possible - * will be used by hw_perf_enable() - */ - memcpy(cpuc->assign, assign, n*sizeof(int)); - -done_collect: - /* - * Commit the collect_events() state. See x86_pmu_del() and - * x86_pmu_*_txn(). - */ - cpuc->n_events = n; - cpuc->n_added += n - n0; - cpuc->n_txn += n - n0; - - ret = 0; -out: - return ret; -} - -static void x86_pmu_start(struct perf_event *event, int flags) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int idx = event->hw.idx; - - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) - return; - - if (WARN_ON_ONCE(idx == -1)) - return; - - if (flags & PERF_EF_RELOAD) { - WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); - x86_perf_event_set_period(event); - } - - event->hw.state = 0; - - cpuc->events[idx] = event; - __set_bit(idx, cpuc->active_mask); - __set_bit(idx, cpuc->running); - x86_pmu.enable(event); - perf_event_update_userpage(event); -} - -void perf_event_print_debug(void) -{ - u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; - u64 pebs, debugctl; - struct cpu_hw_events *cpuc; - unsigned long flags; - int cpu, idx; - - if (!x86_pmu.num_counters) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_events, cpu); - - if (x86_pmu.version >= 2) { - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); - - pr_info("\n"); - pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); - pr_info("CPU#%d: status: %016llx\n", cpu, status); - pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); - pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); - if (x86_pmu.pebs_constraints) { - rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); - pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); - } - if (x86_pmu.lbr_nr) { - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl); - } - } - pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); - rdmsrl(x86_pmu_event_addr(idx), pmc_count); - - prev_left = per_cpu(pmc_prev_left[idx], cpu); - - pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", - cpu, idx, pmc_ctrl); - pr_info("CPU#%d: gen-PMC%d count: %016llx\n", - cpu, idx, pmc_count); - pr_info("CPU#%d: gen-PMC%d left: %016llx\n", - cpu, idx, prev_left); - } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); - - pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", - cpu, idx, pmc_count); - } - local_irq_restore(flags); -} - -void x86_pmu_stop(struct perf_event *event, int flags) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - - if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) { - x86_pmu.disable(event); - cpuc->events[hwc->idx] = NULL; - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - } - - if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - x86_perf_event_update(event); - hwc->state |= PERF_HES_UPTODATE; - } -} - -static void x86_pmu_del(struct perf_event *event, int flags) -{ - struct cpu_hw_events 
*cpuc = this_cpu_ptr(&cpu_hw_events); - int i; - - /* - * event is descheduled - */ - event->hw.flags &= ~PERF_X86_EVENT_COMMITTED; - - /* - * If we're called during a txn, we don't need to do anything. - * The events never got scheduled and ->cancel_txn will truncate - * the event_list. - * - * XXX assumes any ->del() called during a TXN will only be on - * an event added during that same TXN. - */ - if (cpuc->txn_flags & PERF_PMU_TXN_ADD) - return; - - /* - * Not a TXN, therefore cleanup properly. - */ - x86_pmu_stop(event, PERF_EF_UPDATE); - - for (i = 0; i < cpuc->n_events; i++) { - if (event == cpuc->event_list[i]) - break; - } - - if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */ - return; - - /* If we have a newly added event; make sure to decrease n_added. */ - if (i >= cpuc->n_events - cpuc->n_added) - --cpuc->n_added; - - if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(cpuc, event); - - /* Delete the array entry. */ - while (++i < cpuc->n_events) { - cpuc->event_list[i-1] = cpuc->event_list[i]; - cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; - } - --cpuc->n_events; - - perf_event_update_userpage(event); -} - -int x86_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - struct perf_event *event; - int idx, handled = 0; - u64 val; - - cpuc = this_cpu_ptr(&cpu_hw_events); - - /* - * Some chipsets need to unmask the LVTPC in a particular spot - * inside the nmi handler. As a result, the unmasking was pushed - * into all the nmi handlers. - * - * This generic handler doesn't seem to have any issues where the - * unmasking occurs so it was left at the top. - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active_mask)) { - /* - * Though we deactivated the counter some cpus - * might still deliver spurious interrupts still - * in flight. Catch them: - */ - if (__test_and_clear_bit(idx, cpuc->running)) - handled++; - continue; - } - - event = cpuc->events[idx]; - - val = x86_perf_event_update(event); - if (val & (1ULL << (x86_pmu.cntval_bits - 1))) - continue; - - /* - * event overflow - */ - handled++; - perf_sample_data_init(&data, 0, event->hw.last_period); - - if (!x86_perf_event_set_period(event)) - continue; - - if (perf_event_overflow(event, &data, regs)) - x86_pmu_stop(event, 0); - } - - if (handled) - inc_irq_stat(apic_perf_irqs); - - return handled; -} - -void perf_events_lapic_init(void) -{ - if (!x86_pmu.apic || !x86_pmu_initialized()) - return; - - /* - * Always use NMI for PMU - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); -} - -static int -perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) -{ - u64 start_clock; - u64 finish_clock; - int ret; - - /* - * All PMUs/events that share this PMI handler should make sure to - * increment active_events for their events. 
- */ - if (!atomic_read(&active_events)) - return NMI_DONE; - - start_clock = sched_clock(); - ret = x86_pmu.handle_irq(regs); - finish_clock = sched_clock(); - - perf_sample_event_took(finish_clock - start_clock); - - return ret; -} -NOKPROBE_SYMBOL(perf_event_nmi_handler); - -struct event_constraint emptyconstraint; -struct event_constraint unconstrained; - -static int -x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - int i, ret = NOTIFY_OK; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) - cpuc->kfree_on_online[i] = NULL; - if (x86_pmu.cpu_prepare) - ret = x86_pmu.cpu_prepare(cpu); - break; - - case CPU_STARTING: - if (x86_pmu.cpu_starting) - x86_pmu.cpu_starting(cpu); - break; - - case CPU_ONLINE: - for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) { - kfree(cpuc->kfree_on_online[i]); - cpuc->kfree_on_online[i] = NULL; - } - break; - - case CPU_DYING: - if (x86_pmu.cpu_dying) - x86_pmu.cpu_dying(cpu); - break; - - case CPU_UP_CANCELED: - case CPU_DEAD: - if (x86_pmu.cpu_dead) - x86_pmu.cpu_dead(cpu); - break; - - default: - break; - } - - return ret; -} - -static void __init pmu_check_apic(void) -{ - if (cpu_has_apic) - return; - - x86_pmu.apic = 0; - pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); - pr_info("no hardware sampling interrupt available.\n"); - - /* - * If we have a PMU initialized but no APIC - * interrupts, we cannot sample hardware - * events (user-space has to fall back and - * sample via a hrtimer based software event): - */ - pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; - -} - -static struct attribute_group x86_pmu_format_group = { - .name = "format", - .attrs = NULL, -}; - -/* - * Remove all undefined events (x86_pmu.event_map(id) == 0) - * out of events_attr attributes. - */ -static void __init filter_events(struct attribute **attrs) -{ - struct device_attribute *d; - struct perf_pmu_events_attr *pmu_attr; - int offset = 0; - int i, j; - - for (i = 0; attrs[i]; i++) { - d = (struct device_attribute *)attrs[i]; - pmu_attr = container_of(d, struct perf_pmu_events_attr, attr); - /* str trumps id */ - if (pmu_attr->event_str) - continue; - if (x86_pmu.event_map(i + offset)) - continue; - - for (j = i; attrs[j]; j++) - attrs[j] = attrs[j + 1]; - - /* Check the shifted attr. */ - i--; - - /* - * event_map() is index based, the attrs array is organized - * by increasing event index. 
If we shift the events, then - * we need to compensate for the event_map(), otherwise - * we are looking up the wrong event in the map - */ - offset++; - } -} - -/* Merge two pointer arrays */ -__init struct attribute **merge_attr(struct attribute **a, struct attribute **b) -{ - struct attribute **new; - int j, i; - - for (j = 0; a[j]; j++) - ; - for (i = 0; b[i]; i++) - j++; - j++; - - new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL); - if (!new) - return NULL; - - j = 0; - for (i = 0; a[i]; i++) - new[j++] = a[i]; - for (i = 0; b[i]; i++) - new[j++] = b[i]; - new[j] = NULL; - - return new; -} - -ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, - char *page) -{ - struct perf_pmu_events_attr *pmu_attr = \ - container_of(attr, struct perf_pmu_events_attr, attr); - u64 config = x86_pmu.event_map(pmu_attr->id); - - /* string trumps id */ - if (pmu_attr->event_str) - return sprintf(page, "%s", pmu_attr->event_str); - - return x86_pmu.events_sysfs_show(page, config); -} - -EVENT_ATTR(cpu-cycles, CPU_CYCLES ); -EVENT_ATTR(instructions, INSTRUCTIONS ); -EVENT_ATTR(cache-references, CACHE_REFERENCES ); -EVENT_ATTR(cache-misses, CACHE_MISSES ); -EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); -EVENT_ATTR(branch-misses, BRANCH_MISSES ); -EVENT_ATTR(bus-cycles, BUS_CYCLES ); -EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); -EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); -EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); - -static struct attribute *empty_attrs; - -static struct attribute *events_attr[] = { - EVENT_PTR(CPU_CYCLES), - EVENT_PTR(INSTRUCTIONS), - EVENT_PTR(CACHE_REFERENCES), - EVENT_PTR(CACHE_MISSES), - EVENT_PTR(BRANCH_INSTRUCTIONS), - EVENT_PTR(BRANCH_MISSES), - EVENT_PTR(BUS_CYCLES), - EVENT_PTR(STALLED_CYCLES_FRONTEND), - EVENT_PTR(STALLED_CYCLES_BACKEND), - EVENT_PTR(REF_CPU_CYCLES), - NULL, -}; - -static struct attribute_group x86_pmu_events_group = { - .name = "events", - .attrs = events_attr, -}; - -ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) -{ - u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; - u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; - bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); - bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); - bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); - bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); - ssize_t ret; - - /* - * We have whole page size to spend and just little data - * to write, so we can safely use sprintf. 
- */ - ret = sprintf(page, "event=0x%02llx", event); - - if (umask) - ret += sprintf(page + ret, ",umask=0x%02llx", umask); - - if (edge) - ret += sprintf(page + ret, ",edge"); - - if (pc) - ret += sprintf(page + ret, ",pc"); - - if (any) - ret += sprintf(page + ret, ",any"); - - if (inv) - ret += sprintf(page + ret, ",inv"); - - if (cmask) - ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); - - ret += sprintf(page + ret, "\n"); - - return ret; -} - -static int __init init_hw_perf_events(void) -{ - struct x86_pmu_quirk *quirk; - int err; - - pr_info("Performance Events: "); - - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - err = intel_pmu_init(); - break; - case X86_VENDOR_AMD: - err = amd_pmu_init(); - break; - default: - err = -ENOTSUPP; - } - if (err != 0) { - pr_cont("no PMU driver, software events only.\n"); - return 0; - } - - pmu_check_apic(); - - /* sanity check that the hardware exists or is emulated */ - if (!check_hw_exists()) - return 0; - - pr_cont("%s PMU driver.\n", x86_pmu.name); - - x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ - - for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) - quirk->func(); - - if (!x86_pmu.intel_ctrl) - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; - - perf_events_lapic_init(); - register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); - - unconstrained = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, - 0, x86_pmu.num_counters, 0, 0); - - x86_pmu_format_group.attrs = x86_pmu.format_attrs; - - if (x86_pmu.event_attrs) - x86_pmu_events_group.attrs = x86_pmu.event_attrs; - - if (!x86_pmu.events_sysfs_show) - x86_pmu_events_group.attrs = &empty_attrs; - else - filter_events(x86_pmu_events_group.attrs); - - if (x86_pmu.cpu_events) { - struct attribute **tmp; - - tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events); - if (!WARN_ON(!tmp)) - x86_pmu_events_group.attrs = tmp; - } - - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.cntval_bits); - pr_info("... generic registers: %d\n", x86_pmu.num_counters); - pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); - pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); - - perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); - perf_cpu_notifier(x86_pmu_notifier); - - return 0; -} -early_initcall(init_hw_perf_events); - -static inline void x86_pmu_read(struct perf_event *event) -{ - x86_perf_event_update(event); -} - -/* - * Start group events scheduling transaction - * Set the flag to make pmu::enable() not perform the - * schedulability test, it will be performed at commit time - * - * We only support PERF_PMU_TXN_ADD transactions. Save the - * transaction flags but otherwise ignore non-PERF_PMU_TXN_ADD - * transactions. - */ -static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - WARN_ON_ONCE(cpuc->txn_flags); /* txn already in flight */ - - cpuc->txn_flags = txn_flags; - if (txn_flags & ~PERF_PMU_TXN_ADD) - return; - - perf_pmu_disable(pmu); - __this_cpu_write(cpu_hw_events.n_txn, 0); -} - -/* - * Stop group events scheduling transaction - * Clear the flag and pmu::enable() will perform the - * schedulability test. 
- */ -static void x86_pmu_cancel_txn(struct pmu *pmu) -{ - unsigned int txn_flags; - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ - - txn_flags = cpuc->txn_flags; - cpuc->txn_flags = 0; - if (txn_flags & ~PERF_PMU_TXN_ADD) - return; - - /* - * Truncate collected array by the number of events added in this - * transaction. See x86_pmu_add() and x86_pmu_*_txn(). - */ - __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); - __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); - perf_pmu_enable(pmu); -} - -/* - * Commit group events scheduling transaction - * Perform the group schedulability test as a whole - * Return 0 if success - * - * Does not cancel the transaction on failure; expects the caller to do this. - */ -static int x86_pmu_commit_txn(struct pmu *pmu) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int assign[X86_PMC_IDX_MAX]; - int n, ret; - - WARN_ON_ONCE(!cpuc->txn_flags); /* no txn in flight */ - - if (cpuc->txn_flags & ~PERF_PMU_TXN_ADD) { - cpuc->txn_flags = 0; - return 0; - } - - n = cpuc->n_events; - - if (!x86_pmu_initialized()) - return -EAGAIN; - - ret = x86_pmu.schedule_events(cpuc, n, assign); - if (ret) - return ret; - - /* - * copy new assignment, now we know it is possible - * will be used by hw_perf_enable() - */ - memcpy(cpuc->assign, assign, n*sizeof(int)); - - cpuc->txn_flags = 0; - perf_pmu_enable(pmu); - return 0; -} -/* - * a fake_cpuc is used to validate event groups. Due to - * the extra reg logic, we need to also allocate a fake - * per_core and per_cpu structure. Otherwise, group events - * using extra reg may conflict without the kernel being - * able to catch this when the last event gets added to - * the group. - */ -static void free_fake_cpuc(struct cpu_hw_events *cpuc) -{ - kfree(cpuc->shared_regs); - kfree(cpuc); -} - -static struct cpu_hw_events *allocate_fake_cpuc(void) -{ - struct cpu_hw_events *cpuc; - int cpu = raw_smp_processor_id(); - - cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL); - if (!cpuc) - return ERR_PTR(-ENOMEM); - - /* only needed, if we have extra_regs */ - if (x86_pmu.extra_regs) { - cpuc->shared_regs = allocate_shared_regs(cpu); - if (!cpuc->shared_regs) - goto error; - } - cpuc->is_fake = 1; - return cpuc; -error: - free_fake_cpuc(cpuc); - return ERR_PTR(-ENOMEM); -} - -/* - * validate that we can schedule this event - */ -static int validate_event(struct perf_event *event) -{ - struct cpu_hw_events *fake_cpuc; - struct event_constraint *c; - int ret = 0; - - fake_cpuc = allocate_fake_cpuc(); - if (IS_ERR(fake_cpuc)) - return PTR_ERR(fake_cpuc); - - c = x86_pmu.get_event_constraints(fake_cpuc, -1, event); - - if (!c || !c->weight) - ret = -EINVAL; - - if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(fake_cpuc, event); - - free_fake_cpuc(fake_cpuc); - - return ret; -} - -/* - * validate a single event group - * - * validation include: - * - check events are compatible which each other - * - events do not compete for the same counter - * - number of events <= number of counters - * - * validation ensures the group can be loaded onto the - * PMU if it was the only group available. 
- */ -static int validate_group(struct perf_event *event) -{ - struct perf_event *leader = event->group_leader; - struct cpu_hw_events *fake_cpuc; - int ret = -EINVAL, n; - - fake_cpuc = allocate_fake_cpuc(); - if (IS_ERR(fake_cpuc)) - return PTR_ERR(fake_cpuc); - /* - * the event is not yet connected with its - * siblings therefore we must first collect - * existing siblings, then add the new event - * before we can simulate the scheduling - */ - n = collect_events(fake_cpuc, leader, true); - if (n < 0) - goto out; - - fake_cpuc->n_events = n; - n = collect_events(fake_cpuc, event, false); - if (n < 0) - goto out; - - fake_cpuc->n_events = n; - - ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); - -out: - free_fake_cpuc(fake_cpuc); - return ret; -} - -static int x86_pmu_event_init(struct perf_event *event) -{ - struct pmu *tmp; - int err; - - switch (event->attr.type) { - case PERF_TYPE_RAW: - case PERF_TYPE_HARDWARE: - case PERF_TYPE_HW_CACHE: - break; - - default: - return -ENOENT; - } - - err = __x86_pmu_event_init(event); - if (!err) { - /* - * we temporarily connect event to its pmu - * such that validate_group() can classify - * it as an x86 event using is_x86_event() - */ - tmp = event->pmu; - event->pmu = &pmu; - - if (event->group_leader != event) - err = validate_group(event); - else - err = validate_event(event); - - event->pmu = tmp; - } - if (err) { - if (event->destroy) - event->destroy(event); - } - - if (ACCESS_ONCE(x86_pmu.attr_rdpmc)) - event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; - - return err; -} - -static void refresh_pce(void *ignored) -{ - if (current->mm) - load_mm_cr4(current->mm); -} - -static void x86_pmu_event_mapped(struct perf_event *event) -{ - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) - return; - - if (atomic_inc_return(¤t->mm->context.perf_rdpmc_allowed) == 1) - on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); -} - -static void x86_pmu_event_unmapped(struct perf_event *event) -{ - if (!current->mm) - return; - - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) - return; - - if (atomic_dec_and_test(¤t->mm->context.perf_rdpmc_allowed)) - on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); -} - -static int x86_pmu_event_idx(struct perf_event *event) -{ - int idx = event->hw.idx; - - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) - return 0; - - if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { - idx -= INTEL_PMC_IDX_FIXED; - idx |= 1 << 30; - } - - return idx + 1; -} - -static ssize_t get_attr_rdpmc(struct device *cdev, - struct device_attribute *attr, - char *buf) -{ - return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); -} - -static ssize_t set_attr_rdpmc(struct device *cdev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - ssize_t ret; - - ret = kstrtoul(buf, 0, &val); - if (ret) - return ret; - - if (val > 2) - return -EINVAL; - - if (x86_pmu.attr_rdpmc_broken) - return -ENOTSUPP; - - if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) { - /* - * Changing into or out of always available, aka - * perf-event-bypassing mode. This path is extremely slow, - * but only root can trigger it, so it's okay. 
- */ - if (val == 2) - static_key_slow_inc(&rdpmc_always_available); - else - static_key_slow_dec(&rdpmc_always_available); - on_each_cpu(refresh_pce, NULL, 1); - } - - x86_pmu.attr_rdpmc = val; - - return count; -} - -static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); - -static struct attribute *x86_pmu_attrs[] = { - &dev_attr_rdpmc.attr, - NULL, -}; - -static struct attribute_group x86_pmu_attr_group = { - .attrs = x86_pmu_attrs, -}; - -static const struct attribute_group *x86_pmu_attr_groups[] = { - &x86_pmu_attr_group, - &x86_pmu_format_group, - &x86_pmu_events_group, - NULL, -}; - -static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) -{ - if (x86_pmu.sched_task) - x86_pmu.sched_task(ctx, sched_in); -} - -void perf_check_microcode(void) -{ - if (x86_pmu.check_microcode) - x86_pmu.check_microcode(); -} -EXPORT_SYMBOL_GPL(perf_check_microcode); - -static struct pmu pmu = { - .pmu_enable = x86_pmu_enable, - .pmu_disable = x86_pmu_disable, - - .attr_groups = x86_pmu_attr_groups, - - .event_init = x86_pmu_event_init, - - .event_mapped = x86_pmu_event_mapped, - .event_unmapped = x86_pmu_event_unmapped, - - .add = x86_pmu_add, - .del = x86_pmu_del, - .start = x86_pmu_start, - .stop = x86_pmu_stop, - .read = x86_pmu_read, - - .start_txn = x86_pmu_start_txn, - .cancel_txn = x86_pmu_cancel_txn, - .commit_txn = x86_pmu_commit_txn, - - .event_idx = x86_pmu_event_idx, - .sched_task = x86_pmu_sched_task, - .task_ctx_size = sizeof(struct x86_perf_task_context), -}; - -void arch_perf_update_userpage(struct perf_event *event, - struct perf_event_mmap_page *userpg, u64 now) -{ - struct cyc2ns_data *data; - - userpg->cap_user_time = 0; - userpg->cap_user_time_zero = 0; - userpg->cap_user_rdpmc = - !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED); - userpg->pmc_width = x86_pmu.cntval_bits; - - if (!sched_clock_stable()) - return; - - data = cyc2ns_read_begin(); - - /* - * Internal timekeeping for enabled/running/stopped times - * is always in the local_clock domain. - */ - userpg->cap_user_time = 1; - userpg->time_mult = data->cyc2ns_mul; - userpg->time_shift = data->cyc2ns_shift; - userpg->time_offset = data->cyc2ns_offset - now; - - /* - * cap_user_time_zero doesn't make sense when we're using a different - * time base for the records. 
- */ - if (event->clock == &local_clock) { - userpg->cap_user_time_zero = 1; - userpg->time_zero = data->cyc2ns_offset; - } - - cyc2ns_read_end(data); -} - -/* - * callchain support - */ - -static int backtrace_stack(void *data, char *name) -{ - return 0; -} - -static void backtrace_address(void *data, unsigned long addr, int reliable) -{ - struct perf_callchain_entry *entry = data; - - perf_callchain_store(entry, addr); -} - -static const struct stacktrace_ops backtrace_ops = { - .stack = backtrace_stack, - .address = backtrace_address, - .walk_stack = print_context_stack_bp, -}; - -void -perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) -{ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - /* TODO: We don't support guest os callchain now */ - return; - } - - perf_callchain_store(entry, regs->ip); - - dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); -} - -static inline int -valid_user_frame(const void __user *fp, unsigned long size) -{ - return (__range_not_ok(fp, size, TASK_SIZE) == 0); -} - -static unsigned long get_segment_base(unsigned int segment) -{ - struct desc_struct *desc; - int idx = segment >> 3; - - if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { -#ifdef CONFIG_MODIFY_LDT_SYSCALL - struct ldt_struct *ldt; - - if (idx > LDT_ENTRIES) - return 0; - - /* IRQs are off, so this synchronizes with smp_store_release */ - ldt = lockless_dereference(current->active_mm->context.ldt); - if (!ldt || idx > ldt->size) - return 0; - - desc = &ldt->entries[idx]; -#else - return 0; -#endif - } else { - if (idx > GDT_ENTRIES) - return 0; - - desc = raw_cpu_ptr(gdt_page.gdt) + idx; - } - - return get_desc_base(desc); -} - -#ifdef CONFIG_IA32_EMULATION - -#include - -static inline int -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - /* 32-bit process in 64-bit kernel. */ - unsigned long ss_base, cs_base; - struct stack_frame_ia32 frame; - const void __user *fp; - - if (!test_thread_flag(TIF_IA32)) - return 0; - - cs_base = get_segment_base(regs->cs); - ss_base = get_segment_base(regs->ss); - - fp = compat_ptr(ss_base + regs->bp); - pagefault_disable(); - while (entry->nr < PERF_MAX_STACK_DEPTH) { - unsigned long bytes; - frame.next_frame = 0; - frame.return_address = 0; - - if (!access_ok(VERIFY_READ, fp, 8)) - break; - - bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4); - if (bytes != 0) - break; - bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4); - if (bytes != 0) - break; - - if (!valid_user_frame(fp, sizeof(frame))) - break; - - perf_callchain_store(entry, cs_base + frame.return_address); - fp = compat_ptr(ss_base + frame.next_frame); - } - pagefault_enable(); - return 1; -} -#else -static inline int -perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - return 0; -} -#endif - -void -perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) -{ - struct stack_frame frame; - const void __user *fp; - - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - /* TODO: We don't support guest os callchain now */ - return; - } - - /* - * We don't know what to do with VM86 stacks.. ignore them for now. 
- */ - if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) - return; - - fp = (void __user *)regs->bp; - - perf_callchain_store(entry, regs->ip); - - if (!current->mm) - return; - - if (perf_callchain_user32(regs, entry)) - return; - - pagefault_disable(); - while (entry->nr < PERF_MAX_STACK_DEPTH) { - unsigned long bytes; - frame.next_frame = NULL; - frame.return_address = 0; - - if (!access_ok(VERIFY_READ, fp, 16)) - break; - - bytes = __copy_from_user_nmi(&frame.next_frame, fp, 8); - if (bytes != 0) - break; - bytes = __copy_from_user_nmi(&frame.return_address, fp+8, 8); - if (bytes != 0) - break; - - if (!valid_user_frame(fp, sizeof(frame))) - break; - - perf_callchain_store(entry, frame.return_address); - fp = (void __user *)frame.next_frame; - } - pagefault_enable(); -} - -/* - * Deal with code segment offsets for the various execution modes: - * - * VM86 - the good olde 16 bit days, where the linear address is - * 20 bits and we use regs->ip + 0x10 * regs->cs. - * - * IA32 - Where we need to look at GDT/LDT segment descriptor tables - * to figure out what the 32bit base address is. - * - * X32 - has TIF_X32 set, but is running in x86_64 - * - * X86_64 - CS,DS,SS,ES are all zero based. - */ -static unsigned long code_segment_base(struct pt_regs *regs) -{ - /* - * For IA32 we look at the GDT/LDT segment base to convert the - * effective IP to a linear address. - */ - -#ifdef CONFIG_X86_32 - /* - * If we are in VM86 mode, add the segment offset to convert to a - * linear address. - */ - if (regs->flags & X86_VM_MASK) - return 0x10 * regs->cs; - - if (user_mode(regs) && regs->cs != __USER_CS) - return get_segment_base(regs->cs); -#else - if (user_mode(regs) && !user_64bit_mode(regs) && - regs->cs != __USER32_CS) - return get_segment_base(regs->cs); -#endif - return 0; -} - -unsigned long perf_instruction_pointer(struct pt_regs *regs) -{ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - return perf_guest_cbs->get_guest_ip(); - - return regs->ip + code_segment_base(regs); -} - -unsigned long perf_misc_flags(struct pt_regs *regs) -{ - int misc = 0; - - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - if (perf_guest_cbs->is_user_mode()) - misc |= PERF_RECORD_MISC_GUEST_USER; - else - misc |= PERF_RECORD_MISC_GUEST_KERNEL; - } else { - if (user_mode(regs)) - misc |= PERF_RECORD_MISC_USER; - else - misc |= PERF_RECORD_MISC_KERNEL; - } - - if (regs->flags & PERF_EFLAGS_EXACT) - misc |= PERF_RECORD_MISC_EXACT_IP; - - return misc; -} - -void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) -{ - cap->version = x86_pmu.version; - cap->num_counters_gp = x86_pmu.num_counters; - cap->num_counters_fixed = x86_pmu.num_counters_fixed; - cap->bit_width_gp = x86_pmu.cntval_bits; - cap->bit_width_fixed = x86_pmu.cntval_bits; - cap->events_mask = (unsigned int)x86_pmu.events_maskl; - cap->events_mask_len = x86_pmu.events_mask_len; -} -EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); -- cgit v0.10.2 From 39b0332a215832ce3a8f8f57344da4a64370e3ca Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 8 Feb 2016 17:09:05 +0100 Subject: perf/x86: Move perf_event_amd.c ........... => x86/events/amd/core.c We distribute those in vendor subdirs, starting with .../events/amd/. 
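For orientation, the arch/x86/events/Makefile ends up as follows after this patch; this is a reconstruction from the Makefile hunk below, shown only as a reference for the resulting file, and the object list will keep growing as later patches move the other vendor files:

    # arch/x86/events/Makefile, post-patch state (reconstructed)
    obj-y                              += core.o

    obj-$(CONFIG_CPU_SUP_AMD)          += amd/core.o
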
Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1454947748-28629-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 3fad3ce..e0560b6 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1 +1,3 @@ obj-y += core.o + +obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c new file mode 100644 index 0000000..51b1658 --- /dev/null +++ b/arch/x86/events/amd/core.c @@ -0,0 +1,731 @@ +#include +#include +#include +#include +#include +#include + +#include "../../kernel/cpu/perf_event.h" + +static __initconst const u64 amd_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ + [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ + [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ + [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ + [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ + [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */ + [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +/* + * AMD Performance Monitor K7 and later. 
+ */ +static const u64 amd_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ +}; + +static u64 amd_pmu_event_map(int hw_event) +{ + return amd_perfmon_event_map[hw_event]; +} + +/* + * Previously calculated offsets + */ +static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly; +static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly; + +/* + * Legacy CPUs: + * 4 counters starting at 0xc0010000 each offset by 1 + * + * CPUs with core performance counter extensions: + * 6 counters starting at 0xc0010200 each offset by 2 + */ +static inline int amd_pmu_addr_offset(int index, bool eventsel) +{ + int offset; + + if (!index) + return index; + + if (eventsel) + offset = event_offsets[index]; + else + offset = count_offsets[index]; + + if (offset) + return offset; + + if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) + offset = index; + else + offset = index << 1; + + if (eventsel) + event_offsets[index] = offset; + else + count_offsets[index] = offset; + + return offset; +} + +static int amd_core_hw_config(struct perf_event *event) +{ + if (event->attr.exclude_host && event->attr.exclude_guest) + /* + * When HO == GO == 1 the hardware treats that as GO == HO == 0 + * and will count in both modes. We don't want to count in that + * case so we emulate no-counting by setting US = OS = 0. + */ + event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR | + ARCH_PERFMON_EVENTSEL_OS); + else if (event->attr.exclude_host) + event->hw.config |= AMD64_EVENTSEL_GUESTONLY; + else if (event->attr.exclude_guest) + event->hw.config |= AMD64_EVENTSEL_HOSTONLY; + + return 0; +} + +/* + * AMD64 events are detected based on their event codes. + */ +static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc) +{ + return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff); +} + +static inline int amd_is_nb_event(struct hw_perf_event *hwc) +{ + return (hwc->config & 0xe0) == 0xe0; +} + +static inline int amd_has_nb(struct cpu_hw_events *cpuc) +{ + struct amd_nb *nb = cpuc->amd_nb; + + return nb && nb->nb_id != -1; +} + +static int amd_pmu_hw_config(struct perf_event *event) +{ + int ret; + + /* pass precise event sampling to ibs: */ + if (event->attr.precise_ip && get_ibs_caps()) + return -ENOENT; + + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + ret = x86_pmu_hw_config(event); + if (ret) + return ret; + + if (event->attr.type == PERF_TYPE_RAW) + event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK; + + return amd_core_hw_config(event); +} + +static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct amd_nb *nb = cpuc->amd_nb; + int i; + + /* + * need to scan whole list because event may not have + * been assigned during scheduling + * + * no race condition possible because event can only + * be removed on one CPU at a time AND PMU is disabled + * when we come here + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + if (cmpxchg(nb->owners + i, event, NULL) == event) + break; + } +} + + /* + * AMD64 NorthBridge events need special treatment because + * counter access needs to be synchronized across all cores + * of a package. 
Refer to BKDG section 3.12 + * + * NB events are events measuring L3 cache, Hypertransport + * traffic. They are identified by an event code >= 0xe00. + * They measure events on the NorthBride which is shared + * by all cores on a package. NB events are counted on a + * shared set of counters. When a NB event is programmed + * in a counter, the data actually comes from a shared + * counter. Thus, access to those counters needs to be + * synchronized. + * + * We implement the synchronization such that no two cores + * can be measuring NB events using the same counters. Thus, + * we maintain a per-NB allocation table. The available slot + * is propagated using the event_constraint structure. + * + * We provide only one choice for each NB event based on + * the fact that only NB events have restrictions. Consequently, + * if a counter is available, there is a guarantee the NB event + * will be assigned to it. If no slot is available, an empty + * constraint is returned and scheduling will eventually fail + * for this event. + * + * Note that all cores attached the same NB compete for the same + * counters to host NB events, this is why we use atomic ops. Some + * multi-chip CPUs may have more than one NB. + * + * Given that resources are allocated (cmpxchg), they must be + * eventually freed for others to use. This is accomplished by + * calling __amd_put_nb_event_constraints() + * + * Non NB events are not impacted by this restriction. + */ +static struct event_constraint * +__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, + struct event_constraint *c) +{ + struct hw_perf_event *hwc = &event->hw; + struct amd_nb *nb = cpuc->amd_nb; + struct perf_event *old; + int idx, new = -1; + + if (!c) + c = &unconstrained; + + if (cpuc->is_fake) + return c; + + /* + * detect if already present, if so reuse + * + * cannot merge with actual allocation + * because of possible holes + * + * event can already be present yet not assigned (in hwc->idx) + * because of successive calls to x86_schedule_events() from + * hw_perf_group_sched_in() without hw_perf_enable() + */ + for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) { + if (new == -1 || hwc->idx == idx) + /* assign free slot, prefer hwc->idx */ + old = cmpxchg(nb->owners + idx, NULL, event); + else if (nb->owners[idx] == event) + /* event already present */ + old = event; + else + continue; + + if (old && old != event) + continue; + + /* reassign to this slot */ + if (new != -1) + cmpxchg(nb->owners + new, event, NULL); + new = idx; + + /* already present, reuse */ + if (old == event) + break; + } + + if (new == -1) + return &emptyconstraint; + + return &nb->event_constraints[new]; +} + +static struct amd_nb *amd_alloc_nb(int cpu) +{ + struct amd_nb *nb; + int i; + + nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu)); + if (!nb) + return NULL; + + nb->nb_id = -1; + + /* + * initialize all possible NB constraints + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + __set_bit(i, nb->event_constraints[i].idxmsk); + nb->event_constraints[i].weight = 1; + } + return nb; +} + +static int amd_pmu_cpu_prepare(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + WARN_ON_ONCE(cpuc->amd_nb); + + if (boot_cpu_data.x86_max_cores < 2) + return NOTIFY_OK; + + cpuc->amd_nb = amd_alloc_nb(cpu); + if (!cpuc->amd_nb) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + +static void amd_pmu_cpu_starting(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + void **onln = 
&cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; + struct amd_nb *nb; + int i, nb_id; + + cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + nb_id = amd_get_nb_id(cpu); + WARN_ON_ONCE(nb_id == BAD_APICID); + + for_each_online_cpu(i) { + nb = per_cpu(cpu_hw_events, i).amd_nb; + if (WARN_ON_ONCE(!nb)) + continue; + + if (nb->nb_id == nb_id) { + *onln = cpuc->amd_nb; + cpuc->amd_nb = nb; + break; + } + } + + cpuc->amd_nb->nb_id = nb_id; + cpuc->amd_nb->refcnt++; +} + +static void amd_pmu_cpu_dead(int cpu) +{ + struct cpu_hw_events *cpuhw; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + cpuhw = &per_cpu(cpu_hw_events, cpu); + + if (cpuhw->amd_nb) { + struct amd_nb *nb = cpuhw->amd_nb; + + if (nb->nb_id == -1 || --nb->refcnt == 0) + kfree(nb); + + cpuhw->amd_nb = NULL; + } +} + +static struct event_constraint * +amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + /* + * if not NB event or no NB, then no constraints + */ + if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))) + return &unconstrained; + + return __amd_get_nb_event_constraints(cpuc, event, NULL); +} + +static void amd_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)) + __amd_put_nb_event_constraints(cpuc, event); +} + +PMU_FORMAT_ATTR(event, "config:0-7,32-35"); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *amd_format_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +/* AMD Family 15h */ + +#define AMD_EVENT_TYPE_MASK 0x000000F0ULL + +#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL +#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL +#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL +#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL +#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL +#define AMD_EVENT_EX_LS 0x000000C0ULL +#define AMD_EVENT_DE 0x000000D0ULL +#define AMD_EVENT_NB 0x000000E0ULL ... 
0x000000F0ULL + +/* + * AMD family 15h event code/PMC mappings: + * + * type = event_code & 0x0F0: + * + * 0x000 FP PERF_CTL[5:3] + * 0x010 FP PERF_CTL[5:3] + * 0x020 LS PERF_CTL[5:0] + * 0x030 LS PERF_CTL[5:0] + * 0x040 DC PERF_CTL[5:0] + * 0x050 DC PERF_CTL[5:0] + * 0x060 CU PERF_CTL[2:0] + * 0x070 CU PERF_CTL[2:0] + * 0x080 IC/DE PERF_CTL[2:0] + * 0x090 IC/DE PERF_CTL[2:0] + * 0x0A0 --- + * 0x0B0 --- + * 0x0C0 EX/LS PERF_CTL[5:0] + * 0x0D0 DE PERF_CTL[2:0] + * 0x0E0 NB NB_PERF_CTL[3:0] + * 0x0F0 NB NB_PERF_CTL[3:0] + * + * Exceptions: + * + * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*) + * 0x003 FP PERF_CTL[3] + * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*) + * 0x00B FP PERF_CTL[3] + * 0x00D FP PERF_CTL[3] + * 0x023 DE PERF_CTL[2:0] + * 0x02D LS PERF_CTL[3] + * 0x02E LS PERF_CTL[3,0] + * 0x031 LS PERF_CTL[2:0] (**) + * 0x043 CU PERF_CTL[2:0] + * 0x045 CU PERF_CTL[2:0] + * 0x046 CU PERF_CTL[2:0] + * 0x054 CU PERF_CTL[2:0] + * 0x055 CU PERF_CTL[2:0] + * 0x08F IC PERF_CTL[0] + * 0x187 DE PERF_CTL[0] + * 0x188 DE PERF_CTL[0] + * 0x0DB EX PERF_CTL[5:0] + * 0x0DC LS PERF_CTL[5:0] + * 0x0DD LS PERF_CTL[5:0] + * 0x0DE LS PERF_CTL[5:0] + * 0x0DF LS PERF_CTL[5:0] + * 0x1C0 EX PERF_CTL[5:3] + * 0x1D6 EX PERF_CTL[5:0] + * 0x1D8 EX PERF_CTL[5:0] + * + * (*) depending on the umask all FPU counters may be used + * (**) only one unitmask enabled at a time + */ + +static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); +static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); +static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); +static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); +static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); +static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); + +static struct event_constraint * +amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned int event_code = amd_get_event_code(hwc); + + switch (event_code & AMD_EVENT_TYPE_MASK) { + case AMD_EVENT_FP: + switch (event_code) { + case 0x000: + if (!(hwc->config & 0x0000F000ULL)) + break; + if (!(hwc->config & 0x00000F00ULL)) + break; + return &amd_f15_PMC3; + case 0x004: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + break; + return &amd_f15_PMC3; + case 0x003: + case 0x00B: + case 0x00D: + return &amd_f15_PMC3; + } + return &amd_f15_PMC53; + case AMD_EVENT_LS: + case AMD_EVENT_DC: + case AMD_EVENT_EX_LS: + switch (event_code) { + case 0x023: + case 0x043: + case 0x045: + case 0x046: + case 0x054: + case 0x055: + return &amd_f15_PMC20; + case 0x02D: + return &amd_f15_PMC3; + case 0x02E: + return &amd_f15_PMC30; + case 0x031: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + return &amd_f15_PMC20; + return &emptyconstraint; + case 0x1C0: + return &amd_f15_PMC53; + default: + return &amd_f15_PMC50; + } + case AMD_EVENT_CU: + case AMD_EVENT_IC_DE: + case AMD_EVENT_DE: + switch (event_code) { + case 0x08F: + case 0x187: + case 0x188: + return &amd_f15_PMC0; + case 0x0DB ... 
0x0DF: + case 0x1D6: + case 0x1D8: + return &amd_f15_PMC50; + default: + return &amd_f15_PMC20; + } + case AMD_EVENT_NB: + /* moved to perf_event_amd_uncore.c */ + return &emptyconstraint; + default: + return &emptyconstraint; + } +} + +static ssize_t amd_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | + (config & AMD64_EVENTSEL_EVENT) >> 24; + + return x86_event_sysfs_show(page, config, event); +} + +static __initconst const struct x86_pmu amd_pmu = { + .name = "AMD", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .hw_config = amd_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .addr_offset = amd_pmu_addr_offset, + .event_map = amd_pmu_event_map, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_counters = AMD64_NUM_COUNTERS, + .cntval_bits = 48, + .cntval_mask = (1ULL << 48) - 1, + .apic = 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = amd_get_event_constraints, + .put_event_constraints = amd_put_event_constraints, + + .format_attrs = amd_format_attr, + .events_sysfs_show = amd_event_sysfs_show, + + .cpu_prepare = amd_pmu_cpu_prepare, + .cpu_starting = amd_pmu_cpu_starting, + .cpu_dead = amd_pmu_cpu_dead, +}; + +static int __init amd_core_pmu_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) + return 0; + + switch (boot_cpu_data.x86) { + case 0x15: + pr_cont("Fam15h "); + x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; + break; + + default: + pr_err("core perfctr but no constraints; unknown hardware!\n"); + return -ENODEV; + } + + /* + * If core performance counter extensions exists, we must use + * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also + * amd_pmu_addr_offset(). + */ + x86_pmu.eventsel = MSR_F15H_PERF_CTL; + x86_pmu.perfctr = MSR_F15H_PERF_CTR; + x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + + pr_cont("core perfctr, "); + return 0; +} + +__init int amd_pmu_init(void) +{ + int ret; + + /* Performance-monitoring supported from K7 and later: */ + if (boot_cpu_data.x86 < 6) + return -ENODEV; + + x86_pmu = amd_pmu; + + ret = amd_core_pmu_init(); + if (ret) + return ret; + + /* Events are common for all AMDs */ + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; +} + +void amd_pmu_enable_virt(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + cpuc->perf_ctr_virt_mask = 0; + + /* Reload all events */ + x86_pmu_disable_all(); + x86_pmu_enable_all(0); +} +EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); + +void amd_pmu_disable_virt(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * We only mask out the Host-only bit so that host-only counting works + * when SVM is disabled. If someone sets up a guest-only counter when + * SVM is disabled the Guest-only bits still gets set and the counter + * will not count anything. 
+ */ + cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; + + /* Reload all events */ + x86_pmu_disable_all(); + x86_pmu_enable_all(0); +} +EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 77000d5..d549b02 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -31,7 +31,7 @@ obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS -obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o perf_event_amd_uncore.o +obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_uncore.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o endif diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c deleted file mode 100644 index 5861053..0000000 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ /dev/null @@ -1,731 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "perf_event.h" - -static __initconst const u64 amd_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ - [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ - [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ - [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ - [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. 
*/ - [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */ - [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -/* - * AMD Performance Monitor K7 and later. - */ -static const u64 amd_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ -}; - -static u64 amd_pmu_event_map(int hw_event) -{ - return amd_perfmon_event_map[hw_event]; -} - -/* - * Previously calculated offsets - */ -static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly; -static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly; - -/* - * Legacy CPUs: - * 4 counters starting at 0xc0010000 each offset by 1 - * - * CPUs with core performance counter extensions: - * 6 counters starting at 0xc0010200 each offset by 2 - */ -static inline int amd_pmu_addr_offset(int index, bool eventsel) -{ - int offset; - - if (!index) - return index; - - if (eventsel) - offset = event_offsets[index]; - else - offset = count_offsets[index]; - - if (offset) - return offset; - - if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) - offset = index; - else - offset = index << 1; - - if (eventsel) - event_offsets[index] = offset; - else - count_offsets[index] = offset; - - return offset; -} - -static int amd_core_hw_config(struct perf_event *event) -{ - if (event->attr.exclude_host && event->attr.exclude_guest) - /* - * When HO == GO == 1 the hardware treats that as GO == HO == 0 - * and will count in both modes. We don't want to count in that - * case so we emulate no-counting by setting US = OS = 0. - */ - event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR | - ARCH_PERFMON_EVENTSEL_OS); - else if (event->attr.exclude_host) - event->hw.config |= AMD64_EVENTSEL_GUESTONLY; - else if (event->attr.exclude_guest) - event->hw.config |= AMD64_EVENTSEL_HOSTONLY; - - return 0; -} - -/* - * AMD64 events are detected based on their event codes. 
- */ -static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc) -{ - return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff); -} - -static inline int amd_is_nb_event(struct hw_perf_event *hwc) -{ - return (hwc->config & 0xe0) == 0xe0; -} - -static inline int amd_has_nb(struct cpu_hw_events *cpuc) -{ - struct amd_nb *nb = cpuc->amd_nb; - - return nb && nb->nb_id != -1; -} - -static int amd_pmu_hw_config(struct perf_event *event) -{ - int ret; - - /* pass precise event sampling to ibs: */ - if (event->attr.precise_ip && get_ibs_caps()) - return -ENOENT; - - if (has_branch_stack(event)) - return -EOPNOTSUPP; - - ret = x86_pmu_hw_config(event); - if (ret) - return ret; - - if (event->attr.type == PERF_TYPE_RAW) - event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK; - - return amd_core_hw_config(event); -} - -static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - struct amd_nb *nb = cpuc->amd_nb; - int i; - - /* - * need to scan whole list because event may not have - * been assigned during scheduling - * - * no race condition possible because event can only - * be removed on one CPU at a time AND PMU is disabled - * when we come here - */ - for (i = 0; i < x86_pmu.num_counters; i++) { - if (cmpxchg(nb->owners + i, event, NULL) == event) - break; - } -} - - /* - * AMD64 NorthBridge events need special treatment because - * counter access needs to be synchronized across all cores - * of a package. Refer to BKDG section 3.12 - * - * NB events are events measuring L3 cache, Hypertransport - * traffic. They are identified by an event code >= 0xe00. - * They measure events on the NorthBride which is shared - * by all cores on a package. NB events are counted on a - * shared set of counters. When a NB event is programmed - * in a counter, the data actually comes from a shared - * counter. Thus, access to those counters needs to be - * synchronized. - * - * We implement the synchronization such that no two cores - * can be measuring NB events using the same counters. Thus, - * we maintain a per-NB allocation table. The available slot - * is propagated using the event_constraint structure. - * - * We provide only one choice for each NB event based on - * the fact that only NB events have restrictions. Consequently, - * if a counter is available, there is a guarantee the NB event - * will be assigned to it. If no slot is available, an empty - * constraint is returned and scheduling will eventually fail - * for this event. - * - * Note that all cores attached the same NB compete for the same - * counters to host NB events, this is why we use atomic ops. Some - * multi-chip CPUs may have more than one NB. - * - * Given that resources are allocated (cmpxchg), they must be - * eventually freed for others to use. This is accomplished by - * calling __amd_put_nb_event_constraints() - * - * Non NB events are not impacted by this restriction. 
- */ -static struct event_constraint * -__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, - struct event_constraint *c) -{ - struct hw_perf_event *hwc = &event->hw; - struct amd_nb *nb = cpuc->amd_nb; - struct perf_event *old; - int idx, new = -1; - - if (!c) - c = &unconstrained; - - if (cpuc->is_fake) - return c; - - /* - * detect if already present, if so reuse - * - * cannot merge with actual allocation - * because of possible holes - * - * event can already be present yet not assigned (in hwc->idx) - * because of successive calls to x86_schedule_events() from - * hw_perf_group_sched_in() without hw_perf_enable() - */ - for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) { - if (new == -1 || hwc->idx == idx) - /* assign free slot, prefer hwc->idx */ - old = cmpxchg(nb->owners + idx, NULL, event); - else if (nb->owners[idx] == event) - /* event already present */ - old = event; - else - continue; - - if (old && old != event) - continue; - - /* reassign to this slot */ - if (new != -1) - cmpxchg(nb->owners + new, event, NULL); - new = idx; - - /* already present, reuse */ - if (old == event) - break; - } - - if (new == -1) - return &emptyconstraint; - - return &nb->event_constraints[new]; -} - -static struct amd_nb *amd_alloc_nb(int cpu) -{ - struct amd_nb *nb; - int i; - - nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu)); - if (!nb) - return NULL; - - nb->nb_id = -1; - - /* - * initialize all possible NB constraints - */ - for (i = 0; i < x86_pmu.num_counters; i++) { - __set_bit(i, nb->event_constraints[i].idxmsk); - nb->event_constraints[i].weight = 1; - } - return nb; -} - -static int amd_pmu_cpu_prepare(int cpu) -{ - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - - WARN_ON_ONCE(cpuc->amd_nb); - - if (boot_cpu_data.x86_max_cores < 2) - return NOTIFY_OK; - - cpuc->amd_nb = amd_alloc_nb(cpu); - if (!cpuc->amd_nb) - return NOTIFY_BAD; - - return NOTIFY_OK; -} - -static void amd_pmu_cpu_starting(int cpu) -{ - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED]; - struct amd_nb *nb; - int i, nb_id; - - cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; - - if (boot_cpu_data.x86_max_cores < 2) - return; - - nb_id = amd_get_nb_id(cpu); - WARN_ON_ONCE(nb_id == BAD_APICID); - - for_each_online_cpu(i) { - nb = per_cpu(cpu_hw_events, i).amd_nb; - if (WARN_ON_ONCE(!nb)) - continue; - - if (nb->nb_id == nb_id) { - *onln = cpuc->amd_nb; - cpuc->amd_nb = nb; - break; - } - } - - cpuc->amd_nb->nb_id = nb_id; - cpuc->amd_nb->refcnt++; -} - -static void amd_pmu_cpu_dead(int cpu) -{ - struct cpu_hw_events *cpuhw; - - if (boot_cpu_data.x86_max_cores < 2) - return; - - cpuhw = &per_cpu(cpu_hw_events, cpu); - - if (cpuhw->amd_nb) { - struct amd_nb *nb = cpuhw->amd_nb; - - if (nb->nb_id == -1 || --nb->refcnt == 0) - kfree(nb); - - cpuhw->amd_nb = NULL; - } -} - -static struct event_constraint * -amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event) -{ - /* - * if not NB event or no NB, then no constraints - */ - if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))) - return &unconstrained; - - return __amd_get_nb_event_constraints(cpuc, event, NULL); -} - -static void amd_put_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)) - __amd_put_nb_event_constraints(cpuc, event); -} - -PMU_FORMAT_ATTR(event, "config:0-7,32-35"); -PMU_FORMAT_ATTR(umask, 
"config:8-15" ); -PMU_FORMAT_ATTR(edge, "config:18" ); -PMU_FORMAT_ATTR(inv, "config:23" ); -PMU_FORMAT_ATTR(cmask, "config:24-31" ); - -static struct attribute *amd_format_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_cmask.attr, - NULL, -}; - -/* AMD Family 15h */ - -#define AMD_EVENT_TYPE_MASK 0x000000F0ULL - -#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL -#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL -#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL -#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL -#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL -#define AMD_EVENT_EX_LS 0x000000C0ULL -#define AMD_EVENT_DE 0x000000D0ULL -#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL - -/* - * AMD family 15h event code/PMC mappings: - * - * type = event_code & 0x0F0: - * - * 0x000 FP PERF_CTL[5:3] - * 0x010 FP PERF_CTL[5:3] - * 0x020 LS PERF_CTL[5:0] - * 0x030 LS PERF_CTL[5:0] - * 0x040 DC PERF_CTL[5:0] - * 0x050 DC PERF_CTL[5:0] - * 0x060 CU PERF_CTL[2:0] - * 0x070 CU PERF_CTL[2:0] - * 0x080 IC/DE PERF_CTL[2:0] - * 0x090 IC/DE PERF_CTL[2:0] - * 0x0A0 --- - * 0x0B0 --- - * 0x0C0 EX/LS PERF_CTL[5:0] - * 0x0D0 DE PERF_CTL[2:0] - * 0x0E0 NB NB_PERF_CTL[3:0] - * 0x0F0 NB NB_PERF_CTL[3:0] - * - * Exceptions: - * - * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*) - * 0x003 FP PERF_CTL[3] - * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*) - * 0x00B FP PERF_CTL[3] - * 0x00D FP PERF_CTL[3] - * 0x023 DE PERF_CTL[2:0] - * 0x02D LS PERF_CTL[3] - * 0x02E LS PERF_CTL[3,0] - * 0x031 LS PERF_CTL[2:0] (**) - * 0x043 CU PERF_CTL[2:0] - * 0x045 CU PERF_CTL[2:0] - * 0x046 CU PERF_CTL[2:0] - * 0x054 CU PERF_CTL[2:0] - * 0x055 CU PERF_CTL[2:0] - * 0x08F IC PERF_CTL[0] - * 0x187 DE PERF_CTL[0] - * 0x188 DE PERF_CTL[0] - * 0x0DB EX PERF_CTL[5:0] - * 0x0DC LS PERF_CTL[5:0] - * 0x0DD LS PERF_CTL[5:0] - * 0x0DE LS PERF_CTL[5:0] - * 0x0DF LS PERF_CTL[5:0] - * 0x1C0 EX PERF_CTL[5:3] - * 0x1D6 EX PERF_CTL[5:0] - * 0x1D8 EX PERF_CTL[5:0] - * - * (*) depending on the umask all FPU counters may be used - * (**) only one unitmask enabled at a time - */ - -static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); -static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); -static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); -static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); -static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); -static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); - -static struct event_constraint * -amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - unsigned int event_code = amd_get_event_code(hwc); - - switch (event_code & AMD_EVENT_TYPE_MASK) { - case AMD_EVENT_FP: - switch (event_code) { - case 0x000: - if (!(hwc->config & 0x0000F000ULL)) - break; - if (!(hwc->config & 0x00000F00ULL)) - break; - return &amd_f15_PMC3; - case 0x004: - if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) - break; - return &amd_f15_PMC3; - case 0x003: - case 0x00B: - case 0x00D: - return &amd_f15_PMC3; - } - return &amd_f15_PMC53; - case AMD_EVENT_LS: - case AMD_EVENT_DC: - case AMD_EVENT_EX_LS: - switch (event_code) { - case 0x023: - case 0x043: - case 0x045: - case 0x046: - case 0x054: - case 0x055: - return &amd_f15_PMC20; - case 0x02D: - return &amd_f15_PMC3; - case 0x02E: - return 
&amd_f15_PMC30; - case 0x031: - if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) - return &amd_f15_PMC20; - return &emptyconstraint; - case 0x1C0: - return &amd_f15_PMC53; - default: - return &amd_f15_PMC50; - } - case AMD_EVENT_CU: - case AMD_EVENT_IC_DE: - case AMD_EVENT_DE: - switch (event_code) { - case 0x08F: - case 0x187: - case 0x188: - return &amd_f15_PMC0; - case 0x0DB ... 0x0DF: - case 0x1D6: - case 0x1D8: - return &amd_f15_PMC50; - default: - return &amd_f15_PMC20; - } - case AMD_EVENT_NB: - /* moved to perf_event_amd_uncore.c */ - return &emptyconstraint; - default: - return &emptyconstraint; - } -} - -static ssize_t amd_event_sysfs_show(char *page, u64 config) -{ - u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | - (config & AMD64_EVENTSEL_EVENT) >> 24; - - return x86_event_sysfs_show(page, config, event); -} - -static __initconst const struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, - .disable = x86_pmu_disable_event, - .hw_config = amd_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .addr_offset = amd_pmu_addr_offset, - .event_map = amd_pmu_event_map, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = AMD64_NUM_COUNTERS, - .cntval_bits = 48, - .cntval_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints, - .put_event_constraints = amd_put_event_constraints, - - .format_attrs = amd_format_attr, - .events_sysfs_show = amd_event_sysfs_show, - - .cpu_prepare = amd_pmu_cpu_prepare, - .cpu_starting = amd_pmu_cpu_starting, - .cpu_dead = amd_pmu_cpu_dead, -}; - -static int __init amd_core_pmu_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) - return 0; - - switch (boot_cpu_data.x86) { - case 0x15: - pr_cont("Fam15h "); - x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; - break; - - default: - pr_err("core perfctr but no constraints; unknown hardware!\n"); - return -ENODEV; - } - - /* - * If core performance counter extensions exists, we must use - * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also - * amd_pmu_addr_offset(). - */ - x86_pmu.eventsel = MSR_F15H_PERF_CTL; - x86_pmu.perfctr = MSR_F15H_PERF_CTR; - x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; - - pr_cont("core perfctr, "); - return 0; -} - -__init int amd_pmu_init(void) -{ - int ret; - - /* Performance-monitoring supported from K7 and later: */ - if (boot_cpu_data.x86 < 6) - return -ENODEV; - - x86_pmu = amd_pmu; - - ret = amd_core_pmu_init(); - if (ret) - return ret; - - /* Events are common for all AMDs */ - memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - return 0; -} - -void amd_pmu_enable_virt(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - cpuc->perf_ctr_virt_mask = 0; - - /* Reload all events */ - x86_pmu_disable_all(); - x86_pmu_enable_all(0); -} -EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); - -void amd_pmu_disable_virt(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - /* - * We only mask out the Host-only bit so that host-only counting works - * when SVM is disabled. If someone sets up a guest-only counter when - * SVM is disabled the Guest-only bits still gets set and the counter - * will not count anything. 
- */ - cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; - - /* Reload all events */ - x86_pmu_disable_all(); - x86_pmu_enable_all(0); -} -EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); -- cgit v0.10.2 From 218cfe4ed8885f988d67ac5f52efeff7233ae1f2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 8 Feb 2016 17:09:06 +0100 Subject: perf/x86: Move perf_event_amd_ibs.c ....... => x86/events/amd/ibs.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1454947748-28629-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index e0560b6..88f7873 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,3 +1,4 @@ obj-y += core.o obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o +obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c new file mode 100644 index 0000000..a8abd08 --- /dev/null +++ b/arch/x86/events/amd/ibs.c @@ -0,0 +1,959 @@ +/* + * Performance events - AMD IBS + * + * Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include + +#include + +#include "../../kernel/cpu/perf_event.h" + +static u32 ibs_caps; + +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) + +#include +#include + +#include + +#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) +#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT + +enum ibs_states { + IBS_ENABLED = 0, + IBS_STARTED = 1, + IBS_STOPPING = 2, + + IBS_MAX_STATES, +}; + +struct cpu_perf_ibs { + struct perf_event *event; + unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)]; +}; + +struct perf_ibs { + struct pmu pmu; + unsigned int msr; + u64 config_mask; + u64 cnt_mask; + u64 enable_mask; + u64 valid_mask; + u64 max_period; + unsigned long offset_mask[1]; + int offset_max; + struct cpu_perf_ibs __percpu *pcpu; + + struct attribute **format_attrs; + struct attribute_group format_group; + const struct attribute_group *attr_groups[2]; + + u64 (*get_count)(u64 config); +}; + +struct perf_ibs_data { + u32 size; + union { + u32 data[0]; /* data buffer starts here */ + u32 caps; + }; + u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; +}; + +static int +perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) +{ + s64 left = local64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int overflow = 0; + + /* + * If we are way outside a reasonable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + if (unlikely(left < (s64)min)) { + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + /* + * If the hw period that triggers the sw overflow is too short + * we might hit the irq handler. This biases the results. + * Thus we shorten the next-to-last period and set the last + * period to the max period. 
+ */ + if (left > max) { + left -= max; + if (left > max) + left = max; + else if (left < min) + left = min; + } + + *hw_period = (u64)left; + + return overflow; +} + +static int +perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width) +{ + struct hw_perf_event *hwc = &event->hw; + int shift = 64 - width; + u64 prev_raw_count; + u64 delta; + + /* + * Careful: an NMI might modify the previous event value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic event atomically: + */ + prev_raw_count = local64_read(&hwc->prev_count); + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + return 0; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); + + return 1; +} + +static struct perf_ibs perf_ibs_fetch; +static struct perf_ibs perf_ibs_op; + +static struct perf_ibs *get_ibs_pmu(int type) +{ + if (perf_ibs_fetch.pmu.type == type) + return &perf_ibs_fetch; + if (perf_ibs_op.pmu.type == type) + return &perf_ibs_op; + return NULL; +} + +/* + * Use IBS for precise event sampling: + * + * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count + * perf record -a -e r076:p ... # same as -e cpu-cycles:p + * perf record -a -e r0C1:p ... # use ibs op counting micro-ops + * + * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl, + * MSRC001_1033) is used to select either cycle or micro-ops counting + * mode. + * + * The rip of IBS samples has skid 0. Thus, IBS supports precise + * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the + * rip is invalid when IBS was not able to record the rip correctly. + * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then. 
+ * + */ +static int perf_ibs_precise_event(struct perf_event *event, u64 *config) +{ + switch (event->attr.precise_ip) { + case 0: + return -ENOENT; + case 1: + case 2: + break; + default: + return -EOPNOTSUPP; + } + + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + switch (event->attr.config) { + case PERF_COUNT_HW_CPU_CYCLES: + *config = 0; + return 0; + } + break; + case PERF_TYPE_RAW: + switch (event->attr.config) { + case 0x0076: + *config = 0; + return 0; + case 0x00C1: + *config = IBS_OP_CNT_CTL; + return 0; + } + break; + default: + return -ENOENT; + } + + return -EOPNOTSUPP; +} + +static const struct perf_event_attr ibs_notsupp = { + .exclude_user = 1, + .exclude_kernel = 1, + .exclude_hv = 1, + .exclude_idle = 1, + .exclude_host = 1, + .exclude_guest = 1, +}; + +static int perf_ibs_init(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs; + u64 max_cnt, config; + int ret; + + perf_ibs = get_ibs_pmu(event->attr.type); + if (perf_ibs) { + config = event->attr.config; + } else { + perf_ibs = &perf_ibs_op; + ret = perf_ibs_precise_event(event, &config); + if (ret) + return ret; + } + + if (event->pmu != &perf_ibs->pmu) + return -ENOENT; + + if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp)) + return -EINVAL; + + if (config & ~perf_ibs->config_mask) + return -EINVAL; + + if (hwc->sample_period) { + if (config & perf_ibs->cnt_mask) + /* raw max_cnt may not be set */ + return -EINVAL; + if (!event->attr.sample_freq && hwc->sample_period & 0x0f) + /* + * lower 4 bits can not be set in ibs max cnt, + * but allowing it in case we adjust the + * sample period to set a frequency. + */ + return -EINVAL; + hwc->sample_period &= ~0x0FULL; + if (!hwc->sample_period) + hwc->sample_period = 0x10; + } else { + max_cnt = config & perf_ibs->cnt_mask; + config &= ~perf_ibs->cnt_mask; + event->attr.sample_period = max_cnt << 4; + hwc->sample_period = event->attr.sample_period; + } + + if (!hwc->sample_period) + return -EINVAL; + + /* + * If we modify hwc->sample_period, we also need to update + * hwc->last_period and hwc->period_left. + */ + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + + hwc->config_base = perf_ibs->msr; + hwc->config = config; + + return 0; +} + +static int perf_ibs_set_period(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 *period) +{ + int overflow; + + /* ignore lower 4 bits in min count: */ + overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); + local64_set(&hwc->prev_count, 0); + + return overflow; +} + +static u64 get_ibs_fetch_count(u64 config) +{ + return (config & IBS_FETCH_CNT) >> 12; +} + +static u64 get_ibs_op_count(u64 config) +{ + u64 count = 0; + + if (config & IBS_OP_VAL) + count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */ + + if (ibs_caps & IBS_CAPS_RDWROPCNT) + count += (config & IBS_OP_CUR_CNT) >> 32; + + return count; +} + +static void +perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, + u64 *config) +{ + u64 count = perf_ibs->get_count(*config); + + /* + * Set width to 64 since we do not overflow on max width but + * instead on max count. In perf_ibs_set_period() we clear + * prev count manually on overflow. 
+ */ + while (!perf_event_try_update(event, count, 64)) { + rdmsrl(event->hw.config_base, *config); + count = perf_ibs->get_count(*config); + } +} + +static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 config) +{ + wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask); +} + +/* + * Erratum #420 Instruction-Based Sampling Engine May Generate + * Interrupt that Cannot Be Cleared: + * + * Must clear counter mask first, then clear the enable bit. See + * Revision Guide for AMD Family 10h Processors, Publication #41322. + */ +static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 config) +{ + config &= ~perf_ibs->cnt_mask; + wrmsrl(hwc->config_base, config); + config &= ~perf_ibs->enable_mask; + wrmsrl(hwc->config_base, config); +} + +/* + * We cannot restore the ibs pmu state, so we always needs to update + * the event while stopping it and then reset the state when starting + * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in + * perf_ibs_start()/perf_ibs_stop() and instead always do it. + */ +static void perf_ibs_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 period; + + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + hwc->state = 0; + + perf_ibs_set_period(perf_ibs, hwc, &period); + set_bit(IBS_STARTED, pcpu->state); + perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + + perf_event_update_userpage(event); +} + +static void perf_ibs_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 config; + int stopping; + + stopping = test_and_clear_bit(IBS_STARTED, pcpu->state); + + if (!stopping && (hwc->state & PERF_HES_UPTODATE)) + return; + + rdmsrl(hwc->config_base, config); + + if (stopping) { + set_bit(IBS_STOPPING, pcpu->state); + perf_ibs_disable_event(perf_ibs, hwc, config); + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + if (hwc->state & PERF_HES_UPTODATE) + return; + + /* + * Clear valid bit to not count rollovers on update, rollovers + * are only updated in the irq handler. 
+ */ + config &= ~perf_ibs->valid_mask; + + perf_ibs_event_update(perf_ibs, event, &config); + hwc->state |= PERF_HES_UPTODATE; +} + +static int perf_ibs_add(struct perf_event *event, int flags) +{ + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (test_and_set_bit(IBS_ENABLED, pcpu->state)) + return -ENOSPC; + + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + pcpu->event = event; + + if (flags & PERF_EF_START) + perf_ibs_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void perf_ibs_del(struct perf_event *event, int flags) +{ + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (!test_and_clear_bit(IBS_ENABLED, pcpu->state)) + return; + + perf_ibs_stop(event, PERF_EF_UPDATE); + + pcpu->event = NULL; + + perf_event_update_userpage(event); +} + +static void perf_ibs_read(struct perf_event *event) { } + +PMU_FORMAT_ATTR(rand_en, "config:57"); +PMU_FORMAT_ATTR(cnt_ctl, "config:19"); + +static struct attribute *ibs_fetch_format_attrs[] = { + &format_attr_rand_en.attr, + NULL, +}; + +static struct attribute *ibs_op_format_attrs[] = { + NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */ + NULL, +}; + +static struct perf_ibs perf_ibs_fetch = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, + }, + .msr = MSR_AMD64_IBSFETCHCTL, + .config_mask = IBS_FETCH_CONFIG_MASK, + .cnt_mask = IBS_FETCH_MAX_CNT, + .enable_mask = IBS_FETCH_ENABLE, + .valid_mask = IBS_FETCH_VAL, + .max_period = IBS_FETCH_MAX_CNT << 4, + .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, + .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, + .format_attrs = ibs_fetch_format_attrs, + + .get_count = get_ibs_fetch_count, +}; + +static struct perf_ibs perf_ibs_op = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, + }, + .msr = MSR_AMD64_IBSOPCTL, + .config_mask = IBS_OP_CONFIG_MASK, + .cnt_mask = IBS_OP_MAX_CNT, + .enable_mask = IBS_OP_ENABLE, + .valid_mask = IBS_OP_VAL, + .max_period = IBS_OP_MAX_CNT << 4, + .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, + .offset_max = MSR_AMD64_IBSOP_REG_COUNT, + .format_attrs = ibs_op_format_attrs, + + .get_count = get_ibs_op_count, +}; + +static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) +{ + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + struct perf_event *event = pcpu->event; + struct hw_perf_event *hwc = &event->hw; + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + struct perf_ibs_data ibs_data; + int offset, size, check_rip, offset_max, throttle = 0; + unsigned int msr; + u64 *buf, *config, period; + + if (!test_bit(IBS_STARTED, pcpu->state)) { + /* + * Catch spurious interrupts after stopping IBS: After + * disabling IBS there could be still incoming NMIs + * with samples that even have the valid bit cleared. + * Mark all this NMIs as handled. + */ + return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 
1 : 0; + } + + msr = hwc->config_base; + buf = ibs_data.regs; + rdmsrl(msr, *buf); + if (!(*buf++ & perf_ibs->valid_mask)) + return 0; + + config = &ibs_data.regs[0]; + perf_ibs_event_update(perf_ibs, event, config); + perf_sample_data_init(&data, 0, hwc->last_period); + if (!perf_ibs_set_period(perf_ibs, hwc, &period)) + goto out; /* no sw counter overflow */ + + ibs_data.caps = ibs_caps; + size = 1; + offset = 1; + check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); + if (event->attr.sample_type & PERF_SAMPLE_RAW) + offset_max = perf_ibs->offset_max; + else if (check_rip) + offset_max = 2; + else + offset_max = 1; + do { + rdmsrl(msr + offset, *buf++); + size++; + offset = find_next_bit(perf_ibs->offset_mask, + perf_ibs->offset_max, + offset + 1); + } while (offset < offset_max); + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + /* + * Read IbsBrTarget and IbsOpData4 separately + * depending on their availability. + * Can't add to offset_max as they are staggered + */ + if (ibs_caps & IBS_CAPS_BRNTRGT) { + rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); + size++; + } + if (ibs_caps & IBS_CAPS_OPDATA4) { + rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++); + size++; + } + } + ibs_data.size = sizeof(u64) * size; + + regs = *iregs; + if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { + regs.flags &= ~PERF_EFLAGS_EXACT; + } else { + set_linear_ip(®s, ibs_data.regs[1]); + regs.flags |= PERF_EFLAGS_EXACT; + } + + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.size = sizeof(u32) + ibs_data.size; + raw.data = ibs_data.data; + data.raw = &raw; + } + + throttle = perf_event_overflow(event, &data, ®s); +out: + if (throttle) + perf_ibs_disable_event(perf_ibs, hwc, *config); + else + perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + + perf_event_update_userpage(event); + + return 1; +} + +static int +perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) +{ + int handled = 0; + + handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs); + handled += perf_ibs_handle_irq(&perf_ibs_op, regs); + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} +NOKPROBE_SYMBOL(perf_ibs_nmi_handler); + +static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) +{ + struct cpu_perf_ibs __percpu *pcpu; + int ret; + + pcpu = alloc_percpu(struct cpu_perf_ibs); + if (!pcpu) + return -ENOMEM; + + perf_ibs->pcpu = pcpu; + + /* register attributes */ + if (perf_ibs->format_attrs[0]) { + memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group)); + perf_ibs->format_group.name = "format"; + perf_ibs->format_group.attrs = perf_ibs->format_attrs; + + memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups)); + perf_ibs->attr_groups[0] = &perf_ibs->format_group; + perf_ibs->pmu.attr_groups = perf_ibs->attr_groups; + } + + ret = perf_pmu_register(&perf_ibs->pmu, name, -1); + if (ret) { + perf_ibs->pcpu = NULL; + free_percpu(pcpu); + } + + return ret; +} + +static __init int perf_event_ibs_init(void) +{ + struct attribute **attr = ibs_op_format_attrs; + + if (!ibs_caps) + return -ENODEV; /* ibs not supported by the cpu */ + + perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + + if (ibs_caps & IBS_CAPS_OPCNT) { + perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; + *attr++ = &format_attr_cnt_ctl.attr; + } + perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + + register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); + pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps); + + return 0; +} + +#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) 
*/ + +static __init int perf_event_ibs_init(void) { return 0; } + +#endif + +/* IBS - apic initialization, for perf and oprofile */ + +static __init u32 __get_ibs_caps(void) +{ + u32 caps; + unsigned int max_level; + + if (!boot_cpu_has(X86_FEATURE_IBS)) + return 0; + + /* check IBS cpuid feature flags */ + max_level = cpuid_eax(0x80000000); + if (max_level < IBS_CPUID_FEATURES) + return IBS_CAPS_DEFAULT; + + caps = cpuid_eax(IBS_CPUID_FEATURES); + if (!(caps & IBS_CAPS_AVAIL)) + /* cpuid flags not valid */ + return IBS_CAPS_DEFAULT; + + return caps; +} + +u32 get_ibs_caps(void) +{ + return ibs_caps; +} + +EXPORT_SYMBOL(get_ibs_caps); + +static inline int get_eilvt(int offset) +{ + return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1); +} + +static inline int put_eilvt(int offset) +{ + return !setup_APIC_eilvt(offset, 0, 0, 1); +} + +/* + * Check and reserve APIC extended interrupt LVT offset for IBS if available. + */ +static inline int ibs_eilvt_valid(void) +{ + int offset; + u64 val; + int valid = 0; + + preempt_disable(); + + rdmsrl(MSR_AMD64_IBSCTL, val); + offset = val & IBSCTL_LVT_OFFSET_MASK; + + if (!(val & IBSCTL_LVT_OFFSET_VALID)) { + pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); + goto out; + } + + if (!get_eilvt(offset)) { + pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); + goto out; + } + + valid = 1; +out: + preempt_enable(); + + return valid; +} + +static int setup_ibs_ctl(int ibs_eilvt_off) +{ + struct pci_dev *cpu_cfg; + int nodes; + u32 value = 0; + + nodes = 0; + cpu_cfg = NULL; + do { + cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, + PCI_DEVICE_ID_AMD_10H_NB_MISC, + cpu_cfg); + if (!cpu_cfg) + break; + ++nodes; + pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off + | IBSCTL_LVT_OFFSET_VALID); + pci_read_config_dword(cpu_cfg, IBSCTL, &value); + if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { + pci_dev_put(cpu_cfg); + pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n", + value); + return -EINVAL; + } + } while (1); + + if (!nodes) { + pr_debug("No CPU node configured for IBS\n"); + return -ENODEV; + } + + return 0; +} + +/* + * This runs only on the current cpu. We try to find an LVT offset and + * setup the local APIC. For this we must disable preemption. On + * success we initialize all nodes with this offset. This updates then + * the offset in the IBS_CTL per-node msr. The per-core APIC setup of + * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that + * is using the new offset. + */ +static void force_ibs_eilvt_setup(void) +{ + int offset; + int ret; + + preempt_disable(); + /* find the next free available EILVT entry, skip offset 0 */ + for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) { + if (get_eilvt(offset)) + break; + } + preempt_enable(); + + if (offset == APIC_EILVT_NR_MAX) { + pr_debug("No EILVT entry available\n"); + return; + } + + ret = setup_ibs_ctl(offset); + if (ret) + goto out; + + if (!ibs_eilvt_valid()) + goto out; + + pr_info("IBS: LVT offset %d assigned\n", offset); + + return; +out: + preempt_disable(); + put_eilvt(offset); + preempt_enable(); + return; +} + +static void ibs_eilvt_setup(void) +{ + /* + * Force LVT offset assignment for family 10h: The offsets are + * not assigned by the BIOS for this family, so the OS is + * responsible for doing it. 
If the OS assignment fails, fall + * back to BIOS settings and try to setup this. + */ + if (boot_cpu_data.x86 == 0x10) + force_ibs_eilvt_setup(); +} + +static inline int get_ibs_lvt_offset(void) +{ + u64 val; + + rdmsrl(MSR_AMD64_IBSCTL, val); + if (!(val & IBSCTL_LVT_OFFSET_VALID)) + return -EINVAL; + + return val & IBSCTL_LVT_OFFSET_MASK; +} + +static void setup_APIC_ibs(void *dummy) +{ + int offset; + + offset = get_ibs_lvt_offset(); + if (offset < 0) + goto failed; + + if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0)) + return; +failed: + pr_warn("perf: IBS APIC setup failed on cpu #%d\n", + smp_processor_id()); +} + +static void clear_APIC_ibs(void *dummy) +{ + int offset; + + offset = get_ibs_lvt_offset(); + if (offset >= 0) + setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); +} + +#ifdef CONFIG_PM + +static int perf_ibs_suspend(void) +{ + clear_APIC_ibs(NULL); + return 0; +} + +static void perf_ibs_resume(void) +{ + ibs_eilvt_setup(); + setup_APIC_ibs(NULL); +} + +static struct syscore_ops perf_ibs_syscore_ops = { + .resume = perf_ibs_resume, + .suspend = perf_ibs_suspend, +}; + +static void perf_ibs_pm_init(void) +{ + register_syscore_ops(&perf_ibs_syscore_ops); +} + +#else + +static inline void perf_ibs_pm_init(void) { } + +#endif + +static int +perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_STARTING: + setup_APIC_ibs(NULL); + break; + case CPU_DYING: + clear_APIC_ibs(NULL); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static __init int amd_ibs_init(void) +{ + u32 caps; + int ret = -EINVAL; + + caps = __get_ibs_caps(); + if (!caps) + return -ENODEV; /* ibs not supported by the cpu */ + + ibs_eilvt_setup(); + + if (!ibs_eilvt_valid()) + goto out; + + perf_ibs_pm_init(); + cpu_notifier_register_begin(); + ibs_caps = caps; + /* make ibs_caps visible to other cpus: */ + smp_mb(); + smp_call_function(setup_APIC_ibs, NULL, 1); + __perf_cpu_notifier(perf_ibs_cpu_notifier); + cpu_notifier_register_done(); + + ret = perf_event_ibs_init(); +out: + if (ret) + pr_err("Failed to setup IBS, %d\n", ret); + return ret; +} + +/* Since we need the pci subsystem to init ibs we can't do this earlier: */ +device_initcall(amd_ibs_init); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index d549b02..dddba22 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -54,7 +54,7 @@ obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_MICROCODE) += microcode/ -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c deleted file mode 100644 index aa12f95..0000000 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ /dev/null @@ -1,959 +0,0 @@ -/* - * Performance events - AMD IBS - * - * Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter - * - * For licencing details see kernel-base/COPYING - */ - -#include -#include -#include -#include -#include - -#include - -#include "perf_event.h" - -static u32 ibs_caps; - -#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) - -#include -#include - -#include - -#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) -#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT - -enum ibs_states { - IBS_ENABLED = 0, - IBS_STARTED = 1, - 
IBS_STOPPING = 2, - - IBS_MAX_STATES, -}; - -struct cpu_perf_ibs { - struct perf_event *event; - unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)]; -}; - -struct perf_ibs { - struct pmu pmu; - unsigned int msr; - u64 config_mask; - u64 cnt_mask; - u64 enable_mask; - u64 valid_mask; - u64 max_period; - unsigned long offset_mask[1]; - int offset_max; - struct cpu_perf_ibs __percpu *pcpu; - - struct attribute **format_attrs; - struct attribute_group format_group; - const struct attribute_group *attr_groups[2]; - - u64 (*get_count)(u64 config); -}; - -struct perf_ibs_data { - u32 size; - union { - u32 data[0]; /* data buffer starts here */ - u32 caps; - }; - u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; -}; - -static int -perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) -{ - s64 left = local64_read(&hwc->period_left); - s64 period = hwc->sample_period; - int overflow = 0; - - /* - * If we are way outside a reasonable range then just skip forward: - */ - if (unlikely(left <= -period)) { - left = period; - local64_set(&hwc->period_left, left); - hwc->last_period = period; - overflow = 1; - } - - if (unlikely(left < (s64)min)) { - left += period; - local64_set(&hwc->period_left, left); - hwc->last_period = period; - overflow = 1; - } - - /* - * If the hw period that triggers the sw overflow is too short - * we might hit the irq handler. This biases the results. - * Thus we shorten the next-to-last period and set the last - * period to the max period. - */ - if (left > max) { - left -= max; - if (left > max) - left = max; - else if (left < min) - left = min; - } - - *hw_period = (u64)left; - - return overflow; -} - -static int -perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width) -{ - struct hw_perf_event *hwc = &event->hw; - int shift = 64 - width; - u64 prev_raw_count; - u64 delta; - - /* - * Careful: an NMI might modify the previous event value. - * - * Our tactic to handle this is to first atomically read and - * exchange a new raw count - then add that new-prev delta - * count to the generic event atomically: - */ - prev_raw_count = local64_read(&hwc->prev_count); - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) - return 0; - - /* - * Now we have the new raw value and have updated the prev - * timestamp already. We can now calculate the elapsed delta - * (event-)time and add that to the generic event. - * - * Careful, not all hw sign-extends above the physical width - * of the count. - */ - delta = (new_raw_count << shift) - (prev_raw_count << shift); - delta >>= shift; - - local64_add(delta, &event->count); - local64_sub(delta, &hwc->period_left); - - return 1; -} - -static struct perf_ibs perf_ibs_fetch; -static struct perf_ibs perf_ibs_op; - -static struct perf_ibs *get_ibs_pmu(int type) -{ - if (perf_ibs_fetch.pmu.type == type) - return &perf_ibs_fetch; - if (perf_ibs_op.pmu.type == type) - return &perf_ibs_op; - return NULL; -} - -/* - * Use IBS for precise event sampling: - * - * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count - * perf record -a -e r076:p ... # same as -e cpu-cycles:p - * perf record -a -e r0C1:p ... # use ibs op counting micro-ops - * - * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl, - * MSRC001_1033) is used to select either cycle or micro-ops counting - * mode. - * - * The rip of IBS samples has skid 0. Thus, IBS supports precise - * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. 
In rare cases the - * rip is invalid when IBS was not able to record the rip correctly. - * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then. - * - */ -static int perf_ibs_precise_event(struct perf_event *event, u64 *config) -{ - switch (event->attr.precise_ip) { - case 0: - return -ENOENT; - case 1: - case 2: - break; - default: - return -EOPNOTSUPP; - } - - switch (event->attr.type) { - case PERF_TYPE_HARDWARE: - switch (event->attr.config) { - case PERF_COUNT_HW_CPU_CYCLES: - *config = 0; - return 0; - } - break; - case PERF_TYPE_RAW: - switch (event->attr.config) { - case 0x0076: - *config = 0; - return 0; - case 0x00C1: - *config = IBS_OP_CNT_CTL; - return 0; - } - break; - default: - return -ENOENT; - } - - return -EOPNOTSUPP; -} - -static const struct perf_event_attr ibs_notsupp = { - .exclude_user = 1, - .exclude_kernel = 1, - .exclude_hv = 1, - .exclude_idle = 1, - .exclude_host = 1, - .exclude_guest = 1, -}; - -static int perf_ibs_init(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct perf_ibs *perf_ibs; - u64 max_cnt, config; - int ret; - - perf_ibs = get_ibs_pmu(event->attr.type); - if (perf_ibs) { - config = event->attr.config; - } else { - perf_ibs = &perf_ibs_op; - ret = perf_ibs_precise_event(event, &config); - if (ret) - return ret; - } - - if (event->pmu != &perf_ibs->pmu) - return -ENOENT; - - if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp)) - return -EINVAL; - - if (config & ~perf_ibs->config_mask) - return -EINVAL; - - if (hwc->sample_period) { - if (config & perf_ibs->cnt_mask) - /* raw max_cnt may not be set */ - return -EINVAL; - if (!event->attr.sample_freq && hwc->sample_period & 0x0f) - /* - * lower 4 bits can not be set in ibs max cnt, - * but allowing it in case we adjust the - * sample period to set a frequency. - */ - return -EINVAL; - hwc->sample_period &= ~0x0FULL; - if (!hwc->sample_period) - hwc->sample_period = 0x10; - } else { - max_cnt = config & perf_ibs->cnt_mask; - config &= ~perf_ibs->cnt_mask; - event->attr.sample_period = max_cnt << 4; - hwc->sample_period = event->attr.sample_period; - } - - if (!hwc->sample_period) - return -EINVAL; - - /* - * If we modify hwc->sample_period, we also need to update - * hwc->last_period and hwc->period_left. - */ - hwc->last_period = hwc->sample_period; - local64_set(&hwc->period_left, hwc->sample_period); - - hwc->config_base = perf_ibs->msr; - hwc->config = config; - - return 0; -} - -static int perf_ibs_set_period(struct perf_ibs *perf_ibs, - struct hw_perf_event *hwc, u64 *period) -{ - int overflow; - - /* ignore lower 4 bits in min count: */ - overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); - local64_set(&hwc->prev_count, 0); - - return overflow; -} - -static u64 get_ibs_fetch_count(u64 config) -{ - return (config & IBS_FETCH_CNT) >> 12; -} - -static u64 get_ibs_op_count(u64 config) -{ - u64 count = 0; - - if (config & IBS_OP_VAL) - count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */ - - if (ibs_caps & IBS_CAPS_RDWROPCNT) - count += (config & IBS_OP_CUR_CNT) >> 32; - - return count; -} - -static void -perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, - u64 *config) -{ - u64 count = perf_ibs->get_count(*config); - - /* - * Set width to 64 since we do not overflow on max width but - * instead on max count. In perf_ibs_set_period() we clear - * prev count manually on overflow. 
- */ - while (!perf_event_try_update(event, count, 64)) { - rdmsrl(event->hw.config_base, *config); - count = perf_ibs->get_count(*config); - } -} - -static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs, - struct hw_perf_event *hwc, u64 config) -{ - wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask); -} - -/* - * Erratum #420 Instruction-Based Sampling Engine May Generate - * Interrupt that Cannot Be Cleared: - * - * Must clear counter mask first, then clear the enable bit. See - * Revision Guide for AMD Family 10h Processors, Publication #41322. - */ -static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, - struct hw_perf_event *hwc, u64 config) -{ - config &= ~perf_ibs->cnt_mask; - wrmsrl(hwc->config_base, config); - config &= ~perf_ibs->enable_mask; - wrmsrl(hwc->config_base, config); -} - -/* - * We cannot restore the ibs pmu state, so we always needs to update - * the event while stopping it and then reset the state when starting - * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in - * perf_ibs_start()/perf_ibs_stop() and instead always do it. - */ -static void perf_ibs_start(struct perf_event *event, int flags) -{ - struct hw_perf_event *hwc = &event->hw; - struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); - struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - u64 period; - - if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) - return; - - WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); - hwc->state = 0; - - perf_ibs_set_period(perf_ibs, hwc, &period); - set_bit(IBS_STARTED, pcpu->state); - perf_ibs_enable_event(perf_ibs, hwc, period >> 4); - - perf_event_update_userpage(event); -} - -static void perf_ibs_stop(struct perf_event *event, int flags) -{ - struct hw_perf_event *hwc = &event->hw; - struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); - struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - u64 config; - int stopping; - - stopping = test_and_clear_bit(IBS_STARTED, pcpu->state); - - if (!stopping && (hwc->state & PERF_HES_UPTODATE)) - return; - - rdmsrl(hwc->config_base, config); - - if (stopping) { - set_bit(IBS_STOPPING, pcpu->state); - perf_ibs_disable_event(perf_ibs, hwc, config); - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - } - - if (hwc->state & PERF_HES_UPTODATE) - return; - - /* - * Clear valid bit to not count rollovers on update, rollovers - * are only updated in the irq handler. 
- */ - config &= ~perf_ibs->valid_mask; - - perf_ibs_event_update(perf_ibs, event, &config); - hwc->state |= PERF_HES_UPTODATE; -} - -static int perf_ibs_add(struct perf_event *event, int flags) -{ - struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); - struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - - if (test_and_set_bit(IBS_ENABLED, pcpu->state)) - return -ENOSPC; - - event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - - pcpu->event = event; - - if (flags & PERF_EF_START) - perf_ibs_start(event, PERF_EF_RELOAD); - - return 0; -} - -static void perf_ibs_del(struct perf_event *event, int flags) -{ - struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); - struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - - if (!test_and_clear_bit(IBS_ENABLED, pcpu->state)) - return; - - perf_ibs_stop(event, PERF_EF_UPDATE); - - pcpu->event = NULL; - - perf_event_update_userpage(event); -} - -static void perf_ibs_read(struct perf_event *event) { } - -PMU_FORMAT_ATTR(rand_en, "config:57"); -PMU_FORMAT_ATTR(cnt_ctl, "config:19"); - -static struct attribute *ibs_fetch_format_attrs[] = { - &format_attr_rand_en.attr, - NULL, -}; - -static struct attribute *ibs_op_format_attrs[] = { - NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */ - NULL, -}; - -static struct perf_ibs perf_ibs_fetch = { - .pmu = { - .task_ctx_nr = perf_invalid_context, - - .event_init = perf_ibs_init, - .add = perf_ibs_add, - .del = perf_ibs_del, - .start = perf_ibs_start, - .stop = perf_ibs_stop, - .read = perf_ibs_read, - }, - .msr = MSR_AMD64_IBSFETCHCTL, - .config_mask = IBS_FETCH_CONFIG_MASK, - .cnt_mask = IBS_FETCH_MAX_CNT, - .enable_mask = IBS_FETCH_ENABLE, - .valid_mask = IBS_FETCH_VAL, - .max_period = IBS_FETCH_MAX_CNT << 4, - .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, - .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, - .format_attrs = ibs_fetch_format_attrs, - - .get_count = get_ibs_fetch_count, -}; - -static struct perf_ibs perf_ibs_op = { - .pmu = { - .task_ctx_nr = perf_invalid_context, - - .event_init = perf_ibs_init, - .add = perf_ibs_add, - .del = perf_ibs_del, - .start = perf_ibs_start, - .stop = perf_ibs_stop, - .read = perf_ibs_read, - }, - .msr = MSR_AMD64_IBSOPCTL, - .config_mask = IBS_OP_CONFIG_MASK, - .cnt_mask = IBS_OP_MAX_CNT, - .enable_mask = IBS_OP_ENABLE, - .valid_mask = IBS_OP_VAL, - .max_period = IBS_OP_MAX_CNT << 4, - .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, - .offset_max = MSR_AMD64_IBSOP_REG_COUNT, - .format_attrs = ibs_op_format_attrs, - - .get_count = get_ibs_op_count, -}; - -static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) -{ - struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - struct perf_event *event = pcpu->event; - struct hw_perf_event *hwc = &event->hw; - struct perf_sample_data data; - struct perf_raw_record raw; - struct pt_regs regs; - struct perf_ibs_data ibs_data; - int offset, size, check_rip, offset_max, throttle = 0; - unsigned int msr; - u64 *buf, *config, period; - - if (!test_bit(IBS_STARTED, pcpu->state)) { - /* - * Catch spurious interrupts after stopping IBS: After - * disabling IBS there could be still incoming NMIs - * with samples that even have the valid bit cleared. - * Mark all this NMIs as handled. - */ - return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 
1 : 0; - } - - msr = hwc->config_base; - buf = ibs_data.regs; - rdmsrl(msr, *buf); - if (!(*buf++ & perf_ibs->valid_mask)) - return 0; - - config = &ibs_data.regs[0]; - perf_ibs_event_update(perf_ibs, event, config); - perf_sample_data_init(&data, 0, hwc->last_period); - if (!perf_ibs_set_period(perf_ibs, hwc, &period)) - goto out; /* no sw counter overflow */ - - ibs_data.caps = ibs_caps; - size = 1; - offset = 1; - check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); - if (event->attr.sample_type & PERF_SAMPLE_RAW) - offset_max = perf_ibs->offset_max; - else if (check_rip) - offset_max = 2; - else - offset_max = 1; - do { - rdmsrl(msr + offset, *buf++); - size++; - offset = find_next_bit(perf_ibs->offset_mask, - perf_ibs->offset_max, - offset + 1); - } while (offset < offset_max); - if (event->attr.sample_type & PERF_SAMPLE_RAW) { - /* - * Read IbsBrTarget and IbsOpData4 separately - * depending on their availability. - * Can't add to offset_max as they are staggered - */ - if (ibs_caps & IBS_CAPS_BRNTRGT) { - rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); - size++; - } - if (ibs_caps & IBS_CAPS_OPDATA4) { - rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++); - size++; - } - } - ibs_data.size = sizeof(u64) * size; - - regs = *iregs; - if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { - regs.flags &= ~PERF_EFLAGS_EXACT; - } else { - set_linear_ip(®s, ibs_data.regs[1]); - regs.flags |= PERF_EFLAGS_EXACT; - } - - if (event->attr.sample_type & PERF_SAMPLE_RAW) { - raw.size = sizeof(u32) + ibs_data.size; - raw.data = ibs_data.data; - data.raw = &raw; - } - - throttle = perf_event_overflow(event, &data, ®s); -out: - if (throttle) - perf_ibs_disable_event(perf_ibs, hwc, *config); - else - perf_ibs_enable_event(perf_ibs, hwc, period >> 4); - - perf_event_update_userpage(event); - - return 1; -} - -static int -perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) -{ - int handled = 0; - - handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs); - handled += perf_ibs_handle_irq(&perf_ibs_op, regs); - - if (handled) - inc_irq_stat(apic_perf_irqs); - - return handled; -} -NOKPROBE_SYMBOL(perf_ibs_nmi_handler); - -static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) -{ - struct cpu_perf_ibs __percpu *pcpu; - int ret; - - pcpu = alloc_percpu(struct cpu_perf_ibs); - if (!pcpu) - return -ENOMEM; - - perf_ibs->pcpu = pcpu; - - /* register attributes */ - if (perf_ibs->format_attrs[0]) { - memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group)); - perf_ibs->format_group.name = "format"; - perf_ibs->format_group.attrs = perf_ibs->format_attrs; - - memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups)); - perf_ibs->attr_groups[0] = &perf_ibs->format_group; - perf_ibs->pmu.attr_groups = perf_ibs->attr_groups; - } - - ret = perf_pmu_register(&perf_ibs->pmu, name, -1); - if (ret) { - perf_ibs->pcpu = NULL; - free_percpu(pcpu); - } - - return ret; -} - -static __init int perf_event_ibs_init(void) -{ - struct attribute **attr = ibs_op_format_attrs; - - if (!ibs_caps) - return -ENODEV; /* ibs not supported by the cpu */ - - perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); - - if (ibs_caps & IBS_CAPS_OPCNT) { - perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; - *attr++ = &format_attr_cnt_ctl.attr; - } - perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); - - register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); - pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps); - - return 0; -} - -#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) 
*/ - -static __init int perf_event_ibs_init(void) { return 0; } - -#endif - -/* IBS - apic initialization, for perf and oprofile */ - -static __init u32 __get_ibs_caps(void) -{ - u32 caps; - unsigned int max_level; - - if (!boot_cpu_has(X86_FEATURE_IBS)) - return 0; - - /* check IBS cpuid feature flags */ - max_level = cpuid_eax(0x80000000); - if (max_level < IBS_CPUID_FEATURES) - return IBS_CAPS_DEFAULT; - - caps = cpuid_eax(IBS_CPUID_FEATURES); - if (!(caps & IBS_CAPS_AVAIL)) - /* cpuid flags not valid */ - return IBS_CAPS_DEFAULT; - - return caps; -} - -u32 get_ibs_caps(void) -{ - return ibs_caps; -} - -EXPORT_SYMBOL(get_ibs_caps); - -static inline int get_eilvt(int offset) -{ - return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1); -} - -static inline int put_eilvt(int offset) -{ - return !setup_APIC_eilvt(offset, 0, 0, 1); -} - -/* - * Check and reserve APIC extended interrupt LVT offset for IBS if available. - */ -static inline int ibs_eilvt_valid(void) -{ - int offset; - u64 val; - int valid = 0; - - preempt_disable(); - - rdmsrl(MSR_AMD64_IBSCTL, val); - offset = val & IBSCTL_LVT_OFFSET_MASK; - - if (!(val & IBSCTL_LVT_OFFSET_VALID)) { - pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n", - smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); - goto out; - } - - if (!get_eilvt(offset)) { - pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n", - smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); - goto out; - } - - valid = 1; -out: - preempt_enable(); - - return valid; -} - -static int setup_ibs_ctl(int ibs_eilvt_off) -{ - struct pci_dev *cpu_cfg; - int nodes; - u32 value = 0; - - nodes = 0; - cpu_cfg = NULL; - do { - cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, - PCI_DEVICE_ID_AMD_10H_NB_MISC, - cpu_cfg); - if (!cpu_cfg) - break; - ++nodes; - pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off - | IBSCTL_LVT_OFFSET_VALID); - pci_read_config_dword(cpu_cfg, IBSCTL, &value); - if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { - pci_dev_put(cpu_cfg); - pr_debug("Failed to setup IBS LVT offset, IBSCTL = 0x%08x\n", - value); - return -EINVAL; - } - } while (1); - - if (!nodes) { - pr_debug("No CPU node configured for IBS\n"); - return -ENODEV; - } - - return 0; -} - -/* - * This runs only on the current cpu. We try to find an LVT offset and - * setup the local APIC. For this we must disable preemption. On - * success we initialize all nodes with this offset. This updates then - * the offset in the IBS_CTL per-node msr. The per-core APIC setup of - * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that - * is using the new offset. - */ -static void force_ibs_eilvt_setup(void) -{ - int offset; - int ret; - - preempt_disable(); - /* find the next free available EILVT entry, skip offset 0 */ - for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) { - if (get_eilvt(offset)) - break; - } - preempt_enable(); - - if (offset == APIC_EILVT_NR_MAX) { - pr_debug("No EILVT entry available\n"); - return; - } - - ret = setup_ibs_ctl(offset); - if (ret) - goto out; - - if (!ibs_eilvt_valid()) - goto out; - - pr_info("IBS: LVT offset %d assigned\n", offset); - - return; -out: - preempt_disable(); - put_eilvt(offset); - preempt_enable(); - return; -} - -static void ibs_eilvt_setup(void) -{ - /* - * Force LVT offset assignment for family 10h: The offsets are - * not assigned by the BIOS for this family, so the OS is - * responsible for doing it. 
If the OS assignment fails, fall - * back to BIOS settings and try to setup this. - */ - if (boot_cpu_data.x86 == 0x10) - force_ibs_eilvt_setup(); -} - -static inline int get_ibs_lvt_offset(void) -{ - u64 val; - - rdmsrl(MSR_AMD64_IBSCTL, val); - if (!(val & IBSCTL_LVT_OFFSET_VALID)) - return -EINVAL; - - return val & IBSCTL_LVT_OFFSET_MASK; -} - -static void setup_APIC_ibs(void *dummy) -{ - int offset; - - offset = get_ibs_lvt_offset(); - if (offset < 0) - goto failed; - - if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0)) - return; -failed: - pr_warn("perf: IBS APIC setup failed on cpu #%d\n", - smp_processor_id()); -} - -static void clear_APIC_ibs(void *dummy) -{ - int offset; - - offset = get_ibs_lvt_offset(); - if (offset >= 0) - setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); -} - -#ifdef CONFIG_PM - -static int perf_ibs_suspend(void) -{ - clear_APIC_ibs(NULL); - return 0; -} - -static void perf_ibs_resume(void) -{ - ibs_eilvt_setup(); - setup_APIC_ibs(NULL); -} - -static struct syscore_ops perf_ibs_syscore_ops = { - .resume = perf_ibs_resume, - .suspend = perf_ibs_suspend, -}; - -static void perf_ibs_pm_init(void) -{ - register_syscore_ops(&perf_ibs_syscore_ops); -} - -#else - -static inline void perf_ibs_pm_init(void) { } - -#endif - -static int -perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) -{ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_STARTING: - setup_APIC_ibs(NULL); - break; - case CPU_DYING: - clear_APIC_ibs(NULL); - break; - default: - break; - } - - return NOTIFY_OK; -} - -static __init int amd_ibs_init(void) -{ - u32 caps; - int ret = -EINVAL; - - caps = __get_ibs_caps(); - if (!caps) - return -ENODEV; /* ibs not supported by the cpu */ - - ibs_eilvt_setup(); - - if (!ibs_eilvt_valid()) - goto out; - - perf_ibs_pm_init(); - cpu_notifier_register_begin(); - ibs_caps = caps; - /* make ibs_caps visible to other cpus: */ - smp_mb(); - smp_call_function(setup_APIC_ibs, NULL, 1); - __perf_cpu_notifier(perf_ibs_cpu_notifier); - cpu_notifier_register_done(); - - ret = perf_event_ibs_init(); -out: - if (ret) - pr_err("Failed to setup IBS, %d\n", ret); - return ret; -} - -/* Since we need the pci subsystem to init ibs we can't do this earlier: */ -device_initcall(amd_ibs_init); -- cgit v0.10.2 From 5b26547dd7faa84e1293baa144a0f3e74ed7d4c7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 8 Feb 2016 17:09:07 +0100 Subject: perf/x86: Move perf_event_amd_iommu.[ch] .. => x86/events/amd/iommu.[ch] Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Joerg Roedel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1454947748-28629-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 88f7873..838195d 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -2,3 +2,6 @@ obj-y += core.o obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o +ifdef CONFIG_AMD_IOMMU +obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o +endif diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c new file mode 100644 index 0000000..629bc70 --- /dev/null +++ b/arch/x86/events/amd/iommu.c @@ -0,0 +1,499 @@ +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. 
+ * + * Author: Steven Kinney + * Author: Suravee Suthikulpanit + * + * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#include "../../kernel/cpu/perf_event.h" +#include "iommu.h" + +#define COUNTER_SHIFT 16 + +#define _GET_BANK(ev) ((u8)(ev->hw.extra_reg.reg >> 8)) +#define _GET_CNTR(ev) ((u8)(ev->hw.extra_reg.reg)) + +/* iommu pmu config masks */ +#define _GET_CSOURCE(ev) ((ev->hw.config & 0xFFULL)) +#define _GET_DEVID(ev) ((ev->hw.config >> 8) & 0xFFFFULL) +#define _GET_PASID(ev) ((ev->hw.config >> 24) & 0xFFFFULL) +#define _GET_DOMID(ev) ((ev->hw.config >> 40) & 0xFFFFULL) +#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config) & 0xFFFFULL) +#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL) +#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL) + +static struct perf_amd_iommu __perf_iommu; + +struct perf_amd_iommu { + struct pmu pmu; + u8 max_banks; + u8 max_counters; + u64 cntr_assign_mask; + raw_spinlock_t lock; + const struct attribute_group *attr_groups[4]; +}; + +#define format_group attr_groups[0] +#define cpumask_group attr_groups[1] +#define events_group attr_groups[2] +#define null_group attr_groups[3] + +/*--------------------------------------------- + * sysfs format attributes + *---------------------------------------------*/ +PMU_FORMAT_ATTR(csource, "config:0-7"); +PMU_FORMAT_ATTR(devid, "config:8-23"); +PMU_FORMAT_ATTR(pasid, "config:24-39"); +PMU_FORMAT_ATTR(domid, "config:40-55"); +PMU_FORMAT_ATTR(devid_mask, "config1:0-15"); +PMU_FORMAT_ATTR(pasid_mask, "config1:16-31"); +PMU_FORMAT_ATTR(domid_mask, "config1:32-47"); + +static struct attribute *iommu_format_attrs[] = { + &format_attr_csource.attr, + &format_attr_devid.attr, + &format_attr_pasid.attr, + &format_attr_domid.attr, + &format_attr_devid_mask.attr, + &format_attr_pasid_mask.attr, + &format_attr_domid_mask.attr, + NULL, +}; + +static struct attribute_group amd_iommu_format_group = { + .name = "format", + .attrs = iommu_format_attrs, +}; + +/*--------------------------------------------- + * sysfs events attributes + *---------------------------------------------*/ +struct amd_iommu_event_desc { + struct kobj_attribute attr; + const char *event; +}; + +static ssize_t _iommu_event_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct amd_iommu_event_desc *event = + container_of(attr, struct amd_iommu_event_desc, attr); + return sprintf(buf, "%s\n", event->event); +} + +#define AMD_IOMMU_EVENT_DESC(_name, _event) \ +{ \ + .attr = __ATTR(_name, 0444, _iommu_event_show, NULL), \ + .event = _event, \ +} + +static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = { + AMD_IOMMU_EVENT_DESC(mem_pass_untrans, "csource=0x01"), + AMD_IOMMU_EVENT_DESC(mem_pass_pretrans, "csource=0x02"), + AMD_IOMMU_EVENT_DESC(mem_pass_excl, "csource=0x03"), + AMD_IOMMU_EVENT_DESC(mem_target_abort, "csource=0x04"), + AMD_IOMMU_EVENT_DESC(mem_trans_total, "csource=0x05"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit, "csource=0x06"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis, "csource=0x07"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit, "csource=0x08"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis, "csource=0x09"), + AMD_IOMMU_EVENT_DESC(mem_dte_hit, "csource=0x0a"), + 
AMD_IOMMU_EVENT_DESC(mem_dte_mis, "csource=0x0b"), + AMD_IOMMU_EVENT_DESC(page_tbl_read_tot, "csource=0x0c"), + AMD_IOMMU_EVENT_DESC(page_tbl_read_nst, "csource=0x0d"), + AMD_IOMMU_EVENT_DESC(page_tbl_read_gst, "csource=0x0e"), + AMD_IOMMU_EVENT_DESC(int_dte_hit, "csource=0x0f"), + AMD_IOMMU_EVENT_DESC(int_dte_mis, "csource=0x10"), + AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"), + AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"), + AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"), + { /* end: all zeroes */ }, +}; + +/*--------------------------------------------- + * sysfs cpumask attributes + *---------------------------------------------*/ +static cpumask_t iommu_cpumask; + +static ssize_t _iommu_cpumask_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask); +} +static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL); + +static struct attribute *iommu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group amd_iommu_cpumask_group = { + .attrs = iommu_cpumask_attrs, +}; + +/*---------------------------------------------*/ + +static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu) +{ + unsigned long flags; + int shift, bank, cntr, retval; + int max_banks = perf_iommu->max_banks; + int max_cntrs = perf_iommu->max_counters; + + raw_spin_lock_irqsave(&perf_iommu->lock, flags); + + for (bank = 0, shift = 0; bank < max_banks; bank++) { + for (cntr = 0; cntr < max_cntrs; cntr++) { + shift = bank + (bank*3) + cntr; + if (perf_iommu->cntr_assign_mask & (1ULL<cntr_assign_mask |= (1ULL<lock, flags); + return retval; +} + +static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu, + u8 bank, u8 cntr) +{ + unsigned long flags; + int max_banks, max_cntrs; + int shift = 0; + + max_banks = perf_iommu->max_banks; + max_cntrs = perf_iommu->max_counters; + + if ((bank > max_banks) || (cntr > max_cntrs)) + return -EINVAL; + + shift = bank + cntr + (bank*3); + + raw_spin_lock_irqsave(&perf_iommu->lock, flags); + perf_iommu->cntr_assign_mask &= ~(1ULL<lock, flags); + + return 0; +} + +static int perf_iommu_event_init(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_amd_iommu *perf_iommu; + u64 config, config1; + + /* test the event attr type check for PMU enumeration */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* + * IOMMU counters are shared across all cores. + * Therefore, it does not support per-process mode. + * Also, it does not support event sampling mode. 
+ */ + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) + return -EINVAL; + + /* IOMMU counters do not have usr/os/guest/host bits */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_host || event->attr.exclude_guest) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + perf_iommu = &__perf_iommu; + + if (event->pmu != &perf_iommu->pmu) + return -ENOENT; + + if (perf_iommu) { + config = event->attr.config; + config1 = event->attr.config1; + } else { + return -EINVAL; + } + + /* integrate with iommu base devid (0000), assume one iommu */ + perf_iommu->max_banks = + amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID); + perf_iommu->max_counters = + amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID); + if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0)) + return -EINVAL; + + /* update the hw_perf_event struct with the iommu config data */ + hwc->config = config; + hwc->extra_reg.config = config1; + + return 0; +} + +static void perf_iommu_enable_event(struct perf_event *ev) +{ + u8 csource = _GET_CSOURCE(ev); + u16 devid = _GET_DEVID(ev); + u64 reg = 0ULL; + + reg = csource; + amd_iommu_pc_get_set_reg_val(devid, + _GET_BANK(ev), _GET_CNTR(ev) , + IOMMU_PC_COUNTER_SRC_REG, ®, true); + + reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32); + if (reg) + reg |= (1UL << 31); + amd_iommu_pc_get_set_reg_val(devid, + _GET_BANK(ev), _GET_CNTR(ev) , + IOMMU_PC_DEVID_MATCH_REG, ®, true); + + reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32); + if (reg) + reg |= (1UL << 31); + amd_iommu_pc_get_set_reg_val(devid, + _GET_BANK(ev), _GET_CNTR(ev) , + IOMMU_PC_PASID_MATCH_REG, ®, true); + + reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32); + if (reg) + reg |= (1UL << 31); + amd_iommu_pc_get_set_reg_val(devid, + _GET_BANK(ev), _GET_CNTR(ev) , + IOMMU_PC_DOMID_MATCH_REG, ®, true); +} + +static void perf_iommu_disable_event(struct perf_event *event) +{ + u64 reg = 0ULL; + + amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), + _GET_BANK(event), _GET_CNTR(event), + IOMMU_PC_COUNTER_SRC_REG, ®, true); +} + +static void perf_iommu_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + pr_debug("perf: amd_iommu:perf_iommu_start\n"); + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + hwc->state = 0; + + if (flags & PERF_EF_RELOAD) { + u64 prev_raw_count = local64_read(&hwc->prev_count); + amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), + _GET_BANK(event), _GET_CNTR(event), + IOMMU_PC_COUNTER_REG, &prev_raw_count, true); + } + + perf_iommu_enable_event(event); + perf_event_update_userpage(event); + +} + +static void perf_iommu_read(struct perf_event *event) +{ + u64 count = 0ULL; + u64 prev_raw_count = 0ULL; + u64 delta = 0ULL; + struct hw_perf_event *hwc = &event->hw; + pr_debug("perf: amd_iommu:perf_iommu_read\n"); + + amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), + _GET_BANK(event), _GET_CNTR(event), + IOMMU_PC_COUNTER_REG, &count, false); + + /* IOMMU pc counter register is only 48 bits */ + count &= 0xFFFFFFFFFFFFULL; + + prev_raw_count = local64_read(&hwc->prev_count); + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + count) != prev_raw_count) + return; + + /* Handling 48-bit counter overflowing */ + delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT); + delta >>= COUNTER_SHIFT; + local64_add(delta, &event->count); + +} + +static void perf_iommu_stop(struct perf_event *event, int flags) +{ + 
struct hw_perf_event *hwc = &event->hw; + u64 config; + + pr_debug("perf: amd_iommu:perf_iommu_stop\n"); + + if (hwc->state & PERF_HES_UPTODATE) + return; + + perf_iommu_disable_event(event); + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + if (hwc->state & PERF_HES_UPTODATE) + return; + + config = hwc->config; + perf_iommu_read(event); + hwc->state |= PERF_HES_UPTODATE; +} + +static int perf_iommu_add(struct perf_event *event, int flags) +{ + int retval; + struct perf_amd_iommu *perf_iommu = + container_of(event->pmu, struct perf_amd_iommu, pmu); + + pr_debug("perf: amd_iommu:perf_iommu_add\n"); + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + /* request an iommu bank/counter */ + retval = get_next_avail_iommu_bnk_cntr(perf_iommu); + if (retval != -ENOSPC) + event->hw.extra_reg.reg = (u16)retval; + else + return retval; + + if (flags & PERF_EF_START) + perf_iommu_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void perf_iommu_del(struct perf_event *event, int flags) +{ + struct perf_amd_iommu *perf_iommu = + container_of(event->pmu, struct perf_amd_iommu, pmu); + + pr_debug("perf: amd_iommu:perf_iommu_del\n"); + perf_iommu_stop(event, PERF_EF_UPDATE); + + /* clear the assigned iommu bank/counter */ + clear_avail_iommu_bnk_cntr(perf_iommu, + _GET_BANK(event), + _GET_CNTR(event)); + + perf_event_update_userpage(event); +} + +static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu) +{ + struct attribute **attrs; + struct attribute_group *attr_group; + int i = 0, j; + + while (amd_iommu_v2_event_descs[i].attr.attr.name) + i++; + + attr_group = kzalloc(sizeof(struct attribute *) + * (i + 1) + sizeof(*attr_group), GFP_KERNEL); + if (!attr_group) + return -ENOMEM; + + attrs = (struct attribute **)(attr_group + 1); + for (j = 0; j < i; j++) + attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr; + + attr_group->name = "events"; + attr_group->attrs = attrs; + perf_iommu->events_group = attr_group; + + return 0; +} + +static __init void amd_iommu_pc_exit(void) +{ + if (__perf_iommu.events_group != NULL) { + kfree(__perf_iommu.events_group); + __perf_iommu.events_group = NULL; + } +} + +static __init int _init_perf_amd_iommu( + struct perf_amd_iommu *perf_iommu, char *name) +{ + int ret; + + raw_spin_lock_init(&perf_iommu->lock); + + /* Init format attributes */ + perf_iommu->format_group = &amd_iommu_format_group; + + /* Init cpumask attributes to only core 0 */ + cpumask_set_cpu(0, &iommu_cpumask); + perf_iommu->cpumask_group = &amd_iommu_cpumask_group; + + /* Init events attributes */ + if (_init_events_attrs(perf_iommu) != 0) + pr_err("perf: amd_iommu: Only support raw events.\n"); + + /* Init null attributes */ + perf_iommu->null_group = NULL; + perf_iommu->pmu.attr_groups = perf_iommu->attr_groups; + + ret = perf_pmu_register(&perf_iommu->pmu, name, -1); + if (ret) { + pr_err("perf: amd_iommu: Failed to initialized.\n"); + amd_iommu_pc_exit(); + } else { + pr_info("perf: amd_iommu: Detected. 
(%d banks, %d counters/bank)\n", + amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID), + amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID)); + } + + return ret; +} + +static struct perf_amd_iommu __perf_iommu = { + .pmu = { + .event_init = perf_iommu_event_init, + .add = perf_iommu_add, + .del = perf_iommu_del, + .start = perf_iommu_start, + .stop = perf_iommu_stop, + .read = perf_iommu_read, + }, + .max_banks = 0x00, + .max_counters = 0x00, + .cntr_assign_mask = 0ULL, + .format_group = NULL, + .cpumask_group = NULL, + .events_group = NULL, + .null_group = NULL, +}; + +static __init int amd_iommu_pc_init(void) +{ + /* Make sure the IOMMU PC resource is available */ + if (!amd_iommu_pc_supported()) + return -ENODEV; + + _init_perf_amd_iommu(&__perf_iommu, "amd_iommu"); + + return 0; +} + +device_initcall(amd_iommu_pc_init); diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h new file mode 100644 index 0000000..845d173 --- /dev/null +++ b/arch/x86/events/amd/iommu.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Steven Kinney + * Author: Suravee Suthikulpanit + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _PERF_EVENT_AMD_IOMMU_H_ +#define _PERF_EVENT_AMD_IOMMU_H_ + +/* iommu pc mmio region register indexes */ +#define IOMMU_PC_COUNTER_REG 0x00 +#define IOMMU_PC_COUNTER_SRC_REG 0x08 +#define IOMMU_PC_PASID_MATCH_REG 0x10 +#define IOMMU_PC_DOMID_MATCH_REG 0x18 +#define IOMMU_PC_DEVID_MATCH_REG 0x20 +#define IOMMU_PC_COUNTER_REPORT_REG 0x28 + +/* maximun specified bank/counters */ +#define PC_MAX_SPEC_BNKS 64 +#define PC_MAX_SPEC_CNTRS 16 + +/* iommu pc reg masks*/ +#define IOMMU_BASE_DEVID 0x0000 + +/* amd_iommu_init.c external support functions */ +extern bool amd_iommu_pc_supported(void); + +extern u8 amd_iommu_pc_get_max_banks(u16 devid); + +extern u8 amd_iommu_pc_get_max_counters(u16 devid); + +extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, + u8 fxn, u64 *value, bool is_write); + +#endif /*_PERF_EVENT_AMD_IOMMU_H_*/ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index dddba22..2e15d9d 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -32,9 +32,6 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_uncore.o -ifdef CONFIG_AMD_IOMMU -obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o -endif obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c deleted file mode 100644 index 97242a9..0000000 --- a/arch/x86/kernel/cpu/perf_event_amd_iommu.c +++ /dev/null @@ -1,499 +0,0 @@ -/* - * Copyright (C) 2013 Advanced Micro Devices, Inc. - * - * Author: Steven Kinney - * Author: Suravee Suthikulpanit - * - * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include -#include -#include -#include - -#include "perf_event.h" -#include "perf_event_amd_iommu.h" - -#define COUNTER_SHIFT 16 - -#define _GET_BANK(ev) ((u8)(ev->hw.extra_reg.reg >> 8)) -#define _GET_CNTR(ev) ((u8)(ev->hw.extra_reg.reg)) - -/* iommu pmu config masks */ -#define _GET_CSOURCE(ev) ((ev->hw.config & 0xFFULL)) -#define _GET_DEVID(ev) ((ev->hw.config >> 8) & 0xFFFFULL) -#define _GET_PASID(ev) ((ev->hw.config >> 24) & 0xFFFFULL) -#define _GET_DOMID(ev) ((ev->hw.config >> 40) & 0xFFFFULL) -#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config) & 0xFFFFULL) -#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL) -#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL) - -static struct perf_amd_iommu __perf_iommu; - -struct perf_amd_iommu { - struct pmu pmu; - u8 max_banks; - u8 max_counters; - u64 cntr_assign_mask; - raw_spinlock_t lock; - const struct attribute_group *attr_groups[4]; -}; - -#define format_group attr_groups[0] -#define cpumask_group attr_groups[1] -#define events_group attr_groups[2] -#define null_group attr_groups[3] - -/*--------------------------------------------- - * sysfs format attributes - *---------------------------------------------*/ -PMU_FORMAT_ATTR(csource, "config:0-7"); -PMU_FORMAT_ATTR(devid, "config:8-23"); -PMU_FORMAT_ATTR(pasid, "config:24-39"); -PMU_FORMAT_ATTR(domid, "config:40-55"); -PMU_FORMAT_ATTR(devid_mask, "config1:0-15"); -PMU_FORMAT_ATTR(pasid_mask, "config1:16-31"); -PMU_FORMAT_ATTR(domid_mask, "config1:32-47"); - -static struct attribute *iommu_format_attrs[] = { - &format_attr_csource.attr, - &format_attr_devid.attr, - &format_attr_pasid.attr, - &format_attr_domid.attr, - &format_attr_devid_mask.attr, - &format_attr_pasid_mask.attr, - &format_attr_domid_mask.attr, - NULL, -}; - -static struct attribute_group amd_iommu_format_group = { - .name = "format", - .attrs = iommu_format_attrs, -}; - -/*--------------------------------------------- - * sysfs events attributes - *---------------------------------------------*/ -struct amd_iommu_event_desc { - struct kobj_attribute attr; - const char *event; -}; - -static ssize_t _iommu_event_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct amd_iommu_event_desc *event = - container_of(attr, struct amd_iommu_event_desc, attr); - return sprintf(buf, "%s\n", event->event); -} - -#define AMD_IOMMU_EVENT_DESC(_name, _event) \ -{ \ - .attr = __ATTR(_name, 0444, _iommu_event_show, NULL), \ - .event = _event, \ -} - -static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = { - AMD_IOMMU_EVENT_DESC(mem_pass_untrans, "csource=0x01"), - AMD_IOMMU_EVENT_DESC(mem_pass_pretrans, "csource=0x02"), - AMD_IOMMU_EVENT_DESC(mem_pass_excl, "csource=0x03"), - AMD_IOMMU_EVENT_DESC(mem_target_abort, "csource=0x04"), - AMD_IOMMU_EVENT_DESC(mem_trans_total, "csource=0x05"), - AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit, "csource=0x06"), - AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis, "csource=0x07"), - AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit, "csource=0x08"), - AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis, "csource=0x09"), - AMD_IOMMU_EVENT_DESC(mem_dte_hit, "csource=0x0a"), - AMD_IOMMU_EVENT_DESC(mem_dte_mis, "csource=0x0b"), - AMD_IOMMU_EVENT_DESC(page_tbl_read_tot, "csource=0x0c"), - AMD_IOMMU_EVENT_DESC(page_tbl_read_nst, "csource=0x0d"), - AMD_IOMMU_EVENT_DESC(page_tbl_read_gst, "csource=0x0e"), - AMD_IOMMU_EVENT_DESC(int_dte_hit, "csource=0x0f"), - AMD_IOMMU_EVENT_DESC(int_dte_mis, "csource=0x10"), - 
AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"), - AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"), - AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"), - { /* end: all zeroes */ }, -}; - -/*--------------------------------------------- - * sysfs cpumask attributes - *---------------------------------------------*/ -static cpumask_t iommu_cpumask; - -static ssize_t _iommu_cpumask_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - return cpumap_print_to_pagebuf(true, buf, &iommu_cpumask); -} -static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL); - -static struct attribute *iommu_cpumask_attrs[] = { - &dev_attr_cpumask.attr, - NULL, -}; - -static struct attribute_group amd_iommu_cpumask_group = { - .attrs = iommu_cpumask_attrs, -}; - -/*---------------------------------------------*/ - -static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu) -{ - unsigned long flags; - int shift, bank, cntr, retval; - int max_banks = perf_iommu->max_banks; - int max_cntrs = perf_iommu->max_counters; - - raw_spin_lock_irqsave(&perf_iommu->lock, flags); - - for (bank = 0, shift = 0; bank < max_banks; bank++) { - for (cntr = 0; cntr < max_cntrs; cntr++) { - shift = bank + (bank*3) + cntr; - if (perf_iommu->cntr_assign_mask & (1ULL<cntr_assign_mask |= (1ULL<lock, flags); - return retval; -} - -static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu, - u8 bank, u8 cntr) -{ - unsigned long flags; - int max_banks, max_cntrs; - int shift = 0; - - max_banks = perf_iommu->max_banks; - max_cntrs = perf_iommu->max_counters; - - if ((bank > max_banks) || (cntr > max_cntrs)) - return -EINVAL; - - shift = bank + cntr + (bank*3); - - raw_spin_lock_irqsave(&perf_iommu->lock, flags); - perf_iommu->cntr_assign_mask &= ~(1ULL<lock, flags); - - return 0; -} - -static int perf_iommu_event_init(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct perf_amd_iommu *perf_iommu; - u64 config, config1; - - /* test the event attr type check for PMU enumeration */ - if (event->attr.type != event->pmu->type) - return -ENOENT; - - /* - * IOMMU counters are shared across all cores. - * Therefore, it does not support per-process mode. - * Also, it does not support event sampling mode. 
- */ - if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) - return -EINVAL; - - /* IOMMU counters do not have usr/os/guest/host bits */ - if (event->attr.exclude_user || event->attr.exclude_kernel || - event->attr.exclude_host || event->attr.exclude_guest) - return -EINVAL; - - if (event->cpu < 0) - return -EINVAL; - - perf_iommu = &__perf_iommu; - - if (event->pmu != &perf_iommu->pmu) - return -ENOENT; - - if (perf_iommu) { - config = event->attr.config; - config1 = event->attr.config1; - } else { - return -EINVAL; - } - - /* integrate with iommu base devid (0000), assume one iommu */ - perf_iommu->max_banks = - amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID); - perf_iommu->max_counters = - amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID); - if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0)) - return -EINVAL; - - /* update the hw_perf_event struct with the iommu config data */ - hwc->config = config; - hwc->extra_reg.config = config1; - - return 0; -} - -static void perf_iommu_enable_event(struct perf_event *ev) -{ - u8 csource = _GET_CSOURCE(ev); - u16 devid = _GET_DEVID(ev); - u64 reg = 0ULL; - - reg = csource; - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_COUNTER_SRC_REG, ®, true); - - reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32); - if (reg) - reg |= (1UL << 31); - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_DEVID_MATCH_REG, ®, true); - - reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32); - if (reg) - reg |= (1UL << 31); - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_PASID_MATCH_REG, ®, true); - - reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32); - if (reg) - reg |= (1UL << 31); - amd_iommu_pc_get_set_reg_val(devid, - _GET_BANK(ev), _GET_CNTR(ev) , - IOMMU_PC_DOMID_MATCH_REG, ®, true); -} - -static void perf_iommu_disable_event(struct perf_event *event) -{ - u64 reg = 0ULL; - - amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), - _GET_BANK(event), _GET_CNTR(event), - IOMMU_PC_COUNTER_SRC_REG, ®, true); -} - -static void perf_iommu_start(struct perf_event *event, int flags) -{ - struct hw_perf_event *hwc = &event->hw; - - pr_debug("perf: amd_iommu:perf_iommu_start\n"); - if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) - return; - - WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); - hwc->state = 0; - - if (flags & PERF_EF_RELOAD) { - u64 prev_raw_count = local64_read(&hwc->prev_count); - amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), - _GET_BANK(event), _GET_CNTR(event), - IOMMU_PC_COUNTER_REG, &prev_raw_count, true); - } - - perf_iommu_enable_event(event); - perf_event_update_userpage(event); - -} - -static void perf_iommu_read(struct perf_event *event) -{ - u64 count = 0ULL; - u64 prev_raw_count = 0ULL; - u64 delta = 0ULL; - struct hw_perf_event *hwc = &event->hw; - pr_debug("perf: amd_iommu:perf_iommu_read\n"); - - amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), - _GET_BANK(event), _GET_CNTR(event), - IOMMU_PC_COUNTER_REG, &count, false); - - /* IOMMU pc counter register is only 48 bits */ - count &= 0xFFFFFFFFFFFFULL; - - prev_raw_count = local64_read(&hwc->prev_count); - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - count) != prev_raw_count) - return; - - /* Handling 48-bit counter overflowing */ - delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT); - delta >>= COUNTER_SHIFT; - local64_add(delta, &event->count); - -} - -static void perf_iommu_stop(struct perf_event *event, int flags) -{ - 
struct hw_perf_event *hwc = &event->hw; - u64 config; - - pr_debug("perf: amd_iommu:perf_iommu_stop\n"); - - if (hwc->state & PERF_HES_UPTODATE) - return; - - perf_iommu_disable_event(event); - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - - if (hwc->state & PERF_HES_UPTODATE) - return; - - config = hwc->config; - perf_iommu_read(event); - hwc->state |= PERF_HES_UPTODATE; -} - -static int perf_iommu_add(struct perf_event *event, int flags) -{ - int retval; - struct perf_amd_iommu *perf_iommu = - container_of(event->pmu, struct perf_amd_iommu, pmu); - - pr_debug("perf: amd_iommu:perf_iommu_add\n"); - event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - - /* request an iommu bank/counter */ - retval = get_next_avail_iommu_bnk_cntr(perf_iommu); - if (retval != -ENOSPC) - event->hw.extra_reg.reg = (u16)retval; - else - return retval; - - if (flags & PERF_EF_START) - perf_iommu_start(event, PERF_EF_RELOAD); - - return 0; -} - -static void perf_iommu_del(struct perf_event *event, int flags) -{ - struct perf_amd_iommu *perf_iommu = - container_of(event->pmu, struct perf_amd_iommu, pmu); - - pr_debug("perf: amd_iommu:perf_iommu_del\n"); - perf_iommu_stop(event, PERF_EF_UPDATE); - - /* clear the assigned iommu bank/counter */ - clear_avail_iommu_bnk_cntr(perf_iommu, - _GET_BANK(event), - _GET_CNTR(event)); - - perf_event_update_userpage(event); -} - -static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu) -{ - struct attribute **attrs; - struct attribute_group *attr_group; - int i = 0, j; - - while (amd_iommu_v2_event_descs[i].attr.attr.name) - i++; - - attr_group = kzalloc(sizeof(struct attribute *) - * (i + 1) + sizeof(*attr_group), GFP_KERNEL); - if (!attr_group) - return -ENOMEM; - - attrs = (struct attribute **)(attr_group + 1); - for (j = 0; j < i; j++) - attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr; - - attr_group->name = "events"; - attr_group->attrs = attrs; - perf_iommu->events_group = attr_group; - - return 0; -} - -static __init void amd_iommu_pc_exit(void) -{ - if (__perf_iommu.events_group != NULL) { - kfree(__perf_iommu.events_group); - __perf_iommu.events_group = NULL; - } -} - -static __init int _init_perf_amd_iommu( - struct perf_amd_iommu *perf_iommu, char *name) -{ - int ret; - - raw_spin_lock_init(&perf_iommu->lock); - - /* Init format attributes */ - perf_iommu->format_group = &amd_iommu_format_group; - - /* Init cpumask attributes to only core 0 */ - cpumask_set_cpu(0, &iommu_cpumask); - perf_iommu->cpumask_group = &amd_iommu_cpumask_group; - - /* Init events attributes */ - if (_init_events_attrs(perf_iommu) != 0) - pr_err("perf: amd_iommu: Only support raw events.\n"); - - /* Init null attributes */ - perf_iommu->null_group = NULL; - perf_iommu->pmu.attr_groups = perf_iommu->attr_groups; - - ret = perf_pmu_register(&perf_iommu->pmu, name, -1); - if (ret) { - pr_err("perf: amd_iommu: Failed to initialized.\n"); - amd_iommu_pc_exit(); - } else { - pr_info("perf: amd_iommu: Detected. 
(%d banks, %d counters/bank)\n", - amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID), - amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID)); - } - - return ret; -} - -static struct perf_amd_iommu __perf_iommu = { - .pmu = { - .event_init = perf_iommu_event_init, - .add = perf_iommu_add, - .del = perf_iommu_del, - .start = perf_iommu_start, - .stop = perf_iommu_stop, - .read = perf_iommu_read, - }, - .max_banks = 0x00, - .max_counters = 0x00, - .cntr_assign_mask = 0ULL, - .format_group = NULL, - .cpumask_group = NULL, - .events_group = NULL, - .null_group = NULL, -}; - -static __init int amd_iommu_pc_init(void) -{ - /* Make sure the IOMMU PC resource is available */ - if (!amd_iommu_pc_supported()) - return -ENODEV; - - _init_perf_amd_iommu(&__perf_iommu, "amd_iommu"); - - return 0; -} - -device_initcall(amd_iommu_pc_init); diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.h b/arch/x86/kernel/cpu/perf_event_amd_iommu.h deleted file mode 100644 index 845d173..0000000 --- a/arch/x86/kernel/cpu/perf_event_amd_iommu.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (C) 2013 Advanced Micro Devices, Inc. - * - * Author: Steven Kinney - * Author: Suravee Suthikulpanit - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef _PERF_EVENT_AMD_IOMMU_H_ -#define _PERF_EVENT_AMD_IOMMU_H_ - -/* iommu pc mmio region register indexes */ -#define IOMMU_PC_COUNTER_REG 0x00 -#define IOMMU_PC_COUNTER_SRC_REG 0x08 -#define IOMMU_PC_PASID_MATCH_REG 0x10 -#define IOMMU_PC_DOMID_MATCH_REG 0x18 -#define IOMMU_PC_DEVID_MATCH_REG 0x20 -#define IOMMU_PC_COUNTER_REPORT_REG 0x28 - -/* maximun specified bank/counters */ -#define PC_MAX_SPEC_BNKS 64 -#define PC_MAX_SPEC_CNTRS 16 - -/* iommu pc reg masks*/ -#define IOMMU_BASE_DEVID 0x0000 - -/* amd_iommu_init.c external support functions */ -extern bool amd_iommu_pc_supported(void); - -extern u8 amd_iommu_pc_get_max_banks(u16 devid); - -extern u8 amd_iommu_pc_get_max_counters(u16 devid); - -extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, - u8 fxn, u64 *value, bool is_write); - -#endif /*_PERF_EVENT_AMD_IOMMU_H_*/ -- cgit v0.10.2 From d0af1c0525d561fe3ab6d7a767cdd52704da25cd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 8 Feb 2016 17:09:08 +0100 Subject: perf/x86: Move perf_event_amd_uncore.c .... => x86/events/amd/uncore.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1454947748-28629-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 838195d..7d1ecff 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,6 +1,6 @@ obj-y += core.o -obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o +obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o amd/uncore.o obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c new file mode 100644 index 0000000..19a1736 --- /dev/null +++ b/arch/x86/events/amd/uncore.c @@ -0,0 +1,601 @@ +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. 
+ * + * Author: Jacob Shin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define NUM_COUNTERS_NB 4 +#define NUM_COUNTERS_L2 4 +#define MAX_COUNTERS NUM_COUNTERS_NB + +#define RDPMC_BASE_NB 6 +#define RDPMC_BASE_L2 10 + +#define COUNTER_SHIFT 16 + +struct amd_uncore { + int id; + int refcnt; + int cpu; + int num_counters; + int rdpmc_base; + u32 msr_base; + cpumask_t *active_mask; + struct pmu *pmu; + struct perf_event *events[MAX_COUNTERS]; + struct amd_uncore *free_when_cpu_online; +}; + +static struct amd_uncore * __percpu *amd_uncore_nb; +static struct amd_uncore * __percpu *amd_uncore_l2; + +static struct pmu amd_nb_pmu; +static struct pmu amd_l2_pmu; + +static cpumask_t amd_nb_active_mask; +static cpumask_t amd_l2_active_mask; + +static bool is_nb_event(struct perf_event *event) +{ + return event->pmu->type == amd_nb_pmu.type; +} + +static bool is_l2_event(struct perf_event *event) +{ + return event->pmu->type == amd_l2_pmu.type; +} + +static struct amd_uncore *event_to_amd_uncore(struct perf_event *event) +{ + if (is_nb_event(event) && amd_uncore_nb) + return *per_cpu_ptr(amd_uncore_nb, event->cpu); + else if (is_l2_event(event) && amd_uncore_l2) + return *per_cpu_ptr(amd_uncore_l2, event->cpu); + + return NULL; +} + +static void amd_uncore_read(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev, new; + s64 delta; + + /* + * since we do not enable counter overflow interrupts, + * we do not have to worry about prev_count changing on us + */ + + prev = local64_read(&hwc->prev_count); + rdpmcl(hwc->event_base_rdpmc, new); + local64_set(&hwc->prev_count, new); + delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT); + delta >>= COUNTER_SHIFT; + local64_add(delta, &event->count); +} + +static void amd_uncore_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + if (flags & PERF_EF_RELOAD) + wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count)); + + hwc->state = 0; + wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE)); + perf_event_update_userpage(event); +} + +static void amd_uncore_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); + hwc->state |= PERF_HES_STOPPED; + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + amd_uncore_read(event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int amd_uncore_add(struct perf_event *event, int flags) +{ + int i; + struct amd_uncore *uncore = event_to_amd_uncore(event); + struct hw_perf_event *hwc = &event->hw; + + /* are we already assigned? 
*/ + if (hwc->idx != -1 && uncore->events[hwc->idx] == event) + goto out; + + for (i = 0; i < uncore->num_counters; i++) { + if (uncore->events[i] == event) { + hwc->idx = i; + goto out; + } + } + + /* if not, take the first available counter */ + hwc->idx = -1; + for (i = 0; i < uncore->num_counters; i++) { + if (cmpxchg(&uncore->events[i], NULL, event) == NULL) { + hwc->idx = i; + break; + } + } + +out: + if (hwc->idx == -1) + return -EBUSY; + + hwc->config_base = uncore->msr_base + (2 * hwc->idx); + hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx); + hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx; + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (flags & PERF_EF_START) + amd_uncore_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void amd_uncore_del(struct perf_event *event, int flags) +{ + int i; + struct amd_uncore *uncore = event_to_amd_uncore(event); + struct hw_perf_event *hwc = &event->hw; + + amd_uncore_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < uncore->num_counters; i++) { + if (cmpxchg(&uncore->events[i], event, NULL) == event) + break; + } + + hwc->idx = -1; +} + +static int amd_uncore_event_init(struct perf_event *event) +{ + struct amd_uncore *uncore; + struct hw_perf_event *hwc = &event->hw; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* + * NB and L2 counters (MSRs) are shared across all cores that share the + * same NB / L2 cache. Interrupts can be directed to a single target + * core, however, event counts generated by processes running on other + * cores cannot be masked out. So we do not support sampling and + * per-thread events. + */ + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) + return -EINVAL; + + /* NB and L2 counters do not have usr/os/guest/host bits */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_host || event->attr.exclude_guest) + return -EINVAL; + + /* and we do not enable counter overflow interrupts */ + hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB; + hwc->idx = -1; + + if (event->cpu < 0) + return -EINVAL; + + uncore = event_to_amd_uncore(event); + if (!uncore) + return -ENODEV; + + /* + * since request can come in to any of the shared cores, we will remap + * to a single common cpu. 
+ */ + event->cpu = uncore->cpu; + + return 0; +} + +static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + cpumask_t *active_mask; + struct pmu *pmu = dev_get_drvdata(dev); + + if (pmu->type == amd_nb_pmu.type) + active_mask = &amd_nb_active_mask; + else if (pmu->type == amd_l2_pmu.type) + active_mask = &amd_l2_active_mask; + else + return 0; + + return cpumap_print_to_pagebuf(true, buf, active_mask); +} +static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL); + +static struct attribute *amd_uncore_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group amd_uncore_attr_group = { + .attrs = amd_uncore_attrs, +}; + +PMU_FORMAT_ATTR(event, "config:0-7,32-35"); +PMU_FORMAT_ATTR(umask, "config:8-15"); + +static struct attribute *amd_uncore_format_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + NULL, +}; + +static struct attribute_group amd_uncore_format_group = { + .name = "format", + .attrs = amd_uncore_format_attr, +}; + +static const struct attribute_group *amd_uncore_attr_groups[] = { + &amd_uncore_attr_group, + &amd_uncore_format_group, + NULL, +}; + +static struct pmu amd_nb_pmu = { + .attr_groups = amd_uncore_attr_groups, + .name = "amd_nb", + .event_init = amd_uncore_event_init, + .add = amd_uncore_add, + .del = amd_uncore_del, + .start = amd_uncore_start, + .stop = amd_uncore_stop, + .read = amd_uncore_read, +}; + +static struct pmu amd_l2_pmu = { + .attr_groups = amd_uncore_attr_groups, + .name = "amd_l2", + .event_init = amd_uncore_event_init, + .add = amd_uncore_add, + .del = amd_uncore_del, + .start = amd_uncore_start, + .stop = amd_uncore_stop, + .read = amd_uncore_read, +}; + +static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) +{ + return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL, + cpu_to_node(cpu)); +} + +static int amd_uncore_cpu_up_prepare(unsigned int cpu) +{ + struct amd_uncore *uncore_nb = NULL, *uncore_l2; + + if (amd_uncore_nb) { + uncore_nb = amd_uncore_alloc(cpu); + if (!uncore_nb) + goto fail; + uncore_nb->cpu = cpu; + uncore_nb->num_counters = NUM_COUNTERS_NB; + uncore_nb->rdpmc_base = RDPMC_BASE_NB; + uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; + uncore_nb->active_mask = &amd_nb_active_mask; + uncore_nb->pmu = &amd_nb_pmu; + *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; + } + + if (amd_uncore_l2) { + uncore_l2 = amd_uncore_alloc(cpu); + if (!uncore_l2) + goto fail; + uncore_l2->cpu = cpu; + uncore_l2->num_counters = NUM_COUNTERS_L2; + uncore_l2->rdpmc_base = RDPMC_BASE_L2; + uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL; + uncore_l2->active_mask = &amd_l2_active_mask; + uncore_l2->pmu = &amd_l2_pmu; + *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2; + } + + return 0; + +fail: + kfree(uncore_nb); + return -ENOMEM; +} + +static struct amd_uncore * +amd_uncore_find_online_sibling(struct amd_uncore *this, + struct amd_uncore * __percpu *uncores) +{ + unsigned int cpu; + struct amd_uncore *that; + + for_each_online_cpu(cpu) { + that = *per_cpu_ptr(uncores, cpu); + + if (!that) + continue; + + if (this == that) + continue; + + if (this->id == that->id) { + that->free_when_cpu_online = this; + this = that; + break; + } + } + + this->refcnt++; + return this; +} + +static void amd_uncore_cpu_starting(unsigned int cpu) +{ + unsigned int eax, ebx, ecx, edx; + struct amd_uncore *uncore; + + if (amd_uncore_nb) { + uncore = *per_cpu_ptr(amd_uncore_nb, cpu); + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + uncore->id = ecx & 0xff; + + uncore = 
amd_uncore_find_online_sibling(uncore, amd_uncore_nb); + *per_cpu_ptr(amd_uncore_nb, cpu) = uncore; + } + + if (amd_uncore_l2) { + unsigned int apicid = cpu_data(cpu).apicid; + unsigned int nshared; + + uncore = *per_cpu_ptr(amd_uncore_l2, cpu); + cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx); + nshared = ((eax >> 14) & 0xfff) + 1; + uncore->id = apicid - (apicid % nshared); + + uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2); + *per_cpu_ptr(amd_uncore_l2, cpu) = uncore; + } +} + +static void uncore_online(unsigned int cpu, + struct amd_uncore * __percpu *uncores) +{ + struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); + + kfree(uncore->free_when_cpu_online); + uncore->free_when_cpu_online = NULL; + + if (cpu == uncore->cpu) + cpumask_set_cpu(cpu, uncore->active_mask); +} + +static void amd_uncore_cpu_online(unsigned int cpu) +{ + if (amd_uncore_nb) + uncore_online(cpu, amd_uncore_nb); + + if (amd_uncore_l2) + uncore_online(cpu, amd_uncore_l2); +} + +static void uncore_down_prepare(unsigned int cpu, + struct amd_uncore * __percpu *uncores) +{ + unsigned int i; + struct amd_uncore *this = *per_cpu_ptr(uncores, cpu); + + if (this->cpu != cpu) + return; + + /* this cpu is going down, migrate to a shared sibling if possible */ + for_each_online_cpu(i) { + struct amd_uncore *that = *per_cpu_ptr(uncores, i); + + if (cpu == i) + continue; + + if (this == that) { + perf_pmu_migrate_context(this->pmu, cpu, i); + cpumask_clear_cpu(cpu, that->active_mask); + cpumask_set_cpu(i, that->active_mask); + that->cpu = i; + break; + } + } +} + +static void amd_uncore_cpu_down_prepare(unsigned int cpu) +{ + if (amd_uncore_nb) + uncore_down_prepare(cpu, amd_uncore_nb); + + if (amd_uncore_l2) + uncore_down_prepare(cpu, amd_uncore_l2); +} + +static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores) +{ + struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); + + if (cpu == uncore->cpu) + cpumask_clear_cpu(cpu, uncore->active_mask); + + if (!--uncore->refcnt) + kfree(uncore); + *per_cpu_ptr(uncores, cpu) = NULL; +} + +static void amd_uncore_cpu_dead(unsigned int cpu) +{ + if (amd_uncore_nb) + uncore_dead(cpu, amd_uncore_nb); + + if (amd_uncore_l2) + uncore_dead(cpu, amd_uncore_l2); +} + +static int +amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action, + void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + if (amd_uncore_cpu_up_prepare(cpu)) + return notifier_from_errno(-ENOMEM); + break; + + case CPU_STARTING: + amd_uncore_cpu_starting(cpu); + break; + + case CPU_ONLINE: + amd_uncore_cpu_online(cpu); + break; + + case CPU_DOWN_PREPARE: + amd_uncore_cpu_down_prepare(cpu); + break; + + case CPU_UP_CANCELED: + case CPU_DEAD: + amd_uncore_cpu_dead(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block amd_uncore_cpu_notifier_block = { + .notifier_call = amd_uncore_cpu_notifier, + .priority = CPU_PRI_PERF + 1, +}; + +static void __init init_cpu_already_online(void *dummy) +{ + unsigned int cpu = smp_processor_id(); + + amd_uncore_cpu_starting(cpu); + amd_uncore_cpu_online(cpu); +} + +static void cleanup_cpu_online(void *dummy) +{ + unsigned int cpu = smp_processor_id(); + + amd_uncore_cpu_dead(cpu); +} + +static int __init amd_uncore_init(void) +{ + unsigned int cpu, cpu2; + int ret = -ENODEV; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + goto fail_nodev; + + if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) + goto fail_nodev; + + if 
(boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { + amd_uncore_nb = alloc_percpu(struct amd_uncore *); + if (!amd_uncore_nb) { + ret = -ENOMEM; + goto fail_nb; + } + ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1); + if (ret) + goto fail_nb; + + pr_info("perf: AMD NB counters detected\n"); + ret = 0; + } + + if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) { + amd_uncore_l2 = alloc_percpu(struct amd_uncore *); + if (!amd_uncore_l2) { + ret = -ENOMEM; + goto fail_l2; + } + ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1); + if (ret) + goto fail_l2; + + pr_info("perf: AMD L2I counters detected\n"); + ret = 0; + } + + if (ret) + goto fail_nodev; + + cpu_notifier_register_begin(); + + /* init cpus already online before registering for hotplug notifier */ + for_each_online_cpu(cpu) { + ret = amd_uncore_cpu_up_prepare(cpu); + if (ret) + goto fail_online; + smp_call_function_single(cpu, init_cpu_already_online, NULL, 1); + } + + __register_cpu_notifier(&amd_uncore_cpu_notifier_block); + cpu_notifier_register_done(); + + return 0; + + +fail_online: + for_each_online_cpu(cpu2) { + if (cpu2 == cpu) + break; + smp_call_function_single(cpu, cleanup_cpu_online, NULL, 1); + } + cpu_notifier_register_done(); + + /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */ + amd_uncore_nb = amd_uncore_l2 = NULL; + + if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) + perf_pmu_unregister(&amd_l2_pmu); +fail_l2: + if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) + perf_pmu_unregister(&amd_nb_pmu); + if (amd_uncore_l2) + free_percpu(amd_uncore_l2); +fail_nb: + if (amd_uncore_nb) + free_percpu(amd_uncore_nb); + +fail_nodev: + return ret; +} +device_initcall(amd_uncore_init); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 2e15d9d..7edbeb9 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -31,7 +31,6 @@ obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS -obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_uncore.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c deleted file mode 100644 index 19a1736..0000000 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ /dev/null @@ -1,601 +0,0 @@ -/* - * Copyright (C) 2013 Advanced Micro Devices, Inc. - * - * Author: Jacob Shin - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#define NUM_COUNTERS_NB 4 -#define NUM_COUNTERS_L2 4 -#define MAX_COUNTERS NUM_COUNTERS_NB - -#define RDPMC_BASE_NB 6 -#define RDPMC_BASE_L2 10 - -#define COUNTER_SHIFT 16 - -struct amd_uncore { - int id; - int refcnt; - int cpu; - int num_counters; - int rdpmc_base; - u32 msr_base; - cpumask_t *active_mask; - struct pmu *pmu; - struct perf_event *events[MAX_COUNTERS]; - struct amd_uncore *free_when_cpu_online; -}; - -static struct amd_uncore * __percpu *amd_uncore_nb; -static struct amd_uncore * __percpu *amd_uncore_l2; - -static struct pmu amd_nb_pmu; -static struct pmu amd_l2_pmu; - -static cpumask_t amd_nb_active_mask; -static cpumask_t amd_l2_active_mask; - -static bool is_nb_event(struct perf_event *event) -{ - return event->pmu->type == amd_nb_pmu.type; -} - -static bool is_l2_event(struct perf_event *event) -{ - return event->pmu->type == amd_l2_pmu.type; -} - -static struct amd_uncore *event_to_amd_uncore(struct perf_event *event) -{ - if (is_nb_event(event) && amd_uncore_nb) - return *per_cpu_ptr(amd_uncore_nb, event->cpu); - else if (is_l2_event(event) && amd_uncore_l2) - return *per_cpu_ptr(amd_uncore_l2, event->cpu); - - return NULL; -} - -static void amd_uncore_read(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 prev, new; - s64 delta; - - /* - * since we do not enable counter overflow interrupts, - * we do not have to worry about prev_count changing on us - */ - - prev = local64_read(&hwc->prev_count); - rdpmcl(hwc->event_base_rdpmc, new); - local64_set(&hwc->prev_count, new); - delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT); - delta >>= COUNTER_SHIFT; - local64_add(delta, &event->count); -} - -static void amd_uncore_start(struct perf_event *event, int flags) -{ - struct hw_perf_event *hwc = &event->hw; - - if (flags & PERF_EF_RELOAD) - wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count)); - - hwc->state = 0; - wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE)); - perf_event_update_userpage(event); -} - -static void amd_uncore_stop(struct perf_event *event, int flags) -{ - struct hw_perf_event *hwc = &event->hw; - - wrmsrl(hwc->config_base, hwc->config); - hwc->state |= PERF_HES_STOPPED; - - if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - amd_uncore_read(event); - hwc->state |= PERF_HES_UPTODATE; - } -} - -static int amd_uncore_add(struct perf_event *event, int flags) -{ - int i; - struct amd_uncore *uncore = event_to_amd_uncore(event); - struct hw_perf_event *hwc = &event->hw; - - /* are we already assigned? 
*/ - if (hwc->idx != -1 && uncore->events[hwc->idx] == event) - goto out; - - for (i = 0; i < uncore->num_counters; i++) { - if (uncore->events[i] == event) { - hwc->idx = i; - goto out; - } - } - - /* if not, take the first available counter */ - hwc->idx = -1; - for (i = 0; i < uncore->num_counters; i++) { - if (cmpxchg(&uncore->events[i], NULL, event) == NULL) { - hwc->idx = i; - break; - } - } - -out: - if (hwc->idx == -1) - return -EBUSY; - - hwc->config_base = uncore->msr_base + (2 * hwc->idx); - hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx); - hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx; - hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - - if (flags & PERF_EF_START) - amd_uncore_start(event, PERF_EF_RELOAD); - - return 0; -} - -static void amd_uncore_del(struct perf_event *event, int flags) -{ - int i; - struct amd_uncore *uncore = event_to_amd_uncore(event); - struct hw_perf_event *hwc = &event->hw; - - amd_uncore_stop(event, PERF_EF_UPDATE); - - for (i = 0; i < uncore->num_counters; i++) { - if (cmpxchg(&uncore->events[i], event, NULL) == event) - break; - } - - hwc->idx = -1; -} - -static int amd_uncore_event_init(struct perf_event *event) -{ - struct amd_uncore *uncore; - struct hw_perf_event *hwc = &event->hw; - - if (event->attr.type != event->pmu->type) - return -ENOENT; - - /* - * NB and L2 counters (MSRs) are shared across all cores that share the - * same NB / L2 cache. Interrupts can be directed to a single target - * core, however, event counts generated by processes running on other - * cores cannot be masked out. So we do not support sampling and - * per-thread events. - */ - if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) - return -EINVAL; - - /* NB and L2 counters do not have usr/os/guest/host bits */ - if (event->attr.exclude_user || event->attr.exclude_kernel || - event->attr.exclude_host || event->attr.exclude_guest) - return -EINVAL; - - /* and we do not enable counter overflow interrupts */ - hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB; - hwc->idx = -1; - - if (event->cpu < 0) - return -EINVAL; - - uncore = event_to_amd_uncore(event); - if (!uncore) - return -ENODEV; - - /* - * since request can come in to any of the shared cores, we will remap - * to a single common cpu. 
- */ - event->cpu = uncore->cpu; - - return 0; -} - -static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - cpumask_t *active_mask; - struct pmu *pmu = dev_get_drvdata(dev); - - if (pmu->type == amd_nb_pmu.type) - active_mask = &amd_nb_active_mask; - else if (pmu->type == amd_l2_pmu.type) - active_mask = &amd_l2_active_mask; - else - return 0; - - return cpumap_print_to_pagebuf(true, buf, active_mask); -} -static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL); - -static struct attribute *amd_uncore_attrs[] = { - &dev_attr_cpumask.attr, - NULL, -}; - -static struct attribute_group amd_uncore_attr_group = { - .attrs = amd_uncore_attrs, -}; - -PMU_FORMAT_ATTR(event, "config:0-7,32-35"); -PMU_FORMAT_ATTR(umask, "config:8-15"); - -static struct attribute *amd_uncore_format_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - NULL, -}; - -static struct attribute_group amd_uncore_format_group = { - .name = "format", - .attrs = amd_uncore_format_attr, -}; - -static const struct attribute_group *amd_uncore_attr_groups[] = { - &amd_uncore_attr_group, - &amd_uncore_format_group, - NULL, -}; - -static struct pmu amd_nb_pmu = { - .attr_groups = amd_uncore_attr_groups, - .name = "amd_nb", - .event_init = amd_uncore_event_init, - .add = amd_uncore_add, - .del = amd_uncore_del, - .start = amd_uncore_start, - .stop = amd_uncore_stop, - .read = amd_uncore_read, -}; - -static struct pmu amd_l2_pmu = { - .attr_groups = amd_uncore_attr_groups, - .name = "amd_l2", - .event_init = amd_uncore_event_init, - .add = amd_uncore_add, - .del = amd_uncore_del, - .start = amd_uncore_start, - .stop = amd_uncore_stop, - .read = amd_uncore_read, -}; - -static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) -{ - return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL, - cpu_to_node(cpu)); -} - -static int amd_uncore_cpu_up_prepare(unsigned int cpu) -{ - struct amd_uncore *uncore_nb = NULL, *uncore_l2; - - if (amd_uncore_nb) { - uncore_nb = amd_uncore_alloc(cpu); - if (!uncore_nb) - goto fail; - uncore_nb->cpu = cpu; - uncore_nb->num_counters = NUM_COUNTERS_NB; - uncore_nb->rdpmc_base = RDPMC_BASE_NB; - uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; - uncore_nb->active_mask = &amd_nb_active_mask; - uncore_nb->pmu = &amd_nb_pmu; - *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; - } - - if (amd_uncore_l2) { - uncore_l2 = amd_uncore_alloc(cpu); - if (!uncore_l2) - goto fail; - uncore_l2->cpu = cpu; - uncore_l2->num_counters = NUM_COUNTERS_L2; - uncore_l2->rdpmc_base = RDPMC_BASE_L2; - uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL; - uncore_l2->active_mask = &amd_l2_active_mask; - uncore_l2->pmu = &amd_l2_pmu; - *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2; - } - - return 0; - -fail: - kfree(uncore_nb); - return -ENOMEM; -} - -static struct amd_uncore * -amd_uncore_find_online_sibling(struct amd_uncore *this, - struct amd_uncore * __percpu *uncores) -{ - unsigned int cpu; - struct amd_uncore *that; - - for_each_online_cpu(cpu) { - that = *per_cpu_ptr(uncores, cpu); - - if (!that) - continue; - - if (this == that) - continue; - - if (this->id == that->id) { - that->free_when_cpu_online = this; - this = that; - break; - } - } - - this->refcnt++; - return this; -} - -static void amd_uncore_cpu_starting(unsigned int cpu) -{ - unsigned int eax, ebx, ecx, edx; - struct amd_uncore *uncore; - - if (amd_uncore_nb) { - uncore = *per_cpu_ptr(amd_uncore_nb, cpu); - cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); - uncore->id = ecx & 0xff; - - uncore = 
amd_uncore_find_online_sibling(uncore, amd_uncore_nb); - *per_cpu_ptr(amd_uncore_nb, cpu) = uncore; - } - - if (amd_uncore_l2) { - unsigned int apicid = cpu_data(cpu).apicid; - unsigned int nshared; - - uncore = *per_cpu_ptr(amd_uncore_l2, cpu); - cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx); - nshared = ((eax >> 14) & 0xfff) + 1; - uncore->id = apicid - (apicid % nshared); - - uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2); - *per_cpu_ptr(amd_uncore_l2, cpu) = uncore; - } -} - -static void uncore_online(unsigned int cpu, - struct amd_uncore * __percpu *uncores) -{ - struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); - - kfree(uncore->free_when_cpu_online); - uncore->free_when_cpu_online = NULL; - - if (cpu == uncore->cpu) - cpumask_set_cpu(cpu, uncore->active_mask); -} - -static void amd_uncore_cpu_online(unsigned int cpu) -{ - if (amd_uncore_nb) - uncore_online(cpu, amd_uncore_nb); - - if (amd_uncore_l2) - uncore_online(cpu, amd_uncore_l2); -} - -static void uncore_down_prepare(unsigned int cpu, - struct amd_uncore * __percpu *uncores) -{ - unsigned int i; - struct amd_uncore *this = *per_cpu_ptr(uncores, cpu); - - if (this->cpu != cpu) - return; - - /* this cpu is going down, migrate to a shared sibling if possible */ - for_each_online_cpu(i) { - struct amd_uncore *that = *per_cpu_ptr(uncores, i); - - if (cpu == i) - continue; - - if (this == that) { - perf_pmu_migrate_context(this->pmu, cpu, i); - cpumask_clear_cpu(cpu, that->active_mask); - cpumask_set_cpu(i, that->active_mask); - that->cpu = i; - break; - } - } -} - -static void amd_uncore_cpu_down_prepare(unsigned int cpu) -{ - if (amd_uncore_nb) - uncore_down_prepare(cpu, amd_uncore_nb); - - if (amd_uncore_l2) - uncore_down_prepare(cpu, amd_uncore_l2); -} - -static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores) -{ - struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); - - if (cpu == uncore->cpu) - cpumask_clear_cpu(cpu, uncore->active_mask); - - if (!--uncore->refcnt) - kfree(uncore); - *per_cpu_ptr(uncores, cpu) = NULL; -} - -static void amd_uncore_cpu_dead(unsigned int cpu) -{ - if (amd_uncore_nb) - uncore_dead(cpu, amd_uncore_nb); - - if (amd_uncore_l2) - uncore_dead(cpu, amd_uncore_l2); -} - -static int -amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action, - void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - if (amd_uncore_cpu_up_prepare(cpu)) - return notifier_from_errno(-ENOMEM); - break; - - case CPU_STARTING: - amd_uncore_cpu_starting(cpu); - break; - - case CPU_ONLINE: - amd_uncore_cpu_online(cpu); - break; - - case CPU_DOWN_PREPARE: - amd_uncore_cpu_down_prepare(cpu); - break; - - case CPU_UP_CANCELED: - case CPU_DEAD: - amd_uncore_cpu_dead(cpu); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block amd_uncore_cpu_notifier_block = { - .notifier_call = amd_uncore_cpu_notifier, - .priority = CPU_PRI_PERF + 1, -}; - -static void __init init_cpu_already_online(void *dummy) -{ - unsigned int cpu = smp_processor_id(); - - amd_uncore_cpu_starting(cpu); - amd_uncore_cpu_online(cpu); -} - -static void cleanup_cpu_online(void *dummy) -{ - unsigned int cpu = smp_processor_id(); - - amd_uncore_cpu_dead(cpu); -} - -static int __init amd_uncore_init(void) -{ - unsigned int cpu, cpu2; - int ret = -ENODEV; - - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) - goto fail_nodev; - - if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) - goto fail_nodev; - - if 
(boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { - amd_uncore_nb = alloc_percpu(struct amd_uncore *); - if (!amd_uncore_nb) { - ret = -ENOMEM; - goto fail_nb; - } - ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1); - if (ret) - goto fail_nb; - - pr_info("perf: AMD NB counters detected\n"); - ret = 0; - } - - if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) { - amd_uncore_l2 = alloc_percpu(struct amd_uncore *); - if (!amd_uncore_l2) { - ret = -ENOMEM; - goto fail_l2; - } - ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1); - if (ret) - goto fail_l2; - - pr_info("perf: AMD L2I counters detected\n"); - ret = 0; - } - - if (ret) - goto fail_nodev; - - cpu_notifier_register_begin(); - - /* init cpus already online before registering for hotplug notifier */ - for_each_online_cpu(cpu) { - ret = amd_uncore_cpu_up_prepare(cpu); - if (ret) - goto fail_online; - smp_call_function_single(cpu, init_cpu_already_online, NULL, 1); - } - - __register_cpu_notifier(&amd_uncore_cpu_notifier_block); - cpu_notifier_register_done(); - - return 0; - - -fail_online: - for_each_online_cpu(cpu2) { - if (cpu2 == cpu) - break; - smp_call_function_single(cpu, cleanup_cpu_online, NULL, 1); - } - cpu_notifier_register_done(); - - /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */ - amd_uncore_nb = amd_uncore_l2 = NULL; - - if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) - perf_pmu_unregister(&amd_l2_pmu); -fail_l2: - if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) - perf_pmu_unregister(&amd_nb_pmu); - if (amd_uncore_l2) - free_percpu(amd_uncore_l2); -fail_nb: - if (amd_uncore_nb) - free_percpu(amd_uncore_nb); - -fail_nodev: - return ret; -} -device_initcall(amd_uncore_init); -- cgit v0.10.2 From 975db45e9cc561bf8a7eddfa0705d3a078ec184f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Feb 2016 15:36:52 +0100 Subject: locking/static_keys: Avoid nested functions clang does not support nested functions inside of an array definition: lib/test_static_keys.c:105:16: error: function definition is not allowed here .test_key = test_key_func(&old_true_key, static_key_true), lib/test_static_keys.c:50:20: note: expanded from macro 'test_key_func' ({bool func(void) { return branch(key); } func; }) That code appears to be a little too clever, so this simplifies it a bit by defining functions outside of the array. 
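For illustration only, here is a standalone userspace sketch of the old vs. new macro shape; key_enabled() and the key/array names are toy stand-ins for the static-key helpers and are not part of this patch:

  #include <stdbool.h>
  #include <stdio.h>

  static bool old_true_key = true;

  /* Toy stand-in for static_key_true()/static_branch_likely(). */
  #define key_enabled(key) (*(key))

  /*
   * Old pattern (GCC only): a statement expression defining a nested
   * function, used directly inside the array initializer:
   *
   *   #define test_key_func(key, branch) \
   *       ({ bool func(void) { return branch(key); } func; })
   *
   * New pattern: emit an ordinary file-scope function per (key, branch)
   * pair via token pasting, and take its address in the initializer.
   */
  #define test_key_func(key, branch)        \
  static bool key ## _ ## branch(void)      \
  {                                         \
          return branch(&key);              \
  }

  test_key_func(old_true_key, key_enabled)

  struct test_key {
          bool init_state;
          bool (*test_key)(void);
  };

  static struct test_key keys[] = {
          { .init_state = true, .test_key = &old_true_key_key_enabled },
  };

  int main(void)
  {
          printf("match: %d\n", keys[0].test_key() == keys[0].init_state);
          return 0;
  }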
Signed-off-by: Arnd Bergmann Acked-by: Jason Baron Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1454942223-2781480-1-git-send-email-arnd@arndb.de Signed-off-by: Ingo Molnar diff --git a/lib/test_static_keys.c b/lib/test_static_keys.c index c61b299..915d75d 100644 --- a/lib/test_static_keys.c +++ b/lib/test_static_keys.c @@ -46,8 +46,11 @@ struct test_key { bool (*test_key)(void); }; -#define test_key_func(key, branch) \ - ({bool func(void) { return branch(key); } func; }) +#define test_key_func(key, branch) \ +static bool key ## _ ## branch(void) \ +{ \ + return branch(&key); \ +} static void invert_key(struct static_key *key) { @@ -92,6 +95,25 @@ static int verify_keys(struct test_key *keys, int size, bool invert) return 0; } +test_key_func(old_true_key, static_key_true) +test_key_func(old_false_key, static_key_false) +test_key_func(true_key, static_branch_likely) +test_key_func(true_key, static_branch_unlikely) +test_key_func(false_key, static_branch_likely) +test_key_func(false_key, static_branch_unlikely) +test_key_func(base_old_true_key, static_key_true) +test_key_func(base_inv_old_true_key, static_key_true) +test_key_func(base_old_false_key, static_key_false) +test_key_func(base_inv_old_false_key, static_key_false) +test_key_func(base_true_key, static_branch_likely) +test_key_func(base_true_key, static_branch_unlikely) +test_key_func(base_inv_true_key, static_branch_likely) +test_key_func(base_inv_true_key, static_branch_unlikely) +test_key_func(base_false_key, static_branch_likely) +test_key_func(base_false_key, static_branch_unlikely) +test_key_func(base_inv_false_key, static_branch_likely) +test_key_func(base_inv_false_key, static_branch_unlikely) + static int __init test_static_key_init(void) { int ret; @@ -102,95 +124,95 @@ static int __init test_static_key_init(void) { .init_state = true, .key = &old_true_key, - .test_key = test_key_func(&old_true_key, static_key_true), + .test_key = &old_true_key_static_key_true, }, { .init_state = false, .key = &old_false_key, - .test_key = test_key_func(&old_false_key, static_key_false), + .test_key = &old_false_key_static_key_false, }, /* internal keys - new keys */ { .init_state = true, .key = &true_key.key, - .test_key = test_key_func(&true_key, static_branch_likely), + .test_key = &true_key_static_branch_likely, }, { .init_state = true, .key = &true_key.key, - .test_key = test_key_func(&true_key, static_branch_unlikely), + .test_key = &true_key_static_branch_unlikely, }, { .init_state = false, .key = &false_key.key, - .test_key = test_key_func(&false_key, static_branch_likely), + .test_key = &false_key_static_branch_likely, }, { .init_state = false, .key = &false_key.key, - .test_key = test_key_func(&false_key, static_branch_unlikely), + .test_key = &false_key_static_branch_unlikely, }, /* external keys - old keys */ { .init_state = true, .key = &base_old_true_key, - .test_key = test_key_func(&base_old_true_key, static_key_true), + .test_key = &base_old_true_key_static_key_true, }, { .init_state = false, .key = &base_inv_old_true_key, - .test_key = test_key_func(&base_inv_old_true_key, static_key_true), + .test_key = &base_inv_old_true_key_static_key_true, }, { .init_state = false, .key = &base_old_false_key, - .test_key = test_key_func(&base_old_false_key, static_key_false), + .test_key = &base_old_false_key_static_key_false, }, { .init_state = true, .key = &base_inv_old_false_key, - .test_key = test_key_func(&base_inv_old_false_key, static_key_false), 
+ .test_key = &base_inv_old_false_key_static_key_false, }, /* external keys - new keys */ { .init_state = true, .key = &base_true_key.key, - .test_key = test_key_func(&base_true_key, static_branch_likely), + .test_key = &base_true_key_static_branch_likely, }, { .init_state = true, .key = &base_true_key.key, - .test_key = test_key_func(&base_true_key, static_branch_unlikely), + .test_key = &base_true_key_static_branch_unlikely, }, { .init_state = false, .key = &base_inv_true_key.key, - .test_key = test_key_func(&base_inv_true_key, static_branch_likely), + .test_key = &base_inv_true_key_static_branch_likely, }, { .init_state = false, .key = &base_inv_true_key.key, - .test_key = test_key_func(&base_inv_true_key, static_branch_unlikely), + .test_key = &base_inv_true_key_static_branch_unlikely, }, { .init_state = false, .key = &base_false_key.key, - .test_key = test_key_func(&base_false_key, static_branch_likely), + .test_key = &base_false_key_static_branch_likely, }, { .init_state = false, .key = &base_false_key.key, - .test_key = test_key_func(&base_false_key, static_branch_unlikely), + .test_key = &base_false_key_static_branch_unlikely, }, { .init_state = true, .key = &base_inv_false_key.key, - .test_key = test_key_func(&base_inv_false_key, static_branch_likely), + .test_key = &base_inv_false_key_static_branch_likely, }, { .init_state = true, .key = &base_inv_false_key.key, - .test_key = test_key_func(&base_inv_false_key, static_branch_unlikely), + .test_key = &base_inv_false_key_static_branch_unlikely, }, }; -- cgit v0.10.2 From 8dd5032d9c540111dd673078738d137a998d6c3f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 7 Feb 2016 22:51:27 +0100 Subject: x86/asm/bitops: Force inlining of test_and_set_bit and friends Sometimes GCC mysteriously doesn't inline very small functions we expect to be inlined, see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122 Arguably, GCC should do better, but GCC people aren't willing to invest time into it and are asking to use __always_inline instead. With this .config: http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os here's an example of functions getting deinlined many times: test_and_set_bit (166 copies, ~1260 calls) 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 48 0f ab 3e lock bts %rdi,(%rsi) 72 04 jb 31 c0 xor %eax,%eax eb 05 jmp b8 01 00 00 00 mov $0x1,%eax 5d pop %rbp c3 retq test_and_clear_bit (124 copies, ~1000 calls) 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 48 0f b3 3e lock btr %rdi,(%rsi) 72 04 jb 31 c0 xor %eax,%eax eb 05 jmp b8 01 00 00 00 mov $0x1,%eax 5d pop %rbp c3 retq change_bit (3 copies, 8 calls) 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 48 0f bb 3e lock btc %rdi,(%rsi) 5d pop %rbp c3 retq clear_bit_unlock (2 copies, 11 calls) 55 push %rbp 48 89 e5 mov %rsp,%rbp f0 48 0f b3 3e lock btr %rdi,(%rsi) 5d pop %rbp c3 retq This patch works it around via s/inline/__always_inline/. Code size decrease by ~13.5k after the patch: text data bss dec filename 92110727 20826144 36417536 149354407 vmlinux.before 92097234 20826176 36417536 149340946 vmlinux.after Signed-off-by: Denys Vlasenko Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: David Rientjes Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thomas Graf Link: http://lkml.kernel.org/r/1454881887-1367-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index cfe3b95..7766d1c 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -91,7 +91,7 @@ set_bit(long nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __set_bit(long nr, volatile unsigned long *addr) +static __always_inline void __set_bit(long nr, volatile unsigned long *addr) { asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); } @@ -128,13 +128,13 @@ clear_bit(long nr, volatile unsigned long *addr) * clear_bit() is atomic and implies release semantics before the memory * operation. It can be used for an unlock. */ -static inline void clear_bit_unlock(long nr, volatile unsigned long *addr) +static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); clear_bit(nr, addr); } -static inline void __clear_bit(long nr, volatile unsigned long *addr) +static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) { asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); } @@ -151,7 +151,7 @@ static inline void __clear_bit(long nr, volatile unsigned long *addr) * No memory barrier is required here, because x86 cannot reorder stores past * older loads. Same principle as spin_unlock. */ -static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) +static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); __clear_bit(nr, addr); @@ -166,7 +166,7 @@ static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) * If it's called on the same region of memory simultaneously, the effect * may be that only one operation succeeds. */ -static inline void __change_bit(long nr, volatile unsigned long *addr) +static __always_inline void __change_bit(long nr, volatile unsigned long *addr) { asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); } @@ -180,7 +180,7 @@ static inline void __change_bit(long nr, volatile unsigned long *addr) * Note that @nr may be almost arbitrarily large; this function is not * restricted to acting on a single-word quantity. */ -static inline void change_bit(long nr, volatile unsigned long *addr) +static __always_inline void change_bit(long nr, volatile unsigned long *addr) { if (IS_IMMEDIATE(nr)) { asm volatile(LOCK_PREFIX "xorb %1,%0" @@ -201,7 +201,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_set_bit(long nr, volatile unsigned long *addr) +static __always_inline int test_and_set_bit(long nr, volatile unsigned long *addr) { GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c"); } @@ -228,7 +228,7 @@ test_and_set_bit_lock(long nr, volatile unsigned long *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. 
*/ -static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) +static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -247,7 +247,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) +static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *addr) { GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c"); } @@ -268,7 +268,7 @@ static inline int test_and_clear_bit(long nr, volatile unsigned long *addr) * accessed from a hypervisor on the same CPU if running in a VM: don't change * this without also updating arch/x86/kernel/kvm.c */ -static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) +static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -280,7 +280,7 @@ static inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) } /* WARNING: non atomic and it can be reordered! */ -static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) +static __always_inline int __test_and_change_bit(long nr, volatile unsigned long *addr) { int oldbit; @@ -300,7 +300,7 @@ static inline int __test_and_change_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static inline int test_and_change_bit(long nr, volatile unsigned long *addr) +static __always_inline int test_and_change_bit(long nr, volatile unsigned long *addr) { GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c"); } @@ -311,7 +311,7 @@ static __always_inline int constant_test_bit(long nr, const volatile unsigned lo (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } -static inline int variable_test_bit(long nr, volatile const unsigned long *addr) +static __always_inline int variable_test_bit(long nr, volatile const unsigned long *addr) { int oldbit; @@ -343,7 +343,7 @@ static int test_bit(int nr, const volatile unsigned long *addr); * * Undefined if no bit exists, so code should check against 0 first. */ -static inline unsigned long __ffs(unsigned long word) +static __always_inline unsigned long __ffs(unsigned long word) { asm("rep; bsf %1,%0" : "=r" (word) @@ -357,7 +357,7 @@ static inline unsigned long __ffs(unsigned long word) * * Undefined if no zero exists, so code should check against ~0UL first. */ -static inline unsigned long ffz(unsigned long word) +static __always_inline unsigned long ffz(unsigned long word) { asm("rep; bsf %1,%0" : "=r" (word) @@ -371,7 +371,7 @@ static inline unsigned long ffz(unsigned long word) * * Undefined if no set bit exists, so code should check against 0 first. */ -static inline unsigned long __fls(unsigned long word) +static __always_inline unsigned long __fls(unsigned long word) { asm("bsr %1,%0" : "=r" (word) @@ -393,7 +393,7 @@ static inline unsigned long __fls(unsigned long word) * set bit if value is nonzero. The first (least significant) bit * is at position 1. */ -static inline int ffs(int x) +static __always_inline int ffs(int x) { int r; @@ -434,7 +434,7 @@ static inline int ffs(int x) * set bit if value is nonzero. The last (most significant) bit is * at position 32. 
*/ -static inline int fls(int x) +static __always_inline int fls(int x) { int r; -- cgit v0.10.2 From a7636d9ecfa3ab7800a7c04c1f89378229eff609 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 3 Feb 2016 12:28:28 -0800 Subject: kprobes: Optimize hot path by using percpu counter to collect 'nhit' statistics When doing ebpf+kprobe on some hot TCP functions (e.g. tcp_rcv_established), the kprobe_dispatcher() function shows up in 'perf report'. In kprobe_dispatcher(), there is a lot of cache bouncing on 'tk->nhit++'. 'tk->nhit' and 'tk->tp.flags' also share the same cacheline. perf report (cycles:pp): 8.30% ipv4_dst_check 4.74% copy_user_enhanced_fast_string 3.93% dst_release 2.80% tcp_v4_rcv 2.31% queued_spin_lock_slowpath 2.30% _raw_spin_lock 1.88% mlx4_en_process_rx_cq 1.84% eth_get_headlen 1.81% ip_rcv_finish ~~~~ 1.71% kprobe_dispatcher ~~~~ 1.55% mlx4_en_xmit 1.09% __probe_kernel_read perf report after patch: 9.15% ipv4_dst_check 5.00% copy_user_enhanced_fast_string 4.12% dst_release 2.96% tcp_v4_rcv 2.50% _raw_spin_lock 2.39% queued_spin_lock_slowpath 2.11% eth_get_headlen 2.03% mlx4_en_process_rx_cq 1.69% mlx4_en_xmit 1.19% ip_rcv_finish 1.12% __probe_kernel_read 1.02% ehci_hcd_cleanup Signed-off-by: Martin KaFai Lau Acked-by: Masami Hiramatsu Acked-by: Steven Rostedt Cc: Alexei Starovoitov Cc: Josef Bacik Cc: Kernel Team Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454531308-2441898-1-git-send-email-kafai@fb.com Signed-off-by: Ingo Molnar diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c995644..21b81a4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -30,7 +30,7 @@ struct trace_kprobe { struct list_head list; struct kretprobe rp; /* Use rp.kp for kprobe use */ - unsigned long nhit; + unsigned long __percpu *nhit; const char *symbol; /* symbol name */ struct trace_probe tp; }; @@ -274,6 +274,10 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, if (!tk) return ERR_PTR(ret); + tk->nhit = alloc_percpu(unsigned long); + if (!tk->nhit) + goto error; + if (symbol) { tk->symbol = kstrdup(symbol, GFP_KERNEL); if (!tk->symbol) @@ -313,6 +317,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, error: kfree(tk->tp.call.name); kfree(tk->symbol); + free_percpu(tk->nhit); kfree(tk); return ERR_PTR(ret); } @@ -327,6 +332,7 @@ static void free_trace_kprobe(struct trace_kprobe *tk) kfree(tk->tp.call.class->system); kfree(tk->tp.call.name); kfree(tk->symbol); + free_percpu(tk->nhit); kfree(tk); } @@ -874,9 +880,14 @@ static const struct file_operations kprobe_events_ops = { static int probes_profile_seq_show(struct seq_file *m, void *v) { struct trace_kprobe *tk = v; + unsigned long nhit = 0; + int cpu; + + for_each_possible_cpu(cpu) + nhit += *per_cpu_ptr(tk->nhit, cpu); seq_printf(m, " %-44s %15lu %15lu\n", - trace_event_name(&tk->tp.call), tk->nhit, + trace_event_name(&tk->tp.call), nhit, tk->rp.kp.nmissed); return 0; @@ -1225,7 +1236,7 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); - tk->nhit++; + raw_cpu_inc(*tk->nhit); if (tk->tp.flags & TP_FLAG_TRACE) kprobe_trace_func(tk, regs); @@ -1242,7 +1253,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); - tk->nhit++; + raw_cpu_inc(*tk->nhit); if (tk->tp.flags & TP_FLAG_TRACE) kretprobe_trace_func(tk, ri, 
regs); -- cgit v0.10.2 From 5f9c01aa7c49a2d74474d6d879a797b8badf29e6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:29 +0100 Subject: x86/microcode: Untangle from BLK_DEV_INITRD Thomas Voegtle reported that doing oldconfig with a .config which has CONFIG_MICROCODE enabled but BLK_DEV_INITRD disabled prevents the microcode loading mechanism from being built. So untangle it from the BLK_DEV_INITRD dependency so that oldconfig doesn't turn it off and add an explanatory text to its Kconfig help what the supported methods for supplying microcode are. Reported-by: Thomas Voegtle Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: # 4.4 Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9af2e63..405b185 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1159,22 +1159,23 @@ config MICROCODE bool "CPU microcode loading support" default y depends on CPU_SUP_AMD || CPU_SUP_INTEL - depends on BLK_DEV_INITRD select FW_LOADER ---help--- - If you say Y here, you will be able to update the microcode on - certain Intel and AMD processors. The Intel support is for the - IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, - Xeon etc. The AMD support is for families 0x10 and later. You will - obviously need the actual microcode binary data itself which is not - shipped with the Linux kernel. - - This option selects the general module only, you need to select - at least one vendor specific module as well. - - To compile this driver as a module, choose M here: the module - will be called microcode. + Intel and AMD processors. The Intel support is for the IA32 family, + e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, Xeon etc. The + AMD support is for families 0x10 and later. You will obviously need + the actual microcode binary data itself which is not shipped with + the Linux kernel. + + The preferred method to load microcode from a detached initrd is described + in Documentation/x86/early-microcode.txt. For that you need to enable + CONFIG_BLK_DEV_INITRD in order for the loader to be able to scan the + initrd for microcode blobs. + + In addition, you can build-in the microcode into the kernel. For that you + need to enable FIRMWARE_IN_KERNEL and add the vendor-supplied microcode + to the CONFIG_EXTRA_FIRMWARE config option. 
config MICROCODE_INTEL bool "Intel microcode loading support" diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 1e1b07a..9d3a96c 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -3,6 +3,7 @@ #include #include +#include #define native_rdmsr(msr, val1, val2) \ do { \ @@ -143,4 +144,29 @@ static inline void reload_early_microcode(void) { } static inline bool get_builtin_firmware(struct cpio_data *cd, const char *name) { return false; } #endif + +static inline unsigned long get_initrd_start(void) +{ +#ifdef CONFIG_BLK_DEV_INITRD + return initrd_start; +#else + return 0; +#endif +} + +static inline unsigned long get_initrd_start_addr(void) +{ +#ifdef CONFIG_BLK_DEV_INITRD +#ifdef CONFIG_X86_32 + unsigned long *initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); + + return (unsigned long)__pa_nodebug(*initrd_start_p); +#else + return get_initrd_start(); +#endif +#else /* CONFIG_BLK_DEV_INITRD */ + return 0; +#endif +} + #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index ee81c54..044bbbbc 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -690,7 +690,7 @@ int __init save_microcode_in_initrd_intel(void) if (count == 0) return ret; - copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count); + copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, get_initrd_start(), count); ret = save_microcode(&mc_saved_data, mc_saved, count); if (ret) pr_err("Cannot save microcode patches from initrd.\n"); @@ -748,20 +748,14 @@ void load_ucode_intel_ap(void) struct mc_saved_data *mc_saved_data_p; struct ucode_cpu_info uci; unsigned long *mc_saved_in_initrd_p; - unsigned long initrd_start_addr; enum ucode_state ret; #ifdef CONFIG_X86_32 - unsigned long *initrd_start_p; - mc_saved_in_initrd_p = - (unsigned long *)__pa_nodebug(mc_saved_in_initrd); + mc_saved_in_initrd_p = (unsigned long *)__pa_nodebug(mc_saved_in_initrd); mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); - initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); - initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); #else - mc_saved_data_p = &mc_saved_data; mc_saved_in_initrd_p = mc_saved_in_initrd; - initrd_start_addr = initrd_start; + mc_saved_data_p = &mc_saved_data; #endif /* @@ -773,7 +767,7 @@ void load_ucode_intel_ap(void) collect_cpu_info_early(&uci); ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, - initrd_start_addr, &uci); + get_initrd_start_addr(), &uci); if (ret != UCODE_OK) return; -- cgit v0.10.2 From 264285ac01673e70557c43ecee338ce97c4c0672 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:30 +0100 Subject: x86/microcode/intel: Make early loader look for builtin microcode too Set the initrd @start depending on the presence of an initrd. Otherwise, builtin microcode loading doesn't work as the start is wrong and we're using it to compute offset to the microcode blobs. 
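To make the intent concrete, a simplified sketch of the selection logic; pick_scan_window() is a hypothetical helper used only for illustration, the real decision is made inline in load_ucode_intel_bsp() in the hunk below, and the 32-bit path additionally operates on physical addresses:

  unsigned long pick_scan_window(unsigned long ramdisk_image,
                                 unsigned long ramdisk_size,
                                 unsigned long page_offset)
  {
          /*
           * No initrd: hand the scanner start == 0 (and size == 0) so it
           * skips the cpio search and falls back to the microcode blobs
           * built into the kernel image.
           */
          if (!ramdisk_size)
                  return 0;

          /* Initrd present: scan it at its mapped address. */
          return ramdisk_image + page_offset;
  }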
Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: # 4.4 Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 044bbbbc..4f4735b 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -551,10 +551,14 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, cd.data = NULL; cd.size = 0; - cd = find_cpio_data(p, (void *)start, size, &offset); - if (!cd.data) { + /* try built-in microcode if no initrd */ + if (!size) { if (!load_builtin_intel_microcode(&cd)) return UCODE_ERROR; + } else { + cd = find_cpio_data(p, (void *)start, size, &offset); + if (!cd.data) + return UCODE_ERROR; } return get_matching_model_microcode(0, start, cd.data, cd.size, @@ -728,16 +732,20 @@ void __init load_ucode_intel_bsp(void) struct boot_params *p; p = (struct boot_params *)__pa_nodebug(&boot_params); - start = p->hdr.ramdisk_image; size = p->hdr.ramdisk_size; - _load_ucode_intel_bsp( - (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), - (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), - start, size); + /* + * Set start only if we have an initrd image. We cannot use initrd_start + * because it is not set that early yet. + */ + start = (size ? p->hdr.ramdisk_image : 0); + + _load_ucode_intel_bsp((struct mc_saved_data *)__pa_nodebug(&mc_saved_data), + (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), + start, size); #else - start = boot_params.hdr.ramdisk_image + PAGE_OFFSET; size = boot_params.hdr.ramdisk_size; + start = (size ? boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0); _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); #endif -- cgit v0.10.2 From e8c8165ecfb1cfd6650777c193361d33b0f7f59e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:31 +0100 Subject: x86/microcode: Remove redundant __setup() param parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We do parse for the disable microcode loader chicken bit very early. After the driver merge, the __setup() param parsing method is not needed anymore so get rid of it. In addition, fix a compiler warning from an old SLES11 gcc (4.3.4) reported by Jan Beulich : arch/x86/kernel/cpu/microcode/core.c: In function ‘load_ucode_bsp’: arch/x86/kernel/cpu/microcode/core.c:96: warning: array subscript is above array bounds Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index faec712..bca4e48 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -43,16 +43,8 @@ #define MICROCODE_VERSION "2.01" static struct microcode_ops *microcode_ops; - static bool dis_ucode_ldr; -static int __init disable_loader(char *str) -{ - dis_ucode_ldr = true; - return 1; -} -__setup("dis_ucode_ldr", disable_loader); - /* * Synchronization. 
* @@ -81,15 +73,16 @@ struct cpu_info_ctx { static bool __init check_loader_disabled_bsp(void) { + static const char *__dis_opt_str = "dis_ucode_ldr"; + #ifdef CONFIG_X86_32 const char *cmdline = (const char *)__pa_nodebug(boot_command_line); - const char *opt = "dis_ucode_ldr"; - const char *option = (const char *)__pa_nodebug(opt); + const char *option = (const char *)__pa_nodebug(__dis_opt_str); bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); #else /* CONFIG_X86_64 */ const char *cmdline = boot_command_line; - const char *option = "dis_ucode_ldr"; + const char *option = __dis_opt_str; bool *res = &dis_ucode_ldr; #endif -- cgit v0.10.2 From 43858f57bc02388a6420321c021397591199cf91 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 3 Feb 2016 12:33:32 +0100 Subject: x86/microcode: Remove an unneeded NULL check "uci" is an element of the ucode_cpu_info[] array, it can't be NULL. Tested-by: Thomas Voegtle Signed-off-by: Dan Carpenter Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-janitors@vger.kernel.org Link: http://lkml.kernel.org/r/1454499225-21544-5-git-send-email-bp@alien8.de Link: http://lkml.kernel.org/r/20140120103046.GC14233@elgon.mountain Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index bca4e48..cea8552 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -472,7 +472,7 @@ static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) enum ucode_state ustate; struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - if (uci && uci->valid) + if (uci->valid) return UCODE_OK; if (collect_cpu_info(cpu)) -- cgit v0.10.2 From b7f500aedd4551a9bf29c617804c13f0ff18c879 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:33 +0100 Subject: x86/microcode: Issue update message only once This is especially annoying on large boxes: x86: Booting SMP configuration: .... node #0, CPUs: #1 microcode: CPU1 microcode updated early to revision 0x428, date = 2014-05-29 #2 microcode: CPU2 microcode updated early to revision 0x428, date = 2014-05-29 #3 ... so issue the update message only once. $ grep microcode /proc/cpuinfo shows whether every core got updated properly. 
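The change relies on the print-once helpers from <linux/printk.h>; roughly, pr_info_once() behaves like the following sketch (my_pr_info_once() is a simplified illustration, not the kernel's actual implementation):

  #define my_pr_info_once(fmt, ...)                             \
  do {                                                          \
          static bool __done;                                   \
                                                                \
          if (!__done) {                                        \
                  __done = true;                                \
                  printk(KERN_INFO fmt, ##__VA_ARGS__);         \
          }                                                     \
  } while (0)

Each call site gets its own static flag, so every distinct message still appears once, just not once per CPU.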
Reported-by: Ingo Molnar Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 2233f8a..5b63e2f 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -432,8 +432,8 @@ int __init save_microcode_in_initrd_amd(void) container = cont_va; if (ucode_new_rev) - pr_info("microcode: updated early to new patch_level=0x%08x\n", - ucode_new_rev); + pr_info_once("microcode updated early to new patch_level=0x%08x\n", + ucode_new_rev); eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 4f4735b..f4bc5fe 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -571,14 +571,11 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, static void print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) { - int cpu = smp_processor_id(); - - pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", - cpu, - uci->cpu_sig.rev, - date & 0xffff, - date >> 24, - (date >> 16) & 0xff); + pr_info_once("microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", + uci->cpu_sig.rev, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); } #ifdef CONFIG_X86_32 -- cgit v0.10.2 From a58017c62b5fa23e532b731e70a63ded54cc2c02 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:34 +0100 Subject: x86/microcode/AMD: Drop redundant printk prefix It is supplied by pr_fmt already. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 5b63e2f..9f5ccef 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -469,8 +469,7 @@ void reload_ucode_amd(void) if (mc && rev < mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { ucode_new_rev = mc->hdr.patch_id; - pr_info("microcode: reload patch_level=0x%08x\n", - ucode_new_rev); + pr_info("reload patch_level=0x%08x\n", ucode_new_rev); } } } -- cgit v0.10.2 From bd6fe58d8e60d45c80ed664a55ef0df5fa076014 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:35 +0100 Subject: x86/microcode/intel: Rename local variables of type struct mc_saved_data So it is always a head-twister when trying to stare at code which has a bunch of struct mc_saved_data *mc_saved_data; local function variables *and* a global mc_saved_data of the same name. Rename all locals to "mcs" to differentiate from the global one. No functionality change. 
Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index f4bc5fe..4af30bee 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -89,42 +89,39 @@ copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd, #ifdef CONFIG_X86_32 static void -microcode_phys(struct microcode_intel **mc_saved_tmp, - struct mc_saved_data *mc_saved_data) +microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs) { int i; struct microcode_intel ***mc_saved; - mc_saved = (struct microcode_intel ***) - __pa_nodebug(&mc_saved_data->mc_saved); - for (i = 0; i < mc_saved_data->mc_saved_count; i++) { + mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved); + + for (i = 0; i < mcs->mc_saved_count; i++) { struct microcode_intel *p; - p = *(struct microcode_intel **) - __pa_nodebug(mc_saved_data->mc_saved + i); + p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i); mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); } } #endif static enum ucode_state -load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, +load_microcode(struct mc_saved_data *mcs, unsigned long *initrd, unsigned long initrd_start, struct ucode_cpu_info *uci) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int count = mc_saved_data->mc_saved_count; + unsigned int count = mcs->mc_saved_count; - if (!mc_saved_data->mc_saved) { + if (!mcs->mc_saved) { copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); return load_microcode_early(mc_saved_tmp, count, uci); } else { #ifdef CONFIG_X86_32 - microcode_phys(mc_saved_tmp, mc_saved_data); + microcode_phys(mc_saved_tmp, mcs); return load_microcode_early(mc_saved_tmp, count, uci); #else - return load_microcode_early(mc_saved_data->mc_saved, - count, uci); + return load_microcode_early(mcs->mc_saved, count, uci); #endif } } @@ -175,7 +172,7 @@ matching_model_microcode(struct microcode_header_intel *mc_header, } static int -save_microcode(struct mc_saved_data *mc_saved_data, +save_microcode(struct mc_saved_data *mcs, struct microcode_intel **mc_saved_src, unsigned int mc_saved_count) { @@ -219,8 +216,8 @@ save_microcode(struct mc_saved_data *mc_saved_data, /* * Point to newly saved microcode. 
*/ - mc_saved_data->mc_saved = saved_ptr; - mc_saved_data->mc_saved_count = mc_saved_count; + mcs->mc_saved = saved_ptr; + mcs->mc_saved_count = mc_saved_count; return 0; @@ -286,7 +283,7 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved, static enum ucode_state __init get_matching_model_microcode(int cpu, unsigned long start, void *data, size_t size, - struct mc_saved_data *mc_saved_data, + struct mc_saved_data *mcs, unsigned long *mc_saved_in_initrd, struct ucode_cpu_info *uci) { @@ -296,7 +293,7 @@ get_matching_model_microcode(int cpu, unsigned long start, unsigned int mc_size; struct microcode_header_intel *mc_header; struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count = mc_saved_data->mc_saved_count; + unsigned int mc_saved_count = mcs->mc_saved_count; int i; while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { @@ -342,7 +339,7 @@ get_matching_model_microcode(int cpu, unsigned long start, for (i = 0; i < mc_saved_count; i++) mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; - mc_saved_data->mc_saved_count = mc_saved_count; + mcs->mc_saved_count = mc_saved_count; out: return state; } @@ -536,7 +533,7 @@ static bool __init load_builtin_intel_microcode(struct cpio_data *cp) static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; static __init enum ucode_state -scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, +scan_microcode(struct mc_saved_data *mcs, unsigned long *initrd, unsigned long start, unsigned long size, struct ucode_cpu_info *uci) { @@ -562,7 +559,7 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, } return get_matching_model_microcode(0, start, cd.data, cd.size, - mc_saved_data, initrd, uci); + mcs, initrd, uci); } /* @@ -702,8 +699,7 @@ int __init save_microcode_in_initrd_intel(void) } static void __init -_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, - unsigned long *initrd, +_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *initrd, unsigned long start, unsigned long size) { struct ucode_cpu_info uci; @@ -711,11 +707,11 @@ _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, collect_cpu_info_early(&uci); - ret = scan_microcode(mc_saved_data, initrd, start, size, &uci); + ret = scan_microcode(mcs, initrd, start, size, &uci); if (ret != UCODE_OK) return; - ret = load_microcode(mc_saved_data, initrd, start, &uci); + ret = load_microcode(mcs, initrd, start, &uci); if (ret != UCODE_OK) return; @@ -750,28 +746,28 @@ void __init load_ucode_intel_bsp(void) void load_ucode_intel_ap(void) { - struct mc_saved_data *mc_saved_data_p; - struct ucode_cpu_info uci; unsigned long *mc_saved_in_initrd_p; + struct mc_saved_data *mcs_p; + struct ucode_cpu_info uci; enum ucode_state ret; #ifdef CONFIG_X86_32 mc_saved_in_initrd_p = (unsigned long *)__pa_nodebug(mc_saved_in_initrd); - mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); + mcs_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); #else mc_saved_in_initrd_p = mc_saved_in_initrd; - mc_saved_data_p = &mc_saved_data; + mcs_p = &mc_saved_data; #endif /* * If there is no valid ucode previously saved in memory, no need to * update ucode on this AP. 
*/ - if (mc_saved_data_p->mc_saved_count == 0) + if (mcs_p->mc_saved_count == 0) return; collect_cpu_info_early(&uci); - ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, + ret = load_microcode(mcs_p, mc_saved_in_initrd_p, get_initrd_start_addr(), &uci); if (ret != UCODE_OK) -- cgit v0.10.2 From 4fe9349fc3b042b481692b577bda97cde4d6f517 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:36 +0100 Subject: x86/microcode/intel: Rename mc_saved_count to num_saved It is shorter and easier on the eyes. Change the "== 0" tests to "!..." while at it. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-9-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 4af30bee..9f5fe72 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -41,7 +41,7 @@ static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; static struct mc_saved_data { - unsigned int mc_saved_count; + unsigned int num_saved; struct microcode_intel **mc_saved; } mc_saved_data; @@ -96,7 +96,7 @@ microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs) mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved); - for (i = 0; i < mcs->mc_saved_count; i++) { + for (i = 0; i < mcs->num_saved; i++) { struct microcode_intel *p; p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i); @@ -110,7 +110,7 @@ load_microcode(struct mc_saved_data *mcs, unsigned long *initrd, unsigned long initrd_start, struct ucode_cpu_info *uci) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int count = mcs->mc_saved_count; + unsigned int count = mcs->num_saved; if (!mcs->mc_saved) { copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); @@ -174,23 +174,23 @@ matching_model_microcode(struct microcode_header_intel *mc_header, static int save_microcode(struct mc_saved_data *mcs, struct microcode_intel **mc_saved_src, - unsigned int mc_saved_count) + unsigned int num_saved) { int i, j; struct microcode_intel **saved_ptr; int ret; - if (!mc_saved_count) + if (!num_saved) return -EINVAL; /* * Copy new microcode data. */ - saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL); + saved_ptr = kcalloc(num_saved, sizeof(struct microcode_intel *), GFP_KERNEL); if (!saved_ptr) return -ENOMEM; - for (i = 0; i < mc_saved_count; i++) { + for (i = 0; i < num_saved; i++) { struct microcode_header_intel *mc_hdr; struct microcode_intel *mc; unsigned long size; @@ -216,8 +216,8 @@ save_microcode(struct mc_saved_data *mcs, /* * Point to newly saved microcode. 
*/ - mcs->mc_saved = saved_ptr; - mcs->mc_saved_count = mc_saved_count; + mcs->mc_saved = saved_ptr; + mcs->num_saved = num_saved; return 0; @@ -293,10 +293,10 @@ get_matching_model_microcode(int cpu, unsigned long start, unsigned int mc_size; struct microcode_header_intel *mc_header; struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count = mcs->mc_saved_count; + unsigned int num_saved = mcs->num_saved; int i; - while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { + while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) { if (leftover < sizeof(mc_header)) break; @@ -321,7 +321,7 @@ get_matching_model_microcode(int cpu, unsigned long start, continue; } - mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count); + num_saved = _save_mc(mc_saved_tmp, ucode_ptr, num_saved); ucode_ptr += mc_size; } @@ -331,15 +331,15 @@ get_matching_model_microcode(int cpu, unsigned long start, goto out; } - if (mc_saved_count == 0) { + if (!num_saved) { state = UCODE_NFOUND; goto out; } - for (i = 0; i < mc_saved_count; i++) + for (i = 0; i < num_saved; i++) mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; - mcs->mc_saved_count = mc_saved_count; + mcs->num_saved = num_saved; out: return state; } @@ -393,11 +393,11 @@ static void show_saved_mc(void) unsigned int sig, pf, rev, total_size, data_size, date; struct ucode_cpu_info uci; - if (mc_saved_data.mc_saved_count == 0) { + if (!mc_saved_data.num_saved) { pr_debug("no microcode data saved.\n"); return; } - pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); + pr_debug("Total microcode saved: %d\n", mc_saved_data.num_saved); collect_cpu_info_early(&uci); @@ -406,7 +406,7 @@ static void show_saved_mc(void) rev = uci.cpu_sig.rev; pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); - for (i = 0; i < mc_saved_data.mc_saved_count; i++) { + for (i = 0; i < mc_saved_data.num_saved; i++) { struct microcode_header_intel *mc_saved_header; struct extended_sigtable *ext_header; int ext_sigcount; @@ -462,7 +462,7 @@ int save_mc_for_early(u8 *mc) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; unsigned int mc_saved_count_init; - unsigned int mc_saved_count; + unsigned int num_saved; struct microcode_intel **mc_saved; int ret = 0; int i; @@ -473,23 +473,23 @@ int save_mc_for_early(u8 *mc) */ mutex_lock(&x86_cpu_microcode_mutex); - mc_saved_count_init = mc_saved_data.mc_saved_count; - mc_saved_count = mc_saved_data.mc_saved_count; + mc_saved_count_init = mc_saved_data.num_saved; + num_saved = mc_saved_data.num_saved; mc_saved = mc_saved_data.mc_saved; - if (mc_saved && mc_saved_count) + if (mc_saved && num_saved) memcpy(mc_saved_tmp, mc_saved, - mc_saved_count * sizeof(struct microcode_intel *)); + num_saved * sizeof(struct microcode_intel *)); /* * Save the microcode patch mc in mc_save_tmp structure if it's a newer * version. */ - mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count); + num_saved = _save_mc(mc_saved_tmp, mc, num_saved); /* * Save the mc_save_tmp in global mc_saved_data. 
*/ - ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); + ret = save_microcode(&mc_saved_data, mc_saved_tmp, num_saved); if (ret) { pr_err("Cannot save microcode patch.\n"); goto out; @@ -681,14 +681,15 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) */ int __init save_microcode_in_initrd_intel(void) { - unsigned int count = mc_saved_data.mc_saved_count; + unsigned int count = mc_saved_data.num_saved; struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; int ret = 0; - if (count == 0) + if (!count) return ret; copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, get_initrd_start(), count); + ret = save_microcode(&mc_saved_data, mc_saved, count); if (ret) pr_err("Cannot save microcode patches from initrd.\n"); @@ -763,7 +764,7 @@ void load_ucode_intel_ap(void) * If there is no valid ucode previously saved in memory, no need to * update ucode on this AP. */ - if (mcs_p->mc_saved_count == 0) + if (!mcs_p->num_saved) return; collect_cpu_info_early(&uci); @@ -781,13 +782,13 @@ void reload_ucode_intel(void) struct ucode_cpu_info uci; enum ucode_state ret; - if (!mc_saved_data.mc_saved_count) + if (!mc_saved_data.num_saved) return; collect_cpu_info_early(&uci); ret = load_microcode_early(mc_saved_data.mc_saved, - mc_saved_data.mc_saved_count, &uci); + mc_saved_data.num_saved, &uci); if (ret != UCODE_OK) return; -- cgit v0.10.2 From de778275c295825e6638f3f74103f40642d45caa Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:37 +0100 Subject: x86/microcode/intel: Rename mc_intel variable to mc Well, it is apparent what it points to - microcode. And since it is the intel loader, no need for the "_intel" suffix. Use "!" for the 0/NULL checks, while at it. No functionality change. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-10-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 9f5fe72..d1b2f58 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -601,19 +601,19 @@ void show_ucode_info_early(void) */ static void print_ucode(struct ucode_cpu_info *uci) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; int *delay_ucode_info_p; int *current_mc_date_p; - mc_intel = uci->mc; - if (mc_intel == NULL) + mc = uci->mc; + if (!mc) return; delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); *delay_ucode_info_p = 1; - *current_mc_date_p = mc_intel->hdr.date; + *current_mc_date_p = mc->hdr.date; } #else @@ -628,29 +628,29 @@ static inline void flush_tlb_early(void) static inline void print_ucode(struct ucode_cpu_info *uci) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; - mc_intel = uci->mc; - if (mc_intel == NULL) + mc = uci->mc; + if (!mc) return; - print_ucode_info(uci, mc_intel->hdr.date); + print_ucode_info(uci, mc->hdr.date); } #endif static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; unsigned int val[2]; - mc_intel = uci->mc; - if (mc_intel == NULL) + mc = uci->mc; + if (!mc) return 0; /* write microcode via MSR 0x79 */ native_wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) mc_intel->bits, - (unsigned long) mc_intel->bits >> 16 >> 16); + (unsigned long)mc->bits, + (unsigned long)mc->bits >> 16 >> 16); 
native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* As documented in the SDM: Do a CPUID 1 here */ @@ -658,7 +658,7 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) /* get the current revision from MSR 0x8B */ native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc_intel->hdr.rev) + if (val[1] != mc->hdr.rev) return -1; #ifdef CONFIG_X86_64 @@ -670,7 +670,7 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) if (early) print_ucode(uci); else - print_ucode_info(uci, mc_intel->hdr.date); + print_ucode_info(uci, mc->hdr.date); return 0; } @@ -821,7 +821,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) * return 0 - no update found * return 1 - found update */ -static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) +static int get_matching_mc(struct microcode_intel *mc, int cpu) { struct cpu_signature cpu_sig; unsigned int csig, cpf, crev; @@ -832,38 +832,38 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) cpf = cpu_sig.pf; crev = cpu_sig.rev; - return has_newer_microcode(mc_intel, csig, cpf, crev); + return has_newer_microcode(mc, csig, cpf, crev); } static int apply_microcode_intel(int cpu) { - struct microcode_intel *mc_intel; + struct microcode_intel *mc; struct ucode_cpu_info *uci; unsigned int val[2]; int cpu_num = raw_smp_processor_id(); struct cpuinfo_x86 *c = &cpu_data(cpu_num); uci = ucode_cpu_info + cpu; - mc_intel = uci->mc; + mc = uci->mc; /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); - if (mc_intel == NULL) + if (!mc) return 0; /* * Microcode on this CPU could be updated earlier. Only apply the - * microcode patch in mc_intel when it is newer than the one on this + * microcode patch in mc when it is newer than the one on this * CPU. */ - if (get_matching_mc(mc_intel, cpu) == 0) + if (!get_matching_mc(mc, cpu)) return 0; /* write microcode via MSR 0x79 */ wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) mc_intel->bits, - (unsigned long) mc_intel->bits >> 16 >> 16); + (unsigned long) mc->bits, + (unsigned long) mc->bits >> 16 >> 16); wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* As documented in the SDM: Do a CPUID 1 here */ @@ -872,16 +872,16 @@ static int apply_microcode_intel(int cpu) /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (val[1] != mc_intel->hdr.rev) { + if (val[1] != mc->hdr.rev) { pr_err("CPU%d update to revision 0x%x failed\n", - cpu_num, mc_intel->hdr.rev); + cpu_num, mc->hdr.rev); return -1; } pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", cpu_num, val[1], - mc_intel->hdr.date & 0xffff, - mc_intel->hdr.date >> 24, - (mc_intel->hdr.date >> 16) & 0xff); + mc->hdr.date & 0xffff, + mc->hdr.date >> 24, + (mc->hdr.date >> 16) & 0xff); uci->cpu_sig.rev = val[1]; c->microcode = val[1]; -- cgit v0.10.2 From 58b5f2cc4bdbc9b616e68639f5a84886aa5be590 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:38 +0100 Subject: x86/microcode/intel: Move the BUG_ON up and turn it into WARN_ON If we're going to BUG_ON() because we're running on the wrong CPU, we better do it as the first thing we do when entering that function. And also, turn it into a WARN_ON() because it is not worth to panic the system if we apply the microcode on the wrong CPU - we're simply going to exit early. 
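The resulting pattern - validate preconditions first, warn and return instead of panicking - looks like this in isolation (apply_on_cpu() is a stand-in name for apply_microcode_intel() in the hunk below):

  static int apply_on_cpu(int cpu)
  {
          /* Caller must have bound us to @cpu; if not, warn and bail out. */
          if (WARN_ON(raw_smp_processor_id() != cpu))
                  return -1;

          /* Per-CPU state is only dereferenced after the check passes. */
          return 0;
  }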
Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-11-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index d1b2f58..c029c2b 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -843,12 +843,12 @@ static int apply_microcode_intel(int cpu) int cpu_num = raw_smp_processor_id(); struct cpuinfo_x86 *c = &cpu_data(cpu_num); - uci = ucode_cpu_info + cpu; - mc = uci->mc; - /* We should bind the task to the CPU */ - BUG_ON(cpu_num != cpu); + if (WARN_ON(cpu_num != cpu)) + return -1; + uci = ucode_cpu_info + cpu; + mc = uci->mc; if (!mc) return 0; -- cgit v0.10.2 From 26cbaa4dc676a444aa626cbc642c4c8181ef1378 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:39 +0100 Subject: x86/microcode/intel: Cleanup apply_microcode_intel() Get rid of local variable cpu_num as it is equal to @cpu now. Deref cpu_data() only when it is really needed at the end. No functionality change. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-12-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index c029c2b..35186a0 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -839,12 +839,11 @@ static int apply_microcode_intel(int cpu) { struct microcode_intel *mc; struct ucode_cpu_info *uci; + struct cpuinfo_x86 *c; unsigned int val[2]; - int cpu_num = raw_smp_processor_id(); - struct cpuinfo_x86 *c = &cpu_data(cpu_num); /* We should bind the task to the CPU */ - if (WARN_ON(cpu_num != cpu)) + if (WARN_ON(raw_smp_processor_id() != cpu)) return -1; uci = ucode_cpu_info + cpu; @@ -874,15 +873,18 @@ static int apply_microcode_intel(int cpu) if (val[1] != mc->hdr.rev) { pr_err("CPU%d update to revision 0x%x failed\n", - cpu_num, mc->hdr.rev); + cpu, mc->hdr.rev); return -1; } + pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", - cpu_num, val[1], + cpu, val[1], mc->hdr.date & 0xffff, mc->hdr.date >> 24, (mc->hdr.date >> 16) & 0xff); + c = &cpu_data(cpu); + uci->cpu_sig.rev = val[1]; c->microcode = val[1]; -- cgit v0.10.2 From c416e6117575213a5a962149620684a09f9e4ece Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:40 +0100 Subject: x86/microcode/intel: Use *wrmsrl variants ... and drop the 32-bit casting games which we had to do at the time because wrmsr() was unforgiving then, see c3fd0bd5e19a from the full history tree: commit c3fd0bd5e19aaff9cdd104edff136a2023db657e Author: Linus Torvalds Date: Tue Feb 17 23:23:41 2004 -0800 Fix up the microcode update on regular 32-bit x86. Our wrmsr() is a bit unforgiving and really doesn't like 64-bit values. ... 
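Concretely, the two forms program the same MSR; wrmsrl() just takes the 64-bit value whole instead of hand-split lo/hi halves (sketch, with addr standing in for the mc->bits payload address used in the hunk below):

  u64 addr = (unsigned long)mc->bits;

  /* old: split into low/high 32-bit halves by hand */
  wrmsr(MSR_IA32_UCODE_WRITE, (u32)addr, (u32)(addr >> 32));

  /* new: pass the 64-bit value directly */
  wrmsrl(MSR_IA32_UCODE_WRITE, addr);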
Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-13-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 35186a0..ff0b449 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -370,7 +370,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); csig.pf = 1 << ((val[1] >> 18) & 7); } - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + native_wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); @@ -648,10 +648,8 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) return 0; /* write microcode via MSR 0x79 */ - native_wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long)mc->bits, - (unsigned long)mc->bits >> 16 >> 16); - native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + native_wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); @@ -860,10 +858,8 @@ static int apply_microcode_intel(int cpu) return 0; /* write microcode via MSR 0x79 */ - wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) mc->bits, - (unsigned long) mc->bits >> 16 >> 16); - wrmsr(MSR_IA32_UCODE_REV, 0, 0); + wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + wrmsrl(MSR_IA32_UCODE_REV, 0); /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); -- cgit v0.10.2 From f8bb45e2c4acf1395bb6e61a135ce8c9107388cf Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:41 +0100 Subject: x86/microcode/intel: Rename mc_saved_in_initrd Rename it to mc_tmp_ptrs to denote better what it is - a temporary array for saving pointers to microcode blobs. And "initrd" is not accurate anymore since initrd is not the only source for early microcode. Therefore, rename copy_initrd_ptrs() to copy_ptrs() simply and "initrd_start" to "offset". And then do the following convention: the global variable is called "mc_tmp_ptrs" and the local function arguments "mc_ptrs" for differentiation. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-14-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index ff0b449..5970758 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -39,7 +39,13 @@ #include #include -static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; +/* + * Temporary microcode blobs pointers storage. We note here the pointers to + * microcode blobs we've got from whatever storage (detached initrd, builtin). + * Later on, we put those into final storage mc_saved_data.mc_saved. 
+ */ +static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT]; + static struct mc_saved_data { unsigned int num_saved; struct microcode_intel **mc_saved; @@ -78,13 +84,13 @@ load_microcode_early(struct microcode_intel **saved, } static inline void -copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd, - unsigned long off, int num_saved) +copy_ptrs(struct microcode_intel **mc_saved, unsigned long *mc_ptrs, + unsigned long off, int num_saved) { int i; for (i = 0; i < num_saved; i++) - mc_saved[i] = (struct microcode_intel *)(initrd[i] + off); + mc_saved[i] = (struct microcode_intel *)(mc_ptrs[i] + off); } #ifdef CONFIG_X86_32 @@ -106,14 +112,14 @@ microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs) #endif static enum ucode_state -load_microcode(struct mc_saved_data *mcs, unsigned long *initrd, - unsigned long initrd_start, struct ucode_cpu_info *uci) +load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, + unsigned long offset, struct ucode_cpu_info *uci) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; unsigned int count = mcs->num_saved; if (!mcs->mc_saved) { - copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); + copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count); return load_microcode_early(mc_saved_tmp, count, uci); } else { @@ -284,7 +290,7 @@ static enum ucode_state __init get_matching_model_microcode(int cpu, unsigned long start, void *data, size_t size, struct mc_saved_data *mcs, - unsigned long *mc_saved_in_initrd, + unsigned long *mc_ptrs, struct ucode_cpu_info *uci) { u8 *ucode_ptr = data; @@ -337,7 +343,7 @@ get_matching_model_microcode(int cpu, unsigned long start, } for (i = 0; i < num_saved; i++) - mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; + mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start; mcs->num_saved = num_saved; out: @@ -533,7 +539,7 @@ static bool __init load_builtin_intel_microcode(struct cpio_data *cp) static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; static __init enum ucode_state -scan_microcode(struct mc_saved_data *mcs, unsigned long *initrd, +scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, unsigned long start, unsigned long size, struct ucode_cpu_info *uci) { @@ -559,7 +565,7 @@ scan_microcode(struct mc_saved_data *mcs, unsigned long *initrd, } return get_matching_model_microcode(0, start, cd.data, cd.size, - mcs, initrd, uci); + mcs, mc_ptrs, uci); } /* @@ -675,7 +681,7 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) /* * This function converts microcode patch offsets previously stored in - * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. + * mc_tmp_ptrs to pointers and stores the pointers in mc_saved_data. 
*/ int __init save_microcode_in_initrd_intel(void) { @@ -686,7 +692,7 @@ int __init save_microcode_in_initrd_intel(void) if (!count) return ret; - copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, get_initrd_start(), count); + copy_ptrs(mc_saved, mc_tmp_ptrs, get_initrd_start(), count); ret = save_microcode(&mc_saved_data, mc_saved, count); if (ret) @@ -698,7 +704,7 @@ int __init save_microcode_in_initrd_intel(void) } static void __init -_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *initrd, +_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs, unsigned long start, unsigned long size) { struct ucode_cpu_info uci; @@ -706,11 +712,11 @@ _load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *initrd, collect_cpu_info_early(&uci); - ret = scan_microcode(mcs, initrd, start, size, &uci); + ret = scan_microcode(mcs, mc_ptrs, start, size, &uci); if (ret != UCODE_OK) return; - ret = load_microcode(mcs, initrd, start, &uci); + ret = load_microcode(mcs, mc_ptrs, start, &uci); if (ret != UCODE_OK) return; @@ -733,28 +739,28 @@ void __init load_ucode_intel_bsp(void) start = (size ? p->hdr.ramdisk_image : 0); _load_ucode_intel_bsp((struct mc_saved_data *)__pa_nodebug(&mc_saved_data), - (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), + (unsigned long *)__pa_nodebug(&mc_tmp_ptrs), start, size); #else size = boot_params.hdr.ramdisk_size; start = (size ? boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0); - _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); + _load_ucode_intel_bsp(&mc_saved_data, mc_tmp_ptrs, start, size); #endif } void load_ucode_intel_ap(void) { - unsigned long *mc_saved_in_initrd_p; + unsigned long *mcs_tmp_p; struct mc_saved_data *mcs_p; struct ucode_cpu_info uci; enum ucode_state ret; #ifdef CONFIG_X86_32 - mc_saved_in_initrd_p = (unsigned long *)__pa_nodebug(mc_saved_in_initrd); + mcs_tmp_p = (unsigned long *)__pa_nodebug(mc_tmp_ptrs); mcs_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); #else - mc_saved_in_initrd_p = mc_saved_in_initrd; + mcs_tmp_p = mc_tmp_ptrs; mcs_p = &mc_saved_data; #endif @@ -766,9 +772,7 @@ void load_ucode_intel_ap(void) return; collect_cpu_info_early(&uci); - ret = load_microcode(mcs_p, mc_saved_in_initrd_p, - get_initrd_start_addr(), &uci); - + ret = load_microcode(mcs_p, mcs_tmp_p, get_initrd_start_addr(), &uci); if (ret != UCODE_OK) return; -- cgit v0.10.2 From 2f303c524ed021825671cfa9b1934338bc04f8ab Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:42 +0100 Subject: x86/microcode/intel: Remove unused arg of get_matching_model_microcode() @cpu is unused, kill it. No functionality change. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-15-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 5970758..0c67fd0 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -287,7 +287,7 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved, * BSP can stay in the platform. 
*/ static enum ucode_state __init -get_matching_model_microcode(int cpu, unsigned long start, +get_matching_model_microcode(unsigned long start, void *data, size_t size, struct mc_saved_data *mcs, unsigned long *mc_ptrs, @@ -564,7 +564,7 @@ scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, return UCODE_ERROR; } - return get_matching_model_microcode(0, start, cd.data, cd.size, + return get_matching_model_microcode(start, cd.data, cd.size, mcs, mc_ptrs, uci); } -- cgit v0.10.2 From f96fde531946524b26d25d4eed9625695837f524 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:43 +0100 Subject: x86/microcode/intel: Cleanup get_matching_model_microcode() Reflow arguments, sort local variables in reverse christmas tree, kill "out" label. No functionality change. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-16-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 0c67fd0..cb397947 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -287,19 +287,17 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved, * BSP can stay in the platform. */ static enum ucode_state __init -get_matching_model_microcode(unsigned long start, - void *data, size_t size, - struct mc_saved_data *mcs, - unsigned long *mc_ptrs, +get_matching_model_microcode(unsigned long start, void *data, size_t size, + struct mc_saved_data *mcs, unsigned long *mc_ptrs, struct ucode_cpu_info *uci) { - u8 *ucode_ptr = data; - unsigned int leftover = size; - enum ucode_state state = UCODE_OK; - unsigned int mc_size; - struct microcode_header_intel *mc_header; struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + struct microcode_header_intel *mc_header; unsigned int num_saved = mcs->num_saved; + enum ucode_state state = UCODE_OK; + unsigned int leftover = size; + u8 *ucode_ptr = data; + unsigned int mc_size; int i; while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) { @@ -321,8 +319,7 @@ get_matching_model_microcode(unsigned long start, * the platform, we need to find and save microcode patches * with the same family and model as the BSP. */ - if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != - UCODE_OK) { + if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != UCODE_OK) { ucode_ptr += mc_size; continue; } @@ -334,19 +331,19 @@ get_matching_model_microcode(unsigned long start, if (leftover) { state = UCODE_ERROR; - goto out; + return state; } if (!num_saved) { state = UCODE_NFOUND; - goto out; + return state; } for (i = 0; i < num_saved; i++) mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start; mcs->num_saved = num_saved; -out: + return state; } -- cgit v0.10.2 From f7eb59dda129e46be5e195a46bfd0dde76db9bbd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:44 +0100 Subject: x86/microcode/AMD: Issue microcode updated message later Before this, we issued this message from save_microcode_in_initrd() which is called from free_initrd_mem(), i.e., only when we have an initrd enabled. However, we can update from builtin microcode too but then we don't issue the update message. Fix it by issuing that message on the generic driver init path. 
Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-17-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 9f5ccef..f66cbfe 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -431,10 +431,6 @@ int __init save_microcode_in_initrd_amd(void) else container = cont_va; - if (ucode_new_rev) - pr_info_once("microcode updated early to new patch_level=0x%08x\n", - ucode_new_rev); - eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); @@ -956,6 +952,10 @@ struct microcode_ops * __init init_amd_microcode(void) return NULL; } + if (ucode_new_rev) + pr_info_once("microcode updated early to new patch_level=0x%08x\n", + ucode_new_rev); + return µcode_amd_ops; } -- cgit v0.10.2 From b584303261b7041f209afde4bf7ddb7a21598a1f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 3 Feb 2016 12:33:45 +0100 Subject: x86/microcode: Document builtin microcode loading method Add some text and an example to Documentation/x86/early-microcode.txt explaining how to build in microcode. Tested-by: Thomas Voegtle Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454499225-21544-18-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/Documentation/x86/early-microcode.txt b/Documentation/x86/early-microcode.txt index d62bea6..c956d99 100644 --- a/Documentation/x86/early-microcode.txt +++ b/Documentation/x86/early-microcode.txt @@ -40,3 +40,28 @@ cp ../microcode.bin kernel/x86/microcode/GenuineIntel.bin (or AuthenticAMD.bin) find . | cpio -o -H newc >../ucode.cpio cd .. cat ucode.cpio /boot/initrd-3.5.0.img >/boot/initrd-3.5.0.ucode.img + +Builtin microcode +================= + +We can also load builtin microcode supplied through the regular firmware +builtin method CONFIG_FIRMWARE_IN_KERNEL. Here's an example: + +CONFIG_FIRMWARE_IN_KERNEL=y +CONFIG_EXTRA_FIRMWARE="intel-ucode/06-3a-09 amd-ucode/microcode_amd_fam15h.bin" +CONFIG_EXTRA_FIRMWARE_DIR="/lib/firmware" + +This basically means, you have the following tree structure locally: + +/lib/firmware/ +|-- amd-ucode +... +| |-- microcode_amd_fam15h.bin +... +|-- intel-ucode +... +| |-- 06-3a-09 +... + +so that the build system can find those files and integrate them into +the final kernel image. The early loader finds them and applies them. -- cgit v0.10.2 From cb2517653fccaf9f9b4ae968c7ee005c1bbacdc5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 5 Feb 2016 09:08:36 +0000 Subject: sched/debug: Make schedstats a runtime tunable that is disabled by default schedstats is very useful during debugging and performance tuning but it incurs overhead to calculate the stats. As such, even though it can be disabled at build time, it is often enabled as the information is useful. This patch adds a kernel command-line and sysctl tunable to enable or disable schedstats on demand (when it's built in). It is disabled by default as someone who knows they need it can also learn to enable it when necessary. The benefits are dependent on how scheduler-intensive the workload is. If it is then the patch reduces the number of cycles spent calculating the stats with a small benefit from reducing the cache footprint of the scheduler. 
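[ Illustrative aside -- not part of any patch in this series: the mechanism used below is a static key (jump label), so the disabled case costs only a patched-out branch rather than a runtime flag test. A minimal sketch with hypothetical names, modelled on the sched_schedstats toggle in the diff that follows: ]

#include <linux/jump_label.h>

/* Hypothetical stats toggle; defaults to off, like sched_schedstats. */
DEFINE_STATIC_KEY_FALSE(my_stats_enabled);

static inline void my_stat_inc(unsigned long *counter)
{
	/* The branch is patched out (a NOP) until the key is enabled at runtime. */
	if (static_branch_unlikely(&my_stats_enabled))
		(*counter)++;
}

/* Runtime toggle, e.g. called from a __setup() hook or a sysctl handler. */
static void my_stats_set(bool on)
{
	if (on)
		static_branch_enable(&my_stats_enabled);
	else
		static_branch_disable(&my_stats_enabled);
}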
These measurements were taken from a 48-core 2-socket machine with Xeon(R) E5-2670 v3 cpus although they were also tested on a single-socket 8-core machine with Intel i7-3770 processors. netperf-tcp 4.5.0-rc1 4.5.0-rc1 vanilla nostats-v3r1 Hmean 64 560.45 ( 0.00%) 575.98 ( 2.77%) Hmean 128 766.66 ( 0.00%) 795.79 ( 3.80%) Hmean 256 950.51 ( 0.00%) 981.50 ( 3.26%) Hmean 1024 1433.25 ( 0.00%) 1466.51 ( 2.32%) Hmean 2048 2810.54 ( 0.00%) 2879.75 ( 2.46%) Hmean 3312 4618.18 ( 0.00%) 4682.09 ( 1.38%) Hmean 4096 5306.42 ( 0.00%) 5346.39 ( 0.75%) Hmean 8192 10581.44 ( 0.00%) 10698.15 ( 1.10%) Hmean 16384 18857.70 ( 0.00%) 18937.61 ( 0.42%) Small gains here, UDP_STREAM showed nothing interesting and neither did the TCP_RR tests. The gains on the 8-core machine were very similar. tbench4 4.5.0-rc1 4.5.0-rc1 vanilla nostats-v3r1 Hmean mb/sec-1 500.85 ( 0.00%) 522.43 ( 4.31%) Hmean mb/sec-2 984.66 ( 0.00%) 1018.19 ( 3.41%) Hmean mb/sec-4 1827.91 ( 0.00%) 1847.78 ( 1.09%) Hmean mb/sec-8 3561.36 ( 0.00%) 3611.28 ( 1.40%) Hmean mb/sec-16 5824.52 ( 0.00%) 5929.03 ( 1.79%) Hmean mb/sec-32 10943.10 ( 0.00%) 10802.83 ( -1.28%) Hmean mb/sec-64 15950.81 ( 0.00%) 16211.31 ( 1.63%) Hmean mb/sec-128 15302.17 ( 0.00%) 15445.11 ( 0.93%) Hmean mb/sec-256 14866.18 ( 0.00%) 15088.73 ( 1.50%) Hmean mb/sec-512 15223.31 ( 0.00%) 15373.69 ( 0.99%) Hmean mb/sec-1024 14574.25 ( 0.00%) 14598.02 ( 0.16%) Hmean mb/sec-2048 13569.02 ( 0.00%) 13733.86 ( 1.21%) Hmean mb/sec-3072 12865.98 ( 0.00%) 13209.23 ( 2.67%) Small gains of 2-4% at low thread counts and otherwise flat. The gains on the 8-core machine were slightly different. tbench4 on 8-core i7-3770 single socket machine Hmean mb/sec-1 442.59 ( 0.00%) 448.73 ( 1.39%) Hmean mb/sec-2 796.68 ( 0.00%) 794.39 ( -0.29%) Hmean mb/sec-4 1322.52 ( 0.00%) 1343.66 ( 1.60%) Hmean mb/sec-8 2611.65 ( 0.00%) 2694.86 ( 3.19%) Hmean mb/sec-16 2537.07 ( 0.00%) 2609.34 ( 2.85%) Hmean mb/sec-32 2506.02 ( 0.00%) 2578.18 ( 2.88%) Hmean mb/sec-64 2511.06 ( 0.00%) 2569.16 ( 2.31%) Hmean mb/sec-128 2313.38 ( 0.00%) 2395.50 ( 3.55%) Hmean mb/sec-256 2110.04 ( 0.00%) 2177.45 ( 3.19%) Hmean mb/sec-512 2072.51 ( 0.00%) 2053.97 ( -0.89%) In contrast, this shows a relatively steady 2-3% gain at higher thread counts. Due to the nature of the patch and the type of workload, it's not a surprise that the result will depend on the CPU used. hackbench-pipes 4.5.0-rc1 4.5.0-rc1 vanilla nostats-v3r1 Amean 1 0.0637 ( 0.00%) 0.0660 ( -3.59%) Amean 4 0.1229 ( 0.00%) 0.1181 ( 3.84%) Amean 7 0.1921 ( 0.00%) 0.1911 ( 0.52%) Amean 12 0.3117 ( 0.00%) 0.2923 ( 6.23%) Amean 21 0.4050 ( 0.00%) 0.3899 ( 3.74%) Amean 30 0.4586 ( 0.00%) 0.4433 ( 3.33%) Amean 48 0.5910 ( 0.00%) 0.5694 ( 3.65%) Amean 79 0.8663 ( 0.00%) 0.8626 ( 0.43%) Amean 110 1.1543 ( 0.00%) 1.1517 ( 0.22%) Amean 141 1.4457 ( 0.00%) 1.4290 ( 1.16%) Amean 172 1.7090 ( 0.00%) 1.6924 ( 0.97%) Amean 192 1.9126 ( 0.00%) 1.9089 ( 0.19%) Some small gains and losses, and while the variance data is not included, it's close to the noise. 
The UMA machine did not show anything particularly different pipetest 4.5.0-rc1 4.5.0-rc1 vanilla nostats-v2r2 Min Time 4.13 ( 0.00%) 3.99 ( 3.39%) 1st-qrtle Time 4.38 ( 0.00%) 4.27 ( 2.51%) 2nd-qrtle Time 4.46 ( 0.00%) 4.39 ( 1.57%) 3rd-qrtle Time 4.56 ( 0.00%) 4.51 ( 1.10%) Max-90% Time 4.67 ( 0.00%) 4.60 ( 1.50%) Max-93% Time 4.71 ( 0.00%) 4.65 ( 1.27%) Max-95% Time 4.74 ( 0.00%) 4.71 ( 0.63%) Max-99% Time 4.88 ( 0.00%) 4.79 ( 1.84%) Max Time 4.93 ( 0.00%) 4.83 ( 2.03%) Mean Time 4.48 ( 0.00%) 4.39 ( 1.91%) Best99%Mean Time 4.47 ( 0.00%) 4.39 ( 1.91%) Best95%Mean Time 4.46 ( 0.00%) 4.38 ( 1.93%) Best90%Mean Time 4.45 ( 0.00%) 4.36 ( 1.98%) Best50%Mean Time 4.36 ( 0.00%) 4.25 ( 2.49%) Best10%Mean Time 4.23 ( 0.00%) 4.10 ( 3.13%) Best5%Mean Time 4.19 ( 0.00%) 4.06 ( 3.20%) Best1%Mean Time 4.13 ( 0.00%) 4.00 ( 3.39%) Small improvement and similar gains were seen on the UMA machine. The gain is small but it stands to reason that doing less work in the scheduler is a good thing. The downside is that the lack of schedstats and tracepoints may be surprising to experts doing performance analysis until they find the existence of the schedstats= parameter or schedstats sysctl. It will be automatically activated for latencytop and sleep profiling to alleviate the problem. For tracepoints, there is a simple warning as it's not safe to activate schedstats in the context when it's known the tracepoint may be wanted but is unavailable. Signed-off-by: Mel Gorman Reviewed-by: Matt Fleming Reviewed-by: Srikar Dronamraju Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1454663316-22048-1-git-send-email-mgorman@techsingularity.net Signed-off-by: Ingo Molnar diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 551ecf0..ed47b60 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3528,6 +3528,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. sched_debug [KNL] Enables verbose scheduler debug messages. + schedstats= [KNL,X86] Enable or disable scheduled statistics. + Allowed values are enable and disable. This feature + incurs a small amount of overhead in the scheduler + but is useful for debugging and performance tuning. + skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate xtime_lock contention on larger systems, and/or RCU lock contention on all systems with CONFIG_MAXSMP set. diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index a93b414..87119dc 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -760,6 +760,14 @@ rtsig-nr shows the number of RT signals currently queued. ============================================================== +sched_schedstats: + +Enables/disables scheduler statistics. Enabling this feature +incurs a small amount of overhead in the scheduler but is +useful for debugging and performance tuning. + +============================================================== + sg-big-buff: This file shows the size of the generic SCSI (sg) buffer. 
diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index e23121f..59ccab2 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h @@ -37,6 +37,9 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) void clear_all_latency_tracing(struct task_struct *p); +extern int sysctl_latencytop(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + #else static inline void diff --git a/include/linux/sched.h b/include/linux/sched.h index a10494a..a292c4b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -920,6 +920,10 @@ static inline int sched_info_on(void) #endif } +#ifdef CONFIG_SCHEDSTATS +void force_schedstat_enabled(void); +#endif + enum cpu_idle_type { CPU_IDLE, CPU_NOT_IDLE, diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index c9e4731..4f080ab 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -95,4 +95,8 @@ extern int sysctl_numa_balancing(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sysctl_schedstats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + #endif /* _SCHED_SYSCTL_H */ diff --git a/kernel/latencytop.c b/kernel/latencytop.c index a028127..b5c30d9 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -47,12 +47,12 @@ * of times) */ -#include #include #include #include #include #include +#include #include #include #include @@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void) proc_create("latency_stats", 0644, NULL, &lstats_fops); return 0; } + +int sysctl_latencytop(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err; + + err = proc_dointvec(table, write, buffer, lenp, ppos); + if (latencytop_enabled) + force_schedstat_enabled(); + + return err; +} device_initcall(init_lstats_procfs); diff --git a/kernel/profile.c b/kernel/profile.c index 99513e1..5136969 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -59,6 +59,7 @@ int profile_setup(char *str) if (!strncmp(str, sleepstr, strlen(sleepstr))) { #ifdef CONFIG_SCHEDSTATS + force_schedstat_enabled(); prof_on = SLEEP_PROFILING; if (str[strlen(sleepstr)] == ',') str += strlen(sleepstr) + 1; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 24fcdbf..7e548bd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2093,7 +2093,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ttwu_queue(p, cpu); stat: - ttwu_stat(p, cpu, wake_flags); + if (schedstat_enabled()) + ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2141,7 +2142,8 @@ static void try_to_wake_up_local(struct task_struct *p) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0); - ttwu_stat(p, smp_processor_id(), 0); + if (schedstat_enabled()) + ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); } @@ -2210,6 +2212,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif #ifdef CONFIG_SCHEDSTATS + /* Even if schedstat is disabled, there should not be garbage */ memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif @@ -2281,6 +2284,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, #endif #endif +DEFINE_STATIC_KEY_FALSE(sched_schedstats); + +#ifdef CONFIG_SCHEDSTATS +static void set_schedstats(bool enabled) +{ + if (enabled) + static_branch_enable(&sched_schedstats); + else + 
static_branch_disable(&sched_schedstats); +} + +void force_schedstat_enabled(void) +{ + if (!schedstat_enabled()) { + pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); + static_branch_enable(&sched_schedstats); + } +} + +static int __init setup_schedstats(char *str) +{ + int ret = 0; + if (!str) + goto out; + + if (!strcmp(str, "enable")) { + set_schedstats(true); + ret = 1; + } else if (!strcmp(str, "disable")) { + set_schedstats(false); + ret = 1; + } +out: + if (!ret) + pr_warn("Unable to parse schedstats=\n"); + + return ret; +} +__setup("schedstats=", setup_schedstats); + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_schedstats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = static_branch_likely(&sched_schedstats); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_schedstats(state); + return err; +} +#endif +#endif + /* * fork()/clone()-time setup: */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 6415117..7cfa87b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -75,16 +75,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->vruntime); PN(se->sum_exec_runtime); #ifdef CONFIG_SCHEDSTATS - PN(se->statistics.wait_start); - PN(se->statistics.sleep_start); - PN(se->statistics.block_start); - PN(se->statistics.sleep_max); - PN(se->statistics.block_max); - PN(se->statistics.exec_max); - PN(se->statistics.slice_max); - PN(se->statistics.wait_max); - PN(se->statistics.wait_sum); - P(se->statistics.wait_count); + if (schedstat_enabled()) { + PN(se->statistics.wait_start); + PN(se->statistics.sleep_start); + PN(se->statistics.block_start); + PN(se->statistics.sleep_max); + PN(se->statistics.block_max); + PN(se->statistics.exec_max); + PN(se->statistics.slice_max); + PN(se->statistics.wait_max); + PN(se->statistics.wait_sum); + P(se->statistics.wait_count); + } #endif P(se->load.weight); #ifdef CONFIG_SMP @@ -122,10 +124,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(p->se.statistics.wait_sum), - SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(p->se.statistics.sum_sleep_runtime)); + if (schedstat_enabled()) { + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", + SPLIT_NS(p->se.statistics.wait_sum), + SPLIT_NS(p->se.sum_exec_runtime), + SPLIT_NS(p->se.statistics.sum_sleep_runtime)); + } #else SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 0LL, 0L, @@ -313,17 +317,18 @@ do { \ #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); - P(yld_count); - - P(sched_count); - P(sched_goidle); #ifdef CONFIG_SMP P64(avg_idle); P64(max_idle_balance_cost); #endif - P(ttwu_count); - P(ttwu_local); + if (schedstat_enabled()) { + P(yld_count); + P(sched_count); + P(sched_goidle); + P(ttwu_count); + P(ttwu_local); + } #undef P #undef P64 @@ -569,38 +574,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) nr_switches = p->nvcsw + p->nivcsw; #ifdef CONFIG_SCHEDSTATS - PN(se.statistics.sum_sleep_runtime); - PN(se.statistics.wait_start); - PN(se.statistics.sleep_start); - PN(se.statistics.block_start); - PN(se.statistics.sleep_max); - 
PN(se.statistics.block_max); - PN(se.statistics.exec_max); - PN(se.statistics.slice_max); - PN(se.statistics.wait_max); - PN(se.statistics.wait_sum); - P(se.statistics.wait_count); - PN(se.statistics.iowait_sum); - P(se.statistics.iowait_count); P(se.nr_migrations); - P(se.statistics.nr_migrations_cold); - P(se.statistics.nr_failed_migrations_affine); - P(se.statistics.nr_failed_migrations_running); - P(se.statistics.nr_failed_migrations_hot); - P(se.statistics.nr_forced_migrations); - P(se.statistics.nr_wakeups); - P(se.statistics.nr_wakeups_sync); - P(se.statistics.nr_wakeups_migrate); - P(se.statistics.nr_wakeups_local); - P(se.statistics.nr_wakeups_remote); - P(se.statistics.nr_wakeups_affine); - P(se.statistics.nr_wakeups_affine_attempts); - P(se.statistics.nr_wakeups_passive); - P(se.statistics.nr_wakeups_idle); - { + if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; + PN(se.statistics.sum_sleep_runtime); + PN(se.statistics.wait_start); + PN(se.statistics.sleep_start); + PN(se.statistics.block_start); + PN(se.statistics.sleep_max); + PN(se.statistics.block_max); + PN(se.statistics.exec_max); + PN(se.statistics.slice_max); + PN(se.statistics.wait_max); + PN(se.statistics.wait_sum); + P(se.statistics.wait_count); + PN(se.statistics.iowait_sum); + P(se.statistics.iowait_count); + P(se.statistics.nr_migrations_cold); + P(se.statistics.nr_failed_migrations_affine); + P(se.statistics.nr_failed_migrations_running); + P(se.statistics.nr_failed_migrations_hot); + P(se.statistics.nr_forced_migrations); + P(se.statistics.nr_wakeups); + P(se.statistics.nr_wakeups_sync); + P(se.statistics.nr_wakeups_migrate); + P(se.statistics.nr_wakeups_local); + P(se.statistics.nr_wakeups_remote); + P(se.statistics.nr_wakeups_affine); + P(se.statistics.nr_wakeups_affine_attempts); + P(se.statistics.nr_wakeups_passive); + P(se.statistics.nr_wakeups_idle); + avg_atom = p->se.sum_exec_runtime; if (nr_switches) avg_atom = div64_ul(avg_atom, nr_switches); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 56b7d4b..51a4550 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,8 +20,8 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ -#include #include +#include #include #include #include @@ -755,7 +755,9 @@ static void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct task_struct *p; - u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + u64 delta; + + delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; if (entity_is_task(se)) { p = task_of(se); @@ -776,22 +778,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) se->statistics.wait_sum += delta; se->statistics.wait_start = 0; } -#else -static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} - -static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -} -#endif /* * Task is being enqueued - update stats: */ -static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* * Are we enqueueing a waiting task? 
(for current tasks @@ -802,7 +794,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) } static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Mark the end of the wait period if dequeueing a @@ -810,7 +802,40 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); + + if (flags & DEQUEUE_SLEEP) { + if (entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + if (tsk->state & TASK_INTERRUPTIBLE) + se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); + if (tsk->state & TASK_UNINTERRUPTIBLE) + se->statistics.block_start = rq_clock(rq_of(cfs_rq)); + } + } + +} +#else +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ } +#endif /* * We are picking a new current task - update its stats: @@ -3102,6 +3127,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +static inline void check_schedstat_required(void) +{ +#ifdef CONFIG_SCHEDSTATS + if (schedstat_enabled()) + return; + + /* Force schedstat enabled if a dependent tracepoint is active */ + if (trace_sched_stat_wait_enabled() || + trace_sched_stat_sleep_enabled() || + trace_sched_stat_iowait_enabled() || + trace_sched_stat_blocked_enabled() || + trace_sched_stat_runtime_enabled()) { + pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " + "stat_blocked and stat_runtime require the " + "kernel parameter schedstats=enabled or " + "kernel.sched_schedstats=1\n"); + } +#endif +} + static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -3122,11 +3167,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); - enqueue_sleeper(cfs_rq, se); + if (schedstat_enabled()) + enqueue_sleeper(cfs_rq, se); } - update_stats_enqueue(cfs_rq, se); - check_spread(cfs_rq, se); + check_schedstat_required(); + if (schedstat_enabled()) { + update_stats_enqueue(cfs_rq, se); + check_spread(cfs_rq, se); + } if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; @@ -3193,19 +3242,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_curr(cfs_rq); dequeue_entity_load_avg(cfs_rq, se); - update_stats_dequeue(cfs_rq, se); - if (flags & DEQUEUE_SLEEP) { -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - struct task_struct *tsk = task_of(se); - - if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); - if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_clock(rq_of(cfs_rq)); - } -#endif - } + if (schedstat_enabled()) + update_stats_dequeue(cfs_rq, se, flags); clear_buddies(cfs_rq, se); @@ -3279,7 +3317,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * a CPU. So account for the time it spent waiting on the * runqueue. 
*/ - update_stats_wait_end(cfs_rq, se); + if (schedstat_enabled()) + update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(se, 1); } @@ -3292,7 +3331,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * least twice that of our own weight (i.e. dont track it * when there are only lesser-weight tasks around): */ - if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { + if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { se->statistics.slice_max = max(se->statistics.slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime); } @@ -3375,9 +3414,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); - check_spread(cfs_rq, prev); + if (schedstat_enabled()) { + check_spread(cfs_rq, prev); + if (prev->on_rq) + update_stats_wait_start(cfs_rq, prev); + } + if (prev->on_rq) { - update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 10f1637..1d58387 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1022,6 +1022,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ extern struct static_key_false sched_numa_balancing; +extern struct static_key_false sched_schedstats; static inline u64 global_rt_period(void) { diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index b0fbc76..70b3b6a 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) if (rq) rq->rq_sched_info.run_delay += delta; } -# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) -# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) -# define schedstat_set(var, val) do { var = (val); } while (0) +# define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) +# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) +# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) #else /* !CONFIG_SCHEDSTATS */ static inline void rq_sched_info_arrive(struct rq *rq, unsigned long long delta) @@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} +# define schedstat_enabled() 0 # define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) # define schedstat_set(var, val) do { } while (0) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 97715fd..f5102fab 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -350,6 +350,17 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHEDSTATS + { + .procname = "sched_schedstats", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_schedstats, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* CONFIG_SCHEDSTATS */ #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING { @@ -505,7 +516,7 @@ static struct ctl_table kern_table[] = { .data = &latencytop_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = 
sysctl_latencytop, }, #endif #ifdef CONFIG_BLK_DEV_INITRD -- cgit v0.10.2 From a63f38cc4ccfa076f87fc3d0c276ee62e710f953 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 3 Feb 2016 13:44:12 -0800 Subject: locking/lockdep: Convert hash tables to hlists Mike said: : CONFIG_UBSAN_ALIGNMENT breaks x86-64 kernel with lockdep enabled, i.e. : kernel with CONFIG_UBSAN_ALIGNMENT=y fails to load without even any error : message. : : The problem is that ubsan callbacks use spinlocks and might be called : before lockdep is initialized. Particularly this line in the : reserve_ebda_region function causes problem: : : lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); : : If i put lockdep_init() before reserve_ebda_region call in : x86_64_start_reservations kernel loads well. Fix this ordering issue permanently: change lockdep so that it uses hlists for the hash tables. Unlike a list_head, an hlist_head is in its initialized state when it is all-zeroes, so lockdep is ready for operation immediately upon boot - lockdep_init() need not have run. The patch will also save some memory. Probably lockdep_init() and lockdep_initialized can be done away with now. Suggested-by: Mike Krinkin Reported-by: Mike Krinkin Signed-off-by: Andrew Morton Cc: Andrey Ryabinin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: mm-commits@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index c57e424..4dca42f 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -66,7 +66,7 @@ struct lock_class { /* * class-hash: */ - struct list_head hash_entry; + struct hlist_node hash_entry; /* * global list of all lock-classes: @@ -199,7 +199,7 @@ struct lock_chain { u8 irq_context; u8 depth; u16 base; - struct list_head entry; + struct hlist_node entry; u64 chain_key; }; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c7710e4..716547f 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -292,7 +292,7 @@ LIST_HEAD(all_lock_classes); #define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) #define classhashentry(key) (classhash_table + __classhashfn((key))) -static struct list_head classhash_table[CLASSHASH_SIZE]; +static struct hlist_head classhash_table[CLASSHASH_SIZE]; /* * We put the lock dependency chains into a hash-table as well, to cache @@ -303,7 +303,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE]; #define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) #define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) -static struct list_head chainhash_table[CHAINHASH_SIZE]; +static struct hlist_head chainhash_table[CHAINHASH_SIZE]; /* * The hash key of the lock dependency chains is a hash itself too: @@ -666,7 +666,7 @@ static inline struct lock_class * look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; #ifdef CONFIG_DEBUG_LOCKDEP @@ -719,7 +719,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return NULL; - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? 
Did someone trample @@ -742,7 +742,7 @@ static inline struct lock_class * register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; DEBUG_LOCKS_WARN_ON(!irqs_disabled()); @@ -774,7 +774,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We have to do the hash-walk again, to avoid races * with another CPU: */ - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) goto out_unlock_set; } @@ -805,7 +805,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We use RCU's safe list-add method to make * parallel walking of the hash-list safe: */ - list_add_tail_rcu(&class->hash_entry, hash_head); + hlist_add_head_rcu(&class->hash_entry, hash_head); /* * Add it to the global list of classes: */ @@ -2021,7 +2021,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, u64 chain_key) { struct lock_class *class = hlock_class(hlock); - struct list_head *hash_head = chainhashentry(chain_key); + struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; struct held_lock *hlock_curr; int i, j; @@ -2037,7 +2037,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, * We can walk it lock-free, because entries only get added * to the hash: */ - list_for_each_entry_rcu(chain, hash_head, entry) { + hlist_for_each_entry_rcu(chain, hash_head, entry) { if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(chain_lookup_hits); @@ -2061,7 +2061,7 @@ cache_hit: /* * We have to walk the chain again locked - to avoid duplicates: */ - list_for_each_entry(chain, hash_head, entry) { + hlist_for_each_entry(chain, hash_head, entry) { if (chain->chain_key == chain_key) { graph_unlock(); goto cache_hit; @@ -2095,7 +2095,7 @@ cache_hit: } chain_hlocks[chain->base + j] = class - lock_classes; } - list_add_tail_rcu(&chain->entry, hash_head); + hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); inc_chains(); @@ -3879,7 +3879,7 @@ void lockdep_reset(void) nr_process_chains = 0; debug_locks = 1; for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); + INIT_HLIST_HEAD(chainhash_table + i); raw_local_irq_restore(flags); } @@ -3898,7 +3898,7 @@ static void zap_class(struct lock_class *class) /* * Unhash the class and remove it from the all_lock_classes list: */ - list_del_rcu(&class->hash_entry); + hlist_del_rcu(&class->hash_entry); list_del_rcu(&class->lock_entry); RCU_INIT_POINTER(class->key, NULL); @@ -3921,7 +3921,7 @@ static inline int within(const void *addr, void *start, unsigned long size) void lockdep_free_key_range(void *start, unsigned long size) { struct lock_class *class; - struct list_head *head; + struct hlist_head *head; unsigned long flags; int i; int locked; @@ -3934,9 +3934,7 @@ void lockdep_free_key_range(void *start, unsigned long size) */ for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { if (within(class->key, start, size)) zap_class(class); else if (within(class->name, start, size)) @@ -3966,7 +3964,7 @@ void lockdep_free_key_range(void *start, unsigned long size) void lockdep_reset_lock(struct lockdep_map *lock) { struct lock_class *class; - 
struct list_head *head; + struct hlist_head *head; unsigned long flags; int i, j; int locked; @@ -3991,9 +3989,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) locked = graph_lock(); for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { int match = 0; for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) @@ -4031,10 +4027,10 @@ void lockdep_init(void) return; for (i = 0; i < CLASSHASH_SIZE; i++) - INIT_LIST_HEAD(classhash_table + i); + INIT_HLIST_HEAD(classhash_table + i); for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); + INIT_HLIST_HEAD(chainhash_table + i); lockdep_initialized = 1; } -- cgit v0.10.2 From 06bea3dbfe6a4c333c4333362c46bdf4d9e43504 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 4 Feb 2016 11:29:36 -0800 Subject: locking/lockdep: Eliminate lockdep_init() Lockdep is initialized at compile time now. Get rid of lockdep_init(). Signed-off-by: Andrey Ryabinin Signed-off-by: Andrew Morton Cc: Linus Torvalds Cc: Mike Krinkin Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: mm-commits@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/c6x/kernel/setup.c b/arch/c6x/kernel/setup.c index 72e17f7..786e36e 100644 --- a/arch/c6x/kernel/setup.c +++ b/arch/c6x/kernel/setup.c @@ -281,8 +281,6 @@ notrace void __init machine_init(unsigned long dt_ptr) */ set_ist(_vectors_start); - lockdep_init(); - /* * dtb is passed in from bootloader. * fdt is linked in blob. diff --git a/arch/microblaze/kernel/setup.c b/arch/microblaze/kernel/setup.c index 89a2a93..f31ebb5 100644 --- a/arch/microblaze/kernel/setup.c +++ b/arch/microblaze/kernel/setup.c @@ -130,8 +130,6 @@ void __init machine_early_init(const char *cmdline, unsigned int ram, memset(__bss_start, 0, __bss_stop-__bss_start); memset(_ssbss, 0, _esbss-_ssbss); - lockdep_init(); - /* initialize device tree for usage in early_printk */ early_init_devtree(_fdt_start); diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index ad8c9db..d544fa3 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -114,8 +114,6 @@ extern unsigned int memset_nocache_branch; /* Insn to be replaced by NOP */ notrace void __init machine_init(u64 dt_ptr) { - lockdep_init(); - /* Enable early debugging if any specified (see udbg.h) */ udbg_early_init(); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 5c03a6a..f98be83 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -255,9 +255,6 @@ void __init early_setup(unsigned long dt_ptr) setup_paca(&boot_paca); fixup_boot_paca(); - /* Initialize lockdep early or else spinlocks will blow */ - lockdep_init(); - /* -------- printk is now safe to use ------- */ /* Enable early debugging if any specified (see udbg.h) */ diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index c55576b..a0684de 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -448,7 +448,6 @@ void __init startup_init(void) rescue_initrd(); clear_bss_section(); init_kernel_storage_key(); - lockdep_init(); lockdep_off(); setup_lowcore_early(); setup_facility_list(); diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S index f2d30ca..cd1f592 100644 --- a/arch/sparc/kernel/head_64.S +++ b/arch/sparc/kernel/head_64.S @@ -696,14 +696,6 @@ tlb_fixup_done: 
call __bzero sub %o1, %o0, %o1 -#ifdef CONFIG_LOCKDEP - /* We have this call this super early, as even prom_init can grab - * spinlocks and thus call into the lockdep code. - */ - call lockdep_init - nop -#endif - call prom_init mov %l7, %o0 ! OpenPROM cif handler diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 4ba229a..f56cc41 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1520,12 +1520,6 @@ __init void lguest_init(void) */ reserve_top_address(lguest_data.reserve_mem); - /* - * If we don't initialize the lock dependency checker now, it crashes - * atomic_notifier_chain_register, then paravirt_disable_iospace. - */ - lockdep_init(); - /* Hook in our special panic hypercall code. */ atomic_notifier_chain_register(&panic_notifier_list, &paniced); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 4dca42f..d026b19 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -261,7 +261,6 @@ struct held_lock { /* * Initialization, self-test and debugging-output methods: */ -extern void lockdep_init(void); extern void lockdep_info(void); extern void lockdep_reset(void); extern void lockdep_reset_lock(struct lockdep_map *lock); @@ -392,7 +391,6 @@ static inline void lockdep_on(void) # define lockdep_set_current_reclaim_state(g) do { } while (0) # define lockdep_clear_current_reclaim_state() do { } while (0) # define lockdep_trace_alloc(g) do { } while (0) -# define lockdep_init() do { } while (0) # define lockdep_info() do { } while (0) # define lockdep_init_map(lock, name, key, sub) \ do { (void)(name); (void)(key); } while (0) diff --git a/init/main.c b/init/main.c index 58c9e37..b3008bc 100644 --- a/init/main.c +++ b/init/main.c @@ -499,11 +499,6 @@ asmlinkage __visible void __init start_kernel(void) char *command_line; char *after_dashes; - /* - * Need to run as early as possible, to initialize the - * lockdep hash: - */ - lockdep_init(); set_task_stack_end_magic(&init_task); smp_setup_processor_id(); debug_objects_early_init(); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 716547f..3261214 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -123,8 +123,6 @@ static inline int debug_locks_off_graph_unlock(void) return ret; } -static int lockdep_initialized; - unsigned long nr_list_entries; static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; @@ -434,19 +432,6 @@ unsigned int max_lockdep_depth; #ifdef CONFIG_DEBUG_LOCKDEP /* - * We cannot printk in early bootup code. Not even early_printk() - * might work. So we mark any initialization errors and printk - * about it later on, in lockdep_info(). - */ -static int lockdep_init_error; -static const char *lock_init_error; -static unsigned long lockdep_init_trace_data[20]; -static struct stack_trace lockdep_init_trace = { - .max_entries = ARRAY_SIZE(lockdep_init_trace_data), - .entries = lockdep_init_trace_data, -}; - -/* * Various lockdep statistics: */ DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); @@ -669,20 +654,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) struct hlist_head *hash_head; struct lock_class *class; -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * If the architecture calls into lockdep before initializing - * the hashes then we'll warn about it later. 
(we cannot printk - * right now) - */ - if (unlikely(!lockdep_initialized)) { - lockdep_init(); - lockdep_init_error = 1; - lock_init_error = lock->name; - save_stack_trace(&lockdep_init_trace); - } -#endif - if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { debug_locks_off(); printk(KERN_ERR @@ -4013,28 +3984,6 @@ out_restore: raw_local_irq_restore(flags); } -void lockdep_init(void) -{ - int i; - - /* - * Some architectures have their own start_kernel() - * code which calls lockdep_init(), while we also - * call lockdep_init() from the start_kernel() itself, - * and we want to initialize the hashes only once: - */ - if (lockdep_initialized) - return; - - for (i = 0; i < CLASSHASH_SIZE; i++) - INIT_HLIST_HEAD(classhash_table + i); - - for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_HLIST_HEAD(chainhash_table + i); - - lockdep_initialized = 1; -} - void __init lockdep_info(void) { printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); @@ -4061,14 +4010,6 @@ void __init lockdep_info(void) printk(" per task-struct memory footprint: %lu bytes\n", sizeof(struct held_lock) * MAX_LOCK_DEPTH); - -#ifdef CONFIG_DEBUG_LOCKDEP - if (lockdep_init_error) { - printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error); - printk("Call stack leading to lockdep invocation was:\n"); - print_stack_trace(&lockdep_init_trace, 0); - } -#endif } static void diff --git a/tools/lib/lockdep/common.c b/tools/lib/lockdep/common.c index 9be6633..d1c89cc 100644 --- a/tools/lib/lockdep/common.c +++ b/tools/lib/lockdep/common.c @@ -11,11 +11,6 @@ static __thread struct task_struct current_obj; bool debug_locks = true; bool debug_locks_silent; -__attribute__((constructor)) static void liblockdep_init(void) -{ - lockdep_init(); -} - __attribute__((destructor)) static void liblockdep_exit(void) { debug_check_no_locks_held(); diff --git a/tools/lib/lockdep/include/liblockdep/common.h b/tools/lib/lockdep/include/liblockdep/common.h index a60c14b..6e66277 100644 --- a/tools/lib/lockdep/include/liblockdep/common.h +++ b/tools/lib/lockdep/include/liblockdep/common.h @@ -44,7 +44,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, void lock_release(struct lockdep_map *lock, int nested, unsigned long ip); extern void debug_check_no_locks_freed(const void *from, unsigned long len); -extern void lockdep_init(void); #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ { .name = (_name), .key = (void *)(_key), } diff --git a/tools/lib/lockdep/preload.c b/tools/lib/lockdep/preload.c index 21cdf86..5284484 100644 --- a/tools/lib/lockdep/preload.c +++ b/tools/lib/lockdep/preload.c @@ -439,7 +439,5 @@ __attribute__((constructor)) static void init_preload(void) ll_pthread_rwlock_unlock = dlsym(RTLD_NEXT, "pthread_rwlock_unlock"); #endif - lockdep_init(); - __init_state = done; } -- cgit v0.10.2 From 69e0210fd01ff157d332102219aaf5c26ca8069b Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 11 Jan 2016 15:51:18 +0300 Subject: x86/kasan: Clear kasan_zero_page after TLB flush Currently we clear kasan_zero_page before __flush_tlb_all(). This works with the current implementation of native_flush_tlb[_global]() because it doesn't do any writes to kasan shadow memory. But any subtle change made in native_flush_tlb*() could break this. Also, the current code doesn't seem to work for paravirt guests (lguest). Only after the TLB flush can we be sure that kasan_zero_page is not used as early shadow anymore (instrumented code will not write to it). 
So it should be cleared only after the TLB flush. Signed-off-by: Andrey Ryabinin Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1452516679-32040-2-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index d470cf2..303e470 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -120,11 +120,16 @@ void __init kasan_init(void) kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), (void *)KASAN_SHADOW_END); - memset(kasan_zero_page, 0, PAGE_SIZE); - load_cr3(init_level4_pgt); __flush_tlb_all(); - init_task.kasan_depth = 0; + /* + * kasan_zero_page has been used as early shadow memory, thus it may + * contain some garbage. Now we can clear it, since after the TLB flush + * no one should write to it. + */ + memset(kasan_zero_page, 0, PAGE_SIZE); + + init_task.kasan_depth = 0; pr_info("KernelAddressSanitizer initialized\n"); } -- cgit v0.10.2 From 063fb3e56f6dd29b2633b678b837e1d904200e6f Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 11 Jan 2016 15:51:19 +0300 Subject: x86/kasan: Write protect kasan zero shadow After kasan_init() has executed, no one is allowed to write to kasan_zero_page, so write-protect it. Signed-off-by: Andrey Ryabinin Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1452516679-32040-3-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 303e470..1b1110f 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -125,10 +125,16 @@ void __init kasan_init(void) /* * kasan_zero_page has been used as early shadow memory, thus it may - * contain some garbage. Now we can clear it, since after the TLB flush - * no one should write to it. + * contain some garbage. Now we can clear and write protect it, since + * after the TLB flush no one should write to it. */ memset(kasan_zero_page, 0, PAGE_SIZE); + for (i = 0; i < PTRS_PER_PTE; i++) { + pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); + set_pte(&kasan_zero_pte[i], pte); + } + /* Flush TLBs again to be sure that write protection applied. */ + __flush_tlb_all(); init_task.kasan_depth = 0; pr_info("KernelAddressSanitizer initialized\n"); -- cgit v0.10.2 From 060a402a1ddb551455ee410de2eadd3349f2801b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 29 Jan 2016 11:42:57 -0800 Subject: x86/mm: Add INVPCID helpers This adds helpers for each of the four currently-specified INVPCID modes. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/8a62b23ad686888cee01da134c91409e22064db9.1454096309.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6df2029..8b57683 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -7,6 +7,54 @@ #include #include +static inline void __invpcid(unsigned long pcid, unsigned long addr, + unsigned long type) +{ + u64 desc[2] = { pcid, addr }; + + /* + * The memory clobber is because the whole point is to invalidate + * stale TLB entries and, especially if we're flushing global + * mappings, we don't want the compiler to reorder any subsequent + * memory accesses before the TLB flush. + * + * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and + * invpcid (%rcx), %rax in long mode. + */ + asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" + : : "m" (desc), "a" (type), "c" (desc) : "memory"); +} + +#define INVPCID_TYPE_INDIV_ADDR 0 +#define INVPCID_TYPE_SINGLE_CTXT 1 +#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 +#define INVPCID_TYPE_ALL_NON_GLOBAL 3 + +/* Flush all mappings for a given pcid and addr, not including globals. */ +static inline void invpcid_flush_one(unsigned long pcid, + unsigned long addr) +{ + __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); +} + +/* Flush all mappings for a given PCID, not including globals. */ +static inline void invpcid_flush_single_context(unsigned long pcid) +{ + __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); +} + +/* Flush all mappings, including globals, for all PCIDs. */ +static inline void invpcid_flush_all(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); +} + +/* Flush all mappings for all PCIDs except globals. */ +static inline void invpcid_flush_all_nonglobals(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); +} + #ifdef CONFIG_PARAVIRT #include #else -- cgit v0.10.2 From d12a72b844a49d4162f24cefdab30bed3f86730e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 29 Jan 2016 11:42:58 -0800 Subject: x86/mm: Add a 'noinvpcid' boot option to turn off INVPCID This adds a chicken bit to turn off INVPCID in case something goes wrong. It's an early_param() because we do TLB flushes before we parse __setup() parameters. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/f586317ed1bc2b87aee652267e515b90051af385.1454096309.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 551ecf0..e4c4d2a 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2566,6 +2566,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nointroute [IA-64] + noinvpcid [X86] Disable the INVPCID cpu feature. + nojitter [IA-64] Disables jitter checking for ITC timers. 
no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 37830de..f4d0aa6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -162,6 +162,22 @@ static int __init x86_mpx_setup(char *s) } __setup("nompx", x86_mpx_setup); +static int __init x86_noinvpcid_setup(char *s) +{ + /* noinvpcid doesn't accept parameters */ + if (s) + return -EINVAL; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_INVPCID)) + return 0; + + setup_clear_cpu_cap(X86_FEATURE_INVPCID); + pr_info("noinvpcid: INVPCID feature disabled\n"); + return 0; +} +early_param("noinvpcid", x86_noinvpcid_setup); + #ifdef CONFIG_X86_32 static int cachesize_override = -1; static int disable_x86_serial_nr = 1; -- cgit v0.10.2 From d8bced79af1db6734f66b42064cc773cada2ce99 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 29 Jan 2016 11:42:59 -0800 Subject: x86/mm: If INVPCID is available, use it to flush global mappings On my Skylake laptop, INVPCID function 2 (flush absolutely everything) takes about 376ns, whereas saving flags, twiddling CR4.PGE to flush global mappings, and restoring flags takes about 539ns. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/ed0ef62581c0ea9c99b9bf6df726015e96d44743.1454096309.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 8b57683..fc9a2fd 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -152,6 +152,15 @@ static inline void __native_flush_tlb_global(void) { unsigned long flags; + if (static_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Using INVPCID is considerably faster than a pair of writes + * to CR4 sandwiched inside an IRQ flag save/restore. + */ + invpcid_flush_all(); + return; + } + /* * Read-modify-write to CR4 - protect it from preemption and * from interrupts. (Use the raw variant because this code can -- cgit v0.10.2 From ce1143aa60273220a9f89012f2aaaed04f97e9a2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 25 Jan 2016 23:06:49 -0800 Subject: x86/dmi: Switch dmi_remap() from ioremap() [uncached] to ioremap_cache() DMI cacheability is very confused on x86. dmi_early_remap() uses early_ioremap(), which uses FIXMAP_PAGE_IO, which is __PAGE_KERNEL_IO, which is __PAGE_KERNEL, which is cached. Don't ask me why this makes any sense. dmi_remap() uses ioremap(), which requests an uncached mapping. However, on non-EFI systems, the DMI data generally lives between 0xf0000 and 0x100000, which is in the legacy ISA range, which triggers a special case in the PAT code that overrides the cache mode requested by ioremap() and forces a WB mapping. On a UEFI boot, however, the DMI table can live at any physical address. On my laptop, it's around 0x77dd0000. That's nowhere near the legacy ISA range, so the ioremap() implicit uncached type is honored and we end up with a UC- mapping. UC- is a very, very slow way to read from main memory, so dmi_walk() is likely to take much longer than necessary. Given that, even on UEFI, we do early cached DMI reads, it seems safe to just ask for cached access. 
Switch to ioremap_cache(). I haven't tried to benchmark this, but I'd guess it saves several milliseconds of boot time. Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jean Delvare Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Link: http://lkml.kernel.org/r/3147c38e51f439f3c8911db34c7d4ab22d854915.1453791969.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h index 535192f..3c69fed 100644 --- a/arch/x86/include/asm/dmi.h +++ b/arch/x86/include/asm/dmi.h @@ -15,7 +15,7 @@ static __always_inline __init void *dmi_alloc(unsigned len) /* Use early IO mappings for DMI because it's initialized early */ #define dmi_early_remap early_ioremap #define dmi_early_unmap early_iounmap -#define dmi_remap ioremap +#define dmi_remap ioremap_cache #define dmi_unmap iounmap #endif /* _ASM_X86_DMI_H */ -- cgit v0.10.2 From 4142c3ebb685bb338b7d96090d8f90ff49065ff6 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 25 Jan 2016 17:07:39 -0500 Subject: sched/numa: Spread memory according to CPU and memory use The pseudo-interleaving in NUMA placement has a fundamental problem: using hard usage thresholds to spread memory equally between nodes can prevent workloads from converging, or keep memory "trapped" on nodes where the workload is barely running any more. In order for workloads to properly converge, the memory migration should not be stopped when nodes reach parity, but instead be distributed according to how heavily memory is used from each node. This way memory migration and task migration reinforce each other, instead of one putting the brakes on the other. Remove the hard thresholds from the pseudo-interleaving code, and instead use a more gradual policy on memory placement. This also seems to improve convergence of workloads that do not run flat out, but sleep in between bursts of activity. We still want to slow down NUMA scanning and migration once a workload has settled on a few actively used nodes, so keep the 3/4 hysteresis in place. Keep track of whether a workload is actively running on multiple nodes, so task_numa_migrate does a full scan of the system for better task placement. In the case of running 3 SPECjbb2005 instances on a 4 node system, this code seems to result in fairer distribution of memory between nodes, with more memory bandwidth for each instance. Signed-off-by: Rik van Riel Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: mgorman@suse.de Link: http://lkml.kernel.org/r/20160125170739.2fc9a641@annuminas.surriel.com [ Minor readability tweaks. ] Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 51a4550..7ce24a4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -932,10 +932,11 @@ struct numa_group { spinlock_t lock; /* nr_tasks, tasks */ int nr_tasks; pid_t gid; + int active_nodes; struct rcu_head rcu; - nodemask_t active_nodes; unsigned long total_faults; + unsigned long max_faults_cpu; /* * Faults_cpu is used to decide whether memory should move * towards the CPU. 
As a consequence, these stats are weighted @@ -994,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; } +/* + * A node triggering more than 1/3 as many NUMA faults as the maximum is + * considered part of a numa group's pseudo-interleaving set. Migrations + * between these nodes are slowed down, to allow things to settle down. + */ +#define ACTIVE_NODE_FRACTION 3 + +static bool numa_is_active_node(int nid, struct numa_group *ng) +{ + return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu; +} + /* Handle placement on systems where not all nodes are directly connected. */ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, int maxdist, bool task) @@ -1143,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, return true; /* - * Do not migrate if the destination is not a node that - * is actively used by this numa group. + * Destination node is much more heavily used than the source + * node? Allow migration. */ - if (!node_isset(dst_nid, ng->active_nodes)) - return false; - - /* - * Source is a node that is not actively used by this - * numa group, while the destination is. Migrate. - */ - if (!node_isset(src_nid, ng->active_nodes)) + if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) * + ACTIVE_NODE_FRACTION) return true; /* - * Both source and destination are nodes in active - * use by this numa group. Maximize memory bandwidth - * by migrating from more heavily used groups, to less - * heavily used ones, spreading the load around. - * Use a 1/4 hysteresis to avoid spurious page movement. + * Distribute memory according to CPU & memory use on each node, + * with 3/4 hysteresis to avoid unnecessary memory migrations: + * + * faults_cpu(dst) 3 faults_cpu(src) + * --------------- * - > --------------- + * faults_mem(dst) 4 faults_mem(src) */ - return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); + return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 > + group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } static unsigned long weighted_cpuload(const int cpu); @@ -1509,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p) .best_task = NULL, .best_imp = 0, - .best_cpu = -1 + .best_cpu = -1, }; struct sched_domain *sd; unsigned long taskweight, groupweight; @@ -1561,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p) * multiple NUMA nodes; in order to better consolidate the group, * we need to check other locations. */ - if (env.best_cpu == -1 || (p->numa_group && - nodes_weight(p->numa_group->active_nodes) > 1)) { + if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; @@ -1597,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p) * trying for a better one later. Do not set the preferred node here. */ if (p->numa_group) { + struct numa_group *ng = p->numa_group; + if (env.best_cpu == -1) nid = env.src_nid; else nid = env.dst_nid; - if (node_isset(nid, p->numa_group->active_nodes)) + if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) sched_setnuma(p, env.dst_nid); } @@ -1652,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p) } /* - * Find the nodes on which the workload is actively running. We do this by + * Find out how many nodes on the workload is actively running on. 
Do this by * tracking the nodes from which NUMA hinting faults are triggered. This can * be different from the set of nodes where the workload's memory is currently * located. - * - * The bitmask is used to make smarter decisions on when to do NUMA page - * migrations, To prevent flip-flopping, and excessive page migrations, nodes - * are added when they cause over 6/16 of the maximum number of faults, but - * only removed when they drop below 3/16. */ -static void update_numa_active_node_mask(struct numa_group *numa_group) +static void numa_group_count_active_nodes(struct numa_group *numa_group) { unsigned long faults, max_faults = 0; - int nid; + int nid, active_nodes = 0; for_each_online_node(nid) { faults = group_faults_cpu(numa_group, nid); @@ -1675,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group) for_each_online_node(nid) { faults = group_faults_cpu(numa_group, nid); - if (!node_isset(nid, numa_group->active_nodes)) { - if (faults > max_faults * 6 / 16) - node_set(nid, numa_group->active_nodes); - } else if (faults < max_faults * 3 / 16) - node_clear(nid, numa_group->active_nodes); + if (faults * ACTIVE_NODE_FRACTION > max_faults) + active_nodes++; } + + numa_group->max_faults_cpu = max_faults; + numa_group->active_nodes = active_nodes; } /* @@ -1971,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p) update_task_scan_period(p, fault_types[0], fault_types[1]); if (p->numa_group) { - update_numa_active_node_mask(p->numa_group); + numa_group_count_active_nodes(p->numa_group); spin_unlock_irq(group_lock); max_nid = preferred_group_nid(p, max_group_nid); } @@ -2015,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, return; atomic_set(&grp->refcount, 1); + grp->active_nodes = 1; + grp->max_faults_cpu = 0; spin_lock_init(&grp->lock); grp->gid = p->pid; /* Second half of the array tracks nids where faults happen */ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * nr_node_ids; - node_set(task_node(current), grp->active_nodes); - for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] = p->numa_faults[i]; @@ -2136,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) bool migrated = flags & TNF_MIGRATED; int cpu_node = task_node(current); int local = !!(flags & TNF_FAULT_LOCAL); + struct numa_group *ng; int priv; if (!static_branch_likely(&sched_numa_balancing)) @@ -2176,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) * actively using should be counted as local. This allows the * scan rate to slow down when a workload has settled down. */ - if (!priv && !local && p->numa_group && - node_isset(cpu_node, p->numa_group->active_nodes) && - node_isset(mem_node, p->numa_group->active_nodes)) + ng = p->numa_group; + if (!priv && !local && ng && ng->active_nodes > 1 && + numa_is_active_node(cpu_node, ng) && + numa_is_active_node(mem_node, ng)) local = 1; task_numa_placement(p); -- cgit v0.10.2 From fed0764fafd8e2e629a033c0f7df4106b0dcb7f0 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 25 Jan 2016 16:33:20 -0500 Subject: locking/atomics: Update comment about READ_ONCE() and structures The comment is out of data. Also point out the performance drawback of the barrier();__builtin_memcpy(); barrier() followed by another copy from stack (__u) to lvalue; Signed-off-by: Konrad Rzeszutek Wilk Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1453757600-11441-1-git-send-email-konrad.wilk@oracle.com [ Made it a bit more readable. ] Signed-off-by: Ingo Molnar diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 00b042c..4291592 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -263,8 +263,9 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s * In contrast to ACCESS_ONCE these two macros will also work on aggregate * data types like structs or unions. If the size of the accessed data * type exceeds the word size of the machine (e.g., 32 bits or 64 bits) - * READ_ONCE() and WRITE_ONCE() will fall back to memcpy and print a - * compile-time warning. + * READ_ONCE() and WRITE_ONCE() will fall back to memcpy(). There's at + * least two memcpy()s: one for the __builtin_memcpy() and then one for + * the macro doing the copy of variable - '__u' allocated on the stack. * * Their two major use cases are: (1) Mediating communication between * process-level code and irq/NMI handlers, all running on the same CPU, -- cgit v0.10.2 From a91bbe017552b80e12d712c85549b933a62c6ed4 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Tue, 9 Feb 2016 19:44:54 +0600 Subject: x86/boot: Use proper array element type in memset() size calculation I changed open coded zeroing loops to explicit memset()s in the following commit: 5e9ebbd87a99 ("x86/boot: Micro-optimize reset_early_page_tables()") The base for the size argument of memset was sizeof(pud_p/pmd_p), which are pointers - but the initialized array has pud_t/pmd_t elements. Luckily the two types had the same size, so this did not result in any runtime misbehavior. Signed-off-by: Alexander Kuleshov Cc: Alexander Popov Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Shevchenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455025494-4063-1-git-send-email-kuleshovmail@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 35843ca..7793a17 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -75,7 +75,7 @@ again: } pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; - memset(pud_p, 0, sizeof(pud_p) * PTRS_PER_PUD); + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pud_p += pud_index(address); @@ -90,7 +90,7 @@ again: } pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; - memset(pmd_p, 0, sizeof(pmd_p) * PTRS_PER_PMD); + memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; } pmd = (physaddr & PMD_MASK) + early_pmd_flags; -- cgit v0.10.2 From dd7b6847670a84b7bb7c38f8e69b2f12059bca66 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 25 Jan 2016 12:25:15 -0500 Subject: x86/mm: Honour passed pgprot in track_pfn_insert() and track_pfn_remap() track_pfn_insert() overwrites the pgprot that is passed in with a value based on the VMA's page_prot. This is a problem for people trying to do clever things with the new vm_insert_pfn_prot() as it will simply overwrite the passed protection flags. If we use the current value of the pgprot as the base, then it will behave as people are expecting. Also fix track_pfn_remap() in the same way. 
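As a usage sketch of the kind of caller this fix enables (a hypothetical driver, not code from the patch itself), a .fault handler can now choose a per-page protection and rely on track_pfn_insert() preserving it when it calls vm_insert_pfn_prot(). The base pfn and the "page 0 is uncached device memory" rule below are assumptions made only for illustration:

        #include <linux/mm.h>

        static unsigned long example_base_pfn;  /* assumed: filled in at probe time */

        static int example_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        {
                unsigned long addr = (unsigned long)vmf->virtual_address;
                unsigned long pfn = example_base_pfn + vmf->pgoff;
                pgprot_t prot = vma->vm_page_prot;

                /* Assumption for the example: page 0 maps device registers. */
                if (vmf->pgoff == 0)
                        prot = pgprot_noncached(prot);

                /*
                 * track_pfn_insert() now uses the prot passed in as its base,
                 * so the uncached attribute chosen above is preserved.
                 */
                if (vm_insert_pfn_prot(vma, addr, pfn, prot))
                        return VM_FAULT_SIGBUS;

                return VM_FAULT_NOPAGE;
        }

Error handling is simplified here; a real handler may also want to treat a racing insert (an already-populated pte) as success rather than SIGBUS.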
Signed-off-by: Matthew Wilcox Acked-by: Andy Lutomirski Cc: Andrew Morton Cc: Andy Lutomirski Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1453742717-10326-2-git-send-email-matthew.r.wilcox@intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index f4ae536..04e2e71 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -943,7 +943,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, return -EINVAL; } - *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); return 0; @@ -959,7 +959,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, /* Set prot based on lookup */ pcm = lookup_memtype(pfn_t_to_phys(pfn)); - *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); return 0; -- cgit v0.10.2 From 4ecd16ec7059390b430af34bd8bc3ca2b5dcef9a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:06 -0800 Subject: x86/fpu: Fix math emulation in eager fpu mode Systems without an FPU are generally old and therefore use lazy FPU switching. Unsurprisingly, math emulation in eager FPU mode is a bit buggy. Fix it. There were two bugs involving kernel code trying to use the FPU registers in eager mode even if they didn't exist and one BUG_ON() that was incorrect. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/b4b8d112436bd6fab866e1b4011131507e8d7fbe.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 0fd440d..a1f78a9 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -589,7 +589,8 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) * If the task has used the math, pre-load the FPU on xsave processors * or if the past 5 consecutive context-switches used math. */ - fpu.preload = new_fpu->fpstate_active && + fpu.preload = static_cpu_has(X86_FEATURE_FPU) && + new_fpu->fpstate_active && (use_eager_fpu() || new_fpu->counter > 5); if (old_fpu->fpregs_active) { diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index d25097c..08e1e11 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -423,7 +423,7 @@ void fpu__clear(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); /* Almost certainly an anomaly */ - if (!use_eager_fpu()) { + if (!use_eager_fpu() || !static_cpu_has(X86_FEATURE_FPU)) { /* FPU state will be reallocated lazily at the first use. 
*/ fpu__drop(fpu); } else { diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ade185a..87f80fe 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -750,7 +750,6 @@ dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - BUG_ON(use_eager_fpu()); #ifdef CONFIG_MATH_EMULATION if (read_cr0() & X86_CR0_EM) { -- cgit v0.10.2 From 5ed73f40735c68d8a656b46d09b1885d3b8740ae Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:07 -0800 Subject: x86/fpu: Fix FNSAVE usage in eagerfpu mode In eager fpu mode, having deactivated FPU without immediately reloading some other context is illegal. Therefore, to recover from FNSAVE, we can't just deactivate the state -- we need to reload it if we're not actively context switching. We had this wrong in fpu__save() and fpu__copy(). Fix both. __kernel_fpu_begin() was fine -- add a comment. This fixes a warning triggerable with nofxsr eagerfpu=on. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/60662444e13c76f06e23c15c5dcdba31b4ac3d67.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 08e1e11..7a9244d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -114,6 +114,10 @@ void __kernel_fpu_begin(void) kernel_fpu_disable(); if (fpu->fpregs_active) { + /* + * Ignore return value -- we don't care if reg state + * is clobbered. + */ copy_fpregs_to_fpstate(fpu); } else { this_cpu_write(fpu_fpregs_owner_ctx, NULL); @@ -189,8 +193,12 @@ void fpu__save(struct fpu *fpu) preempt_disable(); if (fpu->fpregs_active) { - if (!copy_fpregs_to_fpstate(fpu)) - fpregs_deactivate(fpu); + if (!copy_fpregs_to_fpstate(fpu)) { + if (use_eager_fpu()) + copy_kernel_to_fpregs(&fpu->state); + else + fpregs_deactivate(fpu); + } } preempt_enable(); } @@ -259,7 +267,11 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) preempt_disable(); if (!copy_fpregs_to_fpstate(dst_fpu)) { memcpy(&src_fpu->state, &dst_fpu->state, xstate_size); - fpregs_deactivate(src_fpu); + + if (use_eager_fpu()) + copy_kernel_to_fpregs(&src_fpu->state); + else + fpregs_deactivate(src_fpu); } preempt_enable(); } -- cgit v0.10.2 From a20d7297045f7fdcd676c15243192eb0e95a4306 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:08 -0800 Subject: x86/fpu: Fold fpu_copy() into fpu__copy() Splitting it into two functions needlessly obfuscated the code. While we're at it, improve the comment slightly. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/3eb5a63a9c5c84077b2677a7dfe684eef96fe59e.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 7a9244d..299b58b 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -231,14 +231,15 @@ void fpstate_init(union fpregs_state *state) } EXPORT_SYMBOL_GPL(fpstate_init); -/* - * Copy the current task's FPU state to a new task's FPU context. - * - * In both the 'eager' and the 'lazy' case we save hardware registers - * directly to the destination buffer. - */ -static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) +int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { + dst_fpu->counter = 0; + dst_fpu->fpregs_active = 0; + dst_fpu->last_cpu = -1; + + if (!src_fpu->fpstate_active || !cpu_has_fpu) + return 0; + WARN_ON_FPU(src_fpu != ¤t->thread.fpu); /* @@ -251,10 +252,9 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) /* * Save current FPU registers directly into the child * FPU context, without any memory-to-memory copying. - * - * If the FPU context got destroyed in the process (FNSAVE - * done on old CPUs) then copy it back into the source - * context and mark the current task for lazy restore. + * In lazy mode, if the FPU context isn't loaded into + * fpregs, CR0.TS will be set and do_device_not_available + * will load the FPU context. * * We have to do all this with preemption disabled, * mostly because of the FNSAVE case, because in that @@ -274,16 +274,6 @@ static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu) fpregs_deactivate(src_fpu); } preempt_enable(); -} - -int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) -{ - dst_fpu->counter = 0; - dst_fpu->fpregs_active = 0; - dst_fpu->last_cpu = -1; - - if (src_fpu->fpstate_active && cpu_has_fpu) - fpu_copy(dst_fpu, src_fpu); return 0; } -- cgit v0.10.2 From c6ab109f7e0eae3bae3bb10f8ddb0df67735c150 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:09 -0800 Subject: x86/fpu: Speed up lazy FPU restores slightly If we have an FPU, there's no need to check CR0 for FPU emulation. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/980004297e233c27066d54e71382c44cdd36ef7c.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 87f80fe..36a9c01 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -752,7 +752,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); #ifdef CONFIG_MATH_EMULATION - if (read_cr0() & X86_CR0_EM) { + if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) { struct math_emu_info info = { }; conditional_sti(regs); -- cgit v0.10.2 From 58122bf1d856a4ea9581d62a07c557d997d46a19 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 24 Jan 2016 14:38:10 -0800 Subject: x86/fpu: Default eagerfpu=on on all CPUs We have eager and lazy FPU modes, introduced in: 304bceda6a18 ("x86, fpu: use non-lazy fpu restore for processors supporting xsave") The result is rather messy. There are two code paths in almost all of the FPU code, and only one of them (the eager case) is tested frequently, since most kernel developers have new enough hardware that we use eagerfpu. It seems that, on any remotely recent hardware, eagerfpu is a win: glibc uses SSE2, so laziness is probably overoptimistic, and, in any case, manipulating TS is far slower that saving and restoring the full state. (Stores to CR0.TS are serializing and are poorly optimized.) To try to shake out any latent issues on old hardware, this changes the default to eager on all CPUs. If no performance or functionality problems show up, a subsequent patch could remove lazy mode entirely. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Rik van Riel Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Link: http://lkml.kernel.org/r/ac290de61bf08d9cfc2664a4f5080257ffc1075a.1453675014.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 6d9f0a7..471fe27 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -260,7 +260,10 @@ static void __init fpu__init_system_xstate_size_legacy(void) * not only saved the restores along the way, but we also have the * FPU ready to be used for the original task. * - * 'eager' switching is used on modern CPUs, there we switch the FPU + * 'lazy' is deprecated because it's almost never a performance win + * and it's much more complicated than 'eager'. + * + * 'eager' switching is by default on all CPUs, there we switch the FPU * state during every context switch, regardless of whether the task * has used FPU instructions in that time slice or not. This is done * because modern FPU context saving instructions are able to optimize @@ -271,7 +274,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) * to use 'eager' restores, if we detect that a task is using the FPU * frequently. See the fpu->counter logic in fpu/internal.h for that. ] */ -static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; +static enum { ENABLE, DISABLE } eagerfpu = ENABLE; /* * Find supported xfeatures based on cpu features and command-line input. 
@@ -348,15 +351,9 @@ static void __init fpu__init_system_ctx_switch(void) */ static void __init fpu__init_parse_early_param(void) { - /* - * No need to check "eagerfpu=auto" again, since it is the - * initial default. - */ if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { eagerfpu = DISABLE; fpu__clear_eager_fpu_features(); - } else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) { - eagerfpu = ENABLE; } if (cmdline_find_option_bool(boot_command_line, "no387")) -- cgit v0.10.2 From 4f388a9210e2a2c1cd83a5580b74aa13876dbf14 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 5 Feb 2016 11:52:00 -0200 Subject: [media] media: davinci_vpfe: fix missing unlock on error in vpfe_prepare_pipeline() Add the missing unlock before return from function vpfe_prepare_pipeline() in the error handling case. video->lock is lock/unlock in function vpfe_open(), and no need to unlock it here, so remove unlock video->lock. Signed-off-by: Wei Yongjun Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/staging/media/davinci_vpfe/vpfe_video.c b/drivers/staging/media/davinci_vpfe/vpfe_video.c index 3ec7e65..db49af9 100644 --- a/drivers/staging/media/davinci_vpfe/vpfe_video.c +++ b/drivers/staging/media/davinci_vpfe/vpfe_video.c @@ -147,7 +147,7 @@ static int vpfe_prepare_pipeline(struct vpfe_video_device *video) mutex_lock(&mdev->graph_mutex); ret = media_entity_graph_walk_init(&graph, entity->graph_obj.mdev); if (ret) { - mutex_unlock(&video->lock); + mutex_unlock(&mdev->graph_mutex); return -ENOMEM; } media_entity_graph_walk_start(&graph, entity); -- cgit v0.10.2 From b888d232142c0ac111c7e82d1731c6fbd56fc1fe Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Sun, 7 Feb 2016 02:27:32 -0200 Subject: [media] media: i2c/adp1653: probe: fix erroneous return value The adp1653_probe() function may return positive value EINVAL which is obviously wrong. Signed-off-by: Anton Protopopov Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/i2c/adp1653.c b/drivers/media/i2c/adp1653.c index 7e9cbf7..fb7ed73 100644 --- a/drivers/media/i2c/adp1653.c +++ b/drivers/media/i2c/adp1653.c @@ -497,7 +497,7 @@ static int adp1653_probe(struct i2c_client *client, if (!client->dev.platform_data) { dev_err(&client->dev, "Neither DT not platform data provided\n"); - return EINVAL; + return -EINVAL; } flash->platform_data = client->dev.platform_data; } -- cgit v0.10.2 From 17e0521750399205f432966e602e125294879cdd Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Mon, 4 Jan 2016 17:32:26 +0100 Subject: gpu: ipu-v3: Do not bail out on missing optional port nodes The port nodes are documented as optional, treat them accordingly. 
Reported-by: Martin Fuzzey Reported-by: Chris Healy Signed-off-by: Philipp Zabel Fixes: 304e6be652e2 ("gpu: ipu-v3: Assign of_node of child platform devices to corresponding ports") diff --git a/drivers/gpu/ipu-v3/ipu-common.c b/drivers/gpu/ipu-v3/ipu-common.c index f2e13eb..a0e28f3 100644 --- a/drivers/gpu/ipu-v3/ipu-common.c +++ b/drivers/gpu/ipu-v3/ipu-common.c @@ -1050,6 +1050,17 @@ static int ipu_add_client_devices(struct ipu_soc *ipu, unsigned long ipu_base) for (i = 0; i < ARRAY_SIZE(client_reg); i++) { const struct ipu_platform_reg *reg = &client_reg[i]; struct platform_device *pdev; + struct device_node *of_node; + + /* Associate subdevice with the corresponding port node */ + of_node = of_graph_get_port_by_id(dev->of_node, i); + if (!of_node) { + dev_info(dev, + "no port@%d node in %s, not using %s%d\n", + i, dev->of_node->full_name, + (i / 2) ? "DI" : "CSI", i % 2); + continue; + } pdev = platform_device_alloc(reg->name, id++); if (!pdev) { @@ -1057,17 +1068,9 @@ static int ipu_add_client_devices(struct ipu_soc *ipu, unsigned long ipu_base) goto err_register; } + pdev->dev.of_node = of_node; pdev->dev.parent = dev; - /* Associate subdevice with the corresponding port node */ - pdev->dev.of_node = of_graph_get_port_by_id(dev->of_node, i); - if (!pdev->dev.of_node) { - dev_err(dev, "missing port@%d node in %s\n", i, - dev->of_node->full_name); - ret = -ENODEV; - goto err_register; - } - ret = platform_device_add_data(pdev, ®->pdata, sizeof(reg->pdata)); if (!ret) -- cgit v0.10.2 From 596a65d152fdc777ee7a2c7cbea9a7916350bed1 Mon Sep 17 00:00:00 2001 From: David Jander Date: Thu, 2 Jul 2015 16:21:57 +0200 Subject: gpu: ipu-v3: Reset IPU before activating IRQ If we don't come out of a clean reset, make sure no IRQ is fired before everything is setup by resetting the IPU before activating the interrupt handlers. Signed-off-by: David Jander Signed-off-by: Philipp Zabel diff --git a/drivers/gpu/ipu-v3/ipu-common.c b/drivers/gpu/ipu-v3/ipu-common.c index a0e28f3..e00db3f 100644 --- a/drivers/gpu/ipu-v3/ipu-common.c +++ b/drivers/gpu/ipu-v3/ipu-common.c @@ -1292,10 +1292,6 @@ static int ipu_probe(struct platform_device *pdev) ipu->irq_sync = irq_sync; ipu->irq_err = irq_err; - ret = ipu_irq_init(ipu); - if (ret) - goto out_failed_irq; - ret = device_reset(&pdev->dev); if (ret) { dev_err(&pdev->dev, "failed to reset: %d\n", ret); @@ -1305,6 +1301,10 @@ static int ipu_probe(struct platform_device *pdev) if (ret) goto out_failed_reset; + ret = ipu_irq_init(ipu); + if (ret) + goto out_failed_irq; + /* Set MCU_T to divide MCU access window into 2 */ ipu_cm_write(ipu, 0x00400000L | (IPU_MCU_T_DEFAULT << 18), IPU_DISP_GEN); @@ -1327,9 +1327,9 @@ static int ipu_probe(struct platform_device *pdev) failed_add_clients: ipu_submodules_exit(ipu); failed_submodules_init: -out_failed_reset: ipu_irq_exit(ipu); out_failed_irq: +out_failed_reset: clk_disable_unprepare(ipu->clk); return ret; } -- cgit v0.10.2 From 6c8b66ed0aaf888bfa2545796bf769c1c4593d58 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Tue, 3 Nov 2015 12:02:17 +0100 Subject: drm/imx: notify DRM core about CRTC vblank state Make sure the DRM core is aware that there will be no vblank interrupts incoming if the CRTC is disabled. That way the core will reject any attempts from userspace to wait on a vblank event on a disabled CRTC. 
Signed-off-by: Lucas Stach Signed-off-by: Philipp Zabel diff --git a/drivers/gpu/drm/imx/ipuv3-crtc.c b/drivers/gpu/drm/imx/ipuv3-crtc.c index 30a5718..2872263 100644 --- a/drivers/gpu/drm/imx/ipuv3-crtc.c +++ b/drivers/gpu/drm/imx/ipuv3-crtc.c @@ -64,6 +64,7 @@ static void ipu_fb_enable(struct ipu_crtc *ipu_crtc) /* Start DC channel and DI after IDMAC */ ipu_dc_enable_channel(ipu_crtc->dc); ipu_di_enable(ipu_crtc->di); + drm_crtc_vblank_on(&ipu_crtc->base); ipu_crtc->enabled = 1; } @@ -80,6 +81,7 @@ static void ipu_fb_disable(struct ipu_crtc *ipu_crtc) ipu_di_disable(ipu_crtc->di); ipu_plane_disable(ipu_crtc->plane[0]); ipu_dc_disable(ipu); + drm_crtc_vblank_off(&ipu_crtc->base); ipu_crtc->enabled = 0; } -- cgit v0.10.2 From 33bee520cbaee22118fb96ab500ba4cd4e8cb749 Mon Sep 17 00:00:00 2001 From: Enrico Jorns Date: Tue, 24 Nov 2015 16:29:22 +0100 Subject: drm/imx: Add missing DRM_FORMAT_RGB565 to ipu_plane_formats DRM_FORMAT_RGB565 is missing from ipu_plane_formats. The support is there, just need to make it available to userspace. Signed-off-by: Enrico Jorns Signed-off-by: Philipp Zabel diff --git a/drivers/gpu/drm/imx/ipuv3-plane.c b/drivers/gpu/drm/imx/ipuv3-plane.c index 591ba2f..26bb1b6 100644 --- a/drivers/gpu/drm/imx/ipuv3-plane.c +++ b/drivers/gpu/drm/imx/ipuv3-plane.c @@ -42,6 +42,7 @@ static const uint32_t ipu_plane_formats[] = { DRM_FORMAT_YVYU, DRM_FORMAT_YUV420, DRM_FORMAT_YVU420, + DRM_FORMAT_RGB565, }; int ipu_plane_irq(struct ipu_plane *ipu_plane) -- cgit v0.10.2 From f070d6715509dafc0af223577c896fe3d204ca88 Mon Sep 17 00:00:00 2001 From: Suman Tripathi Date: Sat, 6 Feb 2016 11:25:22 +0530 Subject: libahci: Implement the capability to override the generic ahci interrupt handler. This patch implements the capability to override the generic AHCI interrupt handler so that specific ahci drivers can implement their own custom interrupt handler routines. It also exports ahci_handle_port_intr so that custom irq_handler implementations can use it. tj: s/ahci_irq_handler/irq_handler/ and updated description. Signed-off-by: Suman Tripathi Signed-off-by: Tejun Heo diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h index a44c75d..cf48e3e 100644 --- a/drivers/ata/ahci.h +++ b/drivers/ata/ahci.h @@ -361,6 +361,7 @@ struct ahci_host_priv { * be overridden anytime before the host is activated. 
*/ void (*start_engine)(struct ata_port *ap); + irqreturn_t (*irq_handler)(int irq, void *dev_instance); }; #ifdef CONFIG_PCI_MSI @@ -424,6 +425,7 @@ int ahci_reset_em(struct ata_host *host); void ahci_print_info(struct ata_host *host, const char *scc_s); int ahci_host_activate(struct ata_host *host, struct scsi_host_template *sht); void ahci_error_handler(struct ata_port *ap); +u32 ahci_handle_port_intr(struct ata_host *host, u32 irq_masked); static inline void __iomem *__ahci_port_base(struct ata_host *host, unsigned int port_no) diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index 4029679..cbfe8a2 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -113,6 +113,9 @@ static ssize_t ahci_store_em_buffer(struct device *dev, const char *buf, size_t size); static ssize_t ahci_show_em_supported(struct device *dev, struct device_attribute *attr, char *buf); +static irqreturn_t ahci_single_edge_irq_intr(int irq, void *dev_instance); + +static irqreturn_t ahci_single_level_irq_intr(int irq, void *dev_instance); static DEVICE_ATTR(ahci_host_caps, S_IRUGO, ahci_show_host_caps, NULL); static DEVICE_ATTR(ahci_host_cap2, S_IRUGO, ahci_show_host_cap2, NULL); @@ -512,6 +515,11 @@ void ahci_save_initial_config(struct device *dev, struct ahci_host_priv *hpriv) if (!hpriv->start_engine) hpriv->start_engine = ahci_start_engine; + + if (!hpriv->irq_handler) + hpriv->irq_handler = (hpriv->flags & AHCI_HFLAG_EDGE_IRQ) ? + ahci_single_edge_irq_intr : + ahci_single_level_irq_intr; } EXPORT_SYMBOL_GPL(ahci_save_initial_config); @@ -1846,7 +1854,7 @@ static irqreturn_t ahci_multi_irqs_intr_hard(int irq, void *dev_instance) return IRQ_HANDLED; } -static u32 ahci_handle_port_intr(struct ata_host *host, u32 irq_masked) +u32 ahci_handle_port_intr(struct ata_host *host, u32 irq_masked) { unsigned int i, handled = 0; @@ -1872,6 +1880,7 @@ static u32 ahci_handle_port_intr(struct ata_host *host, u32 irq_masked) return handled; } +EXPORT_SYMBOL_GPL(ahci_handle_port_intr); static irqreturn_t ahci_single_edge_irq_intr(int irq, void *dev_instance) { @@ -2535,14 +2544,18 @@ int ahci_host_activate(struct ata_host *host, struct scsi_host_template *sht) int irq = hpriv->irq; int rc; - if (hpriv->flags & (AHCI_HFLAG_MULTI_MSI | AHCI_HFLAG_MULTI_MSIX)) + if (hpriv->flags & (AHCI_HFLAG_MULTI_MSI | AHCI_HFLAG_MULTI_MSIX)) { + if (hpriv->irq_handler) + dev_warn(host->dev, "both AHCI_HFLAG_MULTI_MSI flag set \ + and custom irq handler implemented\n"); + rc = ahci_host_activate_multi_irqs(host, sht); - else if (hpriv->flags & AHCI_HFLAG_EDGE_IRQ) - rc = ata_host_activate(host, irq, ahci_single_edge_irq_intr, - IRQF_SHARED, sht); - else - rc = ata_host_activate(host, irq, ahci_single_level_irq_intr, + } else { + rc = ata_host_activate(host, irq, hpriv->irq_handler, IRQF_SHARED, sht); + } + + return rc; } EXPORT_SYMBOL_GPL(ahci_host_activate); -- cgit v0.10.2 From d867b95f965457b9e85fb061ef8e3fdc029116ed Mon Sep 17 00:00:00 2001 From: Suman Tripathi Date: Sat, 6 Feb 2016 11:25:23 +0530 Subject: ata: Remove the AHCI_HFLAG_EDGE_IRQ support from libahci. The flexibility to override the irq handles in the LLD's are already present, so controllers implementing a edge trigger latch can implement their own interrupt handler inside the driver. This patch removes the AHCI_HFLAG_EDGE_IRQ support from libahci and moves edge irq handling to ahci_xgene. tj: Minor update to description. 
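For reference, a minimal sketch (hypothetical driver, not part of these patches) of the pattern the two changes above enable: an LLD supplies its own top-level handler and delegates per-port servicing to the exported ahci_handle_port_intr(). The ack ordering shown follows the generic level-triggered handler; an edge-latched controller would clear HOST_IRQ_STAT first, as the X-Gene driver does in the change below.

        #include <linux/interrupt.h>
        #include "ahci.h"

        static irqreturn_t example_ahci_irq_intr(int irq, void *dev_instance)
        {
                struct ata_host *host = dev_instance;
                struct ahci_host_priv *hpriv = host->private_data;
                u32 irq_stat, irq_masked;
                unsigned int handled;

                irq_stat = readl(hpriv->mmio + HOST_IRQ_STAT);
                if (!irq_stat)
                        return IRQ_NONE;

                irq_masked = irq_stat & hpriv->port_map;

                spin_lock(&host->lock);
                /* Let libahci service the individual port interrupts. */
                handled = ahci_handle_port_intr(host, irq_masked);
                /* Ack the host-level status after the ports (level-triggered case). */
                writel(irq_stat, hpriv->mmio + HOST_IRQ_STAT);
                spin_unlock(&host->lock);

                return IRQ_RETVAL(handled);
        }

The driver would then set hpriv->irq_handler = example_ahci_irq_intr in its probe path before calling ahci_host_activate(); if left NULL, libahci falls back to its generic single-level handler.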
Signed-off-by: Suman Tripathi Signed-off-by: Tejun Heo diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h index cf48e3e..167ba7e 100644 --- a/drivers/ata/ahci.h +++ b/drivers/ata/ahci.h @@ -240,8 +240,7 @@ enum { error-handling stage) */ AHCI_HFLAG_NO_DEVSLP = (1 << 17), /* no device sleep */ AHCI_HFLAG_NO_FBS = (1 << 18), /* no FBS */ - AHCI_HFLAG_EDGE_IRQ = (1 << 19), /* HOST_IRQ_STAT behaves as - Edge Triggered */ + #ifdef CONFIG_PCI_MSI AHCI_HFLAG_MULTI_MSI = (1 << 20), /* multiple PCI MSIs */ AHCI_HFLAG_MULTI_MSIX = (1 << 21), /* per-port MSI-X */ diff --git a/drivers/ata/ahci_xgene.c b/drivers/ata/ahci_xgene.c index e2c6d9e..8b8ccb6 100644 --- a/drivers/ata/ahci_xgene.c +++ b/drivers/ata/ahci_xgene.c @@ -548,6 +548,43 @@ softreset_retry: return rc; } +static irqreturn_t xgene_ahci_irq_intr(int irq, void *dev_instance) +{ + struct ata_host *host = dev_instance; + struct ahci_host_priv *hpriv; + unsigned int rc = 0; + void __iomem *mmio; + u32 irq_stat, irq_masked; + + VPRINTK("ENTER\n"); + + hpriv = host->private_data; + mmio = hpriv->mmio; + + /* sigh. 0xffffffff is a valid return from h/w */ + irq_stat = readl(mmio + HOST_IRQ_STAT); + if (!irq_stat) + return IRQ_NONE; + + irq_masked = irq_stat & hpriv->port_map; + + spin_lock(&host->lock); + + /* + * HOST_IRQ_STAT behaves as edge triggered latch meaning that + * it should be cleared before all the port events are cleared. + */ + writel(irq_stat, mmio + HOST_IRQ_STAT); + + rc = ahci_handle_port_intr(host, irq_masked); + + spin_unlock(&host->lock); + + VPRINTK("EXIT\n"); + + return IRQ_RETVAL(rc); +} + static struct ata_port_operations xgene_ahci_v1_ops = { .inherits = &ahci_ops, .host_stop = xgene_ahci_host_stop, @@ -779,7 +816,8 @@ skip_clk_phy: hpriv->flags = AHCI_HFLAG_NO_NCQ; break; case XGENE_AHCI_V2: - hpriv->flags |= AHCI_HFLAG_YES_FBS | AHCI_HFLAG_EDGE_IRQ; + hpriv->flags |= AHCI_HFLAG_YES_FBS; + hpriv->irq_handler = xgene_ahci_irq_intr; break; default: break; diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index cbfe8a2..513b3fa 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -113,8 +113,6 @@ static ssize_t ahci_store_em_buffer(struct device *dev, const char *buf, size_t size); static ssize_t ahci_show_em_supported(struct device *dev, struct device_attribute *attr, char *buf); -static irqreturn_t ahci_single_edge_irq_intr(int irq, void *dev_instance); - static irqreturn_t ahci_single_level_irq_intr(int irq, void *dev_instance); static DEVICE_ATTR(ahci_host_caps, S_IRUGO, ahci_show_host_caps, NULL); @@ -517,9 +515,7 @@ void ahci_save_initial_config(struct device *dev, struct ahci_host_priv *hpriv) hpriv->start_engine = ahci_start_engine; if (!hpriv->irq_handler) - hpriv->irq_handler = (hpriv->flags & AHCI_HFLAG_EDGE_IRQ) ? - ahci_single_edge_irq_intr : - ahci_single_level_irq_intr; + hpriv->irq_handler = ahci_single_level_irq_intr; } EXPORT_SYMBOL_GPL(ahci_save_initial_config); @@ -1882,43 +1878,6 @@ u32 ahci_handle_port_intr(struct ata_host *host, u32 irq_masked) } EXPORT_SYMBOL_GPL(ahci_handle_port_intr); -static irqreturn_t ahci_single_edge_irq_intr(int irq, void *dev_instance) -{ - struct ata_host *host = dev_instance; - struct ahci_host_priv *hpriv; - unsigned int rc = 0; - void __iomem *mmio; - u32 irq_stat, irq_masked; - - VPRINTK("ENTER\n"); - - hpriv = host->private_data; - mmio = hpriv->mmio; - - /* sigh. 
0xffffffff is a valid return from h/w */ - irq_stat = readl(mmio + HOST_IRQ_STAT); - if (!irq_stat) - return IRQ_NONE; - - irq_masked = irq_stat & hpriv->port_map; - - spin_lock(&host->lock); - - /* - * HOST_IRQ_STAT behaves as edge triggered latch meaning that - * it should be cleared before all the port events are cleared. - */ - writel(irq_stat, mmio + HOST_IRQ_STAT); - - rc = ahci_handle_port_intr(host, irq_masked); - - spin_unlock(&host->lock); - - VPRINTK("EXIT\n"); - - return IRQ_RETVAL(rc); -} - static irqreturn_t ahci_single_level_irq_intr(int irq, void *dev_instance) { struct ata_host *host = dev_instance; -- cgit v0.10.2 From 32aea2680de01f539d928112150279fdeeabca00 Mon Sep 17 00:00:00 2001 From: Suman Tripathi Date: Sat, 6 Feb 2016 11:25:24 +0530 Subject: ahci_xgene: Implement the workaround to fix the missing of the edge interrupt for the HOST_IRQ_STAT. Due to H/W errata, the HOST_IRQ_STAT register misses the edge interrupt when clearing the HOST_IRQ_STAT register and hardware reporting the PORT_IRQ_STAT register happens to be at the same clock cycle. Signed-off-by: Suman Tripathi Signed-off-by: Tejun Heo diff --git a/drivers/ata/ahci_xgene.c b/drivers/ata/ahci_xgene.c index 8b8ccb6..8e3f7fa 100644 --- a/drivers/ata/ahci_xgene.c +++ b/drivers/ata/ahci_xgene.c @@ -548,6 +548,51 @@ softreset_retry: return rc; } +/** + * xgene_ahci_handle_broken_edge_irq - Handle the broken irq. + * @ata_host: Host that recieved the irq + * @irq_masked: HOST_IRQ_STAT value + * + * For hardware with broken edge trigger latch + * the HOST_IRQ_STAT register misses the edge interrupt + * when clearing of HOST_IRQ_STAT register and hardware + * reporting the PORT_IRQ_STAT register at the + * same clock cycle. + * As such, the algorithm below outlines the workaround. + * + * 1. Read HOST_IRQ_STAT register and save the state. + * 2. Clear the HOST_IRQ_STAT register. + * 3. Read back the HOST_IRQ_STAT register. + * 4. If HOST_IRQ_STAT register equals to zero, then + * traverse the rest of port's PORT_IRQ_STAT register + * to check if an interrupt is triggered at that point else + * go to step 6. + * 5. If PORT_IRQ_STAT register of rest ports is not equal to zero + * then update the state of HOST_IRQ_STAT saved in step 1. + * 6. Handle port interrupts. + * 7. Exit + */ +static int xgene_ahci_handle_broken_edge_irq(struct ata_host *host, + u32 irq_masked) +{ + struct ahci_host_priv *hpriv = host->private_data; + void __iomem *port_mmio; + int i; + + if (!readl(hpriv->mmio + HOST_IRQ_STAT)) { + for (i = 0; i < host->n_ports; i++) { + if (irq_masked & (1 << i)) + continue; + + port_mmio = ahci_port_base(host->ports[i]); + if (readl(port_mmio + PORT_IRQ_STAT)) + irq_masked |= (1 << i); + } + } + + return ahci_handle_port_intr(host, irq_masked); +} + static irqreturn_t xgene_ahci_irq_intr(int irq, void *dev_instance) { struct ata_host *host = dev_instance; @@ -576,7 +621,7 @@ static irqreturn_t xgene_ahci_irq_intr(int irq, void *dev_instance) */ writel(irq_stat, mmio + HOST_IRQ_STAT); - rc = ahci_handle_port_intr(host, irq_masked); + rc = xgene_ahci_handle_broken_edge_irq(host, irq_masked); spin_unlock(&host->lock); -- cgit v0.10.2 From 287e6611ab1eac76c2c5ebf6e345e04c80ca9c61 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 11 Feb 2016 14:16:27 +0100 Subject: libata: fix HDIO_GET_32BIT ioctl As reported by Soohoon Lee, the HDIO_GET_32BIT ioctl does not work correctly in compat mode with libata. 
I have investigated the issue further and found multiple problems that all appeared with the same commit that originally introduced HDIO_GET_32BIT handling in libata back in linux-2.6.8 and presumably also linux-2.4, as the code uses "copy_to_user(arg, &val, 1)" to copy a 'long' variable containing either 0 or 1 to user space. The problems with this are: * On big-endian machines, this will always write a zero because it stores the wrong byte into user space. * In compat mode, the upper three bytes of the variable are updated by the compat_hdio_ioctl() function, but they now contain uninitialized stack data. * The hdparm tool calling this ioctl uses a 'static long' variable to store the result. This means at least the upper bytes are initialized to zero, but calling another ioctl like HDIO_GET_MULTCOUNT would fill them with data that remains stale when the low byte is overwritten. Fortunately libata doesn't implement any of the affected ioctl commands, so this would only happen when we query both an IDE and an ATA device in the same command such as "hdparm -N -c /dev/hda /dev/sda" * The libata code for unknown reasons started using ATA_IOC_GET_IO32 and ATA_IOC_SET_IO32 as aliases for HDIO_GET_32BIT and HDIO_SET_32BIT, while the ioctl commands that were added later use the normal HDIO_* names. This is harmless but rather confusing. This addresses all four issues by changing the code to use put_user() on an 'unsigned long' variable in HDIO_GET_32BIT, like the IDE subsystem does, and by clarifying the names of the ioctl commands. Signed-off-by: Arnd Bergmann Reported-by: Soohoon Lee Tested-by: Soohoon Lee Cc: stable@vger.kernel.org Signed-off-by: Tejun Heo diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 7e959f9..e417e1a 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -675,19 +675,18 @@ static int ata_ioc32(struct ata_port *ap) int ata_sas_scsi_ioctl(struct ata_port *ap, struct scsi_device *scsidev, int cmd, void __user *arg) { - int val = -EINVAL, rc = -EINVAL; + unsigned long val; + int rc = -EINVAL; unsigned long flags; switch (cmd) { - case ATA_IOC_GET_IO32: + case HDIO_GET_32BIT: spin_lock_irqsave(ap->lock, flags); val = ata_ioc32(ap); spin_unlock_irqrestore(ap->lock, flags); - if (copy_to_user(arg, &val, 1)) - return -EFAULT; - return 0; + return put_user(val, (unsigned long __user *)arg); - case ATA_IOC_SET_IO32: + case HDIO_SET_32BIT: val = (unsigned long) arg; rc = 0; spin_lock_irqsave(ap->lock, flags); diff --git a/include/linux/ata.h b/include/linux/ata.h index d2992bf..c1a2f34 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -487,8 +487,8 @@ enum ata_tf_protocols { }; enum ata_ioctls { - ATA_IOC_GET_IO32 = 0x309, - ATA_IOC_SET_IO32 = 0x324, + ATA_IOC_GET_IO32 = 0x309, /* HDIO_GET_32BIT */ + ATA_IOC_SET_IO32 = 0x324, /* HDIO_SET_32BIT */ }; /* core structures */ -- cgit v0.10.2 From c7ac24178c50a01f14eebcddf5c7b7a2e54676cc Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Thu, 11 Feb 2016 02:51:17 +0900 Subject: perf config: Add '--system' and '--user' options to select which config file is used The '--system' option means $(sysconfdir)/perfconfig and '--user' means $HOME/.perfconfig. If none is used, both system and user config file are read. 
E.g.: # perf config [] [options] With an specific config file: # perf config --user | --system or both user and system config file: # perf config Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1455126685-32367-2-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index c7158bf..15949e2 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -8,7 +8,7 @@ perf-config - Get and set variables in a configuration file. SYNOPSIS -------- [verse] -'perf config' -l | --list +'perf config' [] -l | --list DESCRIPTION ----------- @@ -21,6 +21,14 @@ OPTIONS --list:: Show current config variables, name and value, for all sections. +--user:: + For writing and reading options: write to user + '$HOME/.perfconfig' file or read it. + +--system:: + For writing and reading options: write to system-wide + '$(sysconfdir)/perfconfig' or read it. + CONFIGURATION FILE ------------------ @@ -30,6 +38,10 @@ The '$HOME/.perfconfig' file is used to store a per-user configuration. The file '$(sysconfdir)/perfconfig' can be used to store a system-wide default configuration. +When reading or writing, the values are read from the system and user +configuration files by default, and options '--system' and '--user' +can be used to tell the command to read from or write to only that location. + Syntax ~~~~~~ diff --git a/tools/perf/builtin-config.c b/tools/perf/builtin-config.c index f04e804..c42448e 100644 --- a/tools/perf/builtin-config.c +++ b/tools/perf/builtin-config.c @@ -13,8 +13,10 @@ #include "util/util.h" #include "util/debug.h" +static bool use_system_config, use_user_config; + static const char * const config_usage[] = { - "perf config [options]", + "perf config [] [options]", NULL }; @@ -25,6 +27,8 @@ enum actions { static struct option config_options[] = { OPT_SET_UINT('l', "list", &actions, "show current config variables", ACTION_LIST), + OPT_BOOLEAN(0, "system", &use_system_config, "use system config file"), + OPT_BOOLEAN(0, "user", &use_user_config, "use user config file"), OPT_END() }; @@ -42,10 +46,23 @@ static int show_config(const char *key, const char *value, int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused) { int ret = 0; + char *user_config = mkpath("%s/.perfconfig", getenv("HOME")); argc = parse_options(argc, argv, config_options, config_usage, PARSE_OPT_STOP_AT_NON_OPTION); + if (use_system_config && use_user_config) { + pr_err("Error: only one config file at a time\n"); + parse_options_usage(config_usage, config_options, "user", 0); + parse_options_usage(NULL, config_options, "system", 0); + return -1; + } + + if (use_system_config) + config_exclusive_filename = perf_etc_perfconfig(); + else if (use_user_config) + config_exclusive_filename = user_config; + switch (actions) { case ACTION_LIST: if (argc) { @@ -53,9 +70,13 @@ int cmd_config(int argc, const char **argv, const char *prefix __maybe_unused) parse_options_usage(config_usage, config_options, "l", 1); } else { ret = perf_config(show_config, NULL); - if (ret < 0) + if (ret < 0) { + const char * config_filename = config_exclusive_filename; + if (!config_exclusive_filename) + config_filename = user_config; pr_err("Nothing configured, " - "please check your ~/.perfconfig file\n"); + "please check your %s \n", config_filename); + } } break; default: diff --git a/tools/perf/util/cache.h 
b/tools/perf/util/cache.h index 07b5d63..3ca453f 100644 --- a/tools/perf/util/cache.h +++ b/tools/perf/util/cache.h @@ -23,6 +23,8 @@ #define PERF_TRACEFS_ENVIRONMENT "PERF_TRACEFS_DIR" #define PERF_PAGER_ENVIRONMENT "PERF_PAGER" +extern const char *config_exclusive_filename; + typedef int (*config_fn_t)(const char *, const char *, void *); extern int perf_default_config(const char *, const char *, void *); extern int perf_config(config_fn_t fn, void *); @@ -31,6 +33,7 @@ extern u64 perf_config_u64(const char *, const char *); extern int perf_config_bool(const char *, const char *); extern int config_error_nonbool(const char *); extern const char *perf_config_dirname(const char *, const char *); +extern const char *perf_etc_perfconfig(void); char *alias_lookup(const char *alias); int split_cmdline(char *cmdline, const char ***argv); diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index d3e12e3..4e72763 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -26,7 +26,7 @@ static const char *config_file_name; static int config_linenr; static int config_file_eof; -static const char *config_exclusive_filename; +const char *config_exclusive_filename; static int get_next_char(void) { @@ -434,7 +434,7 @@ static int perf_config_from_file(config_fn_t fn, const char *filename, void *dat return ret; } -static const char *perf_etc_perfconfig(void) +const char *perf_etc_perfconfig(void) { static const char *system_wide; if (!system_wide) -- cgit v0.10.2 From e7ee404757609067c8f261d90251f1e96459c535 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 5 Feb 2016 14:01:27 +0000 Subject: perf symbols: Fix symbols searching for module in buildid-cache Before this patch, if a sample is triggered inside a module not in /lib/modules/`uname -r`/, even if the module is in buildid-cache, 'perf report' will still be unable to find the correct symbol. For example: # rm -rf ~/.debug/ # perf buildid-cache -a ./mymodule.ko # perf probe -m ./mymodule.ko -a get_mymodule_val Added new event: probe:get_mymodule_val (on get_mymodule_val in mymodule) You can now use it in all perf tools, such as: perf record -e probe:get_mymodule_val -aR sleep 1 # perf record -e probe:get_mymodule_val cat /proc/mymodule mymodule:3 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.011 MB perf.data (1 samples) ] # perf report --stdio [SNIP] # # Overhead Command Shared Object Symbol # ........ ....... ................ ...................... # 100.00% cat [mymodule] [k] 0x0000000000000001 # perf report -vvvv --stdio dso__load_sym: adjusting symbol: st_value: 0 sh_addr: 0 sh_offset: 0x70 symbol__new: get_mymodule_val 0x70-0x8a [SNIP] This is caused by dso__load() -> dso__load_sym(). In dso__load(), kmod is true only when its file is found in some well know directories. All files loaded from buildid-cache are treated as user programs. Following dso__load_sym() set map->pgoff incorrectly. This patch gives kernel modules in buildid-cache a chance to adjust value of kmod. After dso__load() get the type of symbols, if it is buildid, check the last 3 chars of original filename against '.ko', and adjust the value of kmod if the file is a kernel module. 
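For reference, the suffix test can be exercised in isolation with a sketch along the following lines; link_target_is_kmod() is an illustrative stand-in that only mimics the final check performed by the new dso__build_id_is_kmod() helper, not the helper itself.

  #include <stdbool.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  /* Resolve a build-id cache symlink and test whether the last path
   * component before the build-id hash ends in ".ko" (illustrative
   * stand-in, not the perf helper). */
  static bool link_target_is_kmod(const char *id_name)
  {
      char buf[4096];
      ssize_t len;
      char *last;

      len = readlink(id_name, buf, sizeof(buf) - 1);
      if (len < 0)
          return false;
      buf[len] = '\0';

      /* e.g. ../../lib/modules/4.4.0-rc4/kernel/.../nf_nat_ipv4.ko/<build-id> */
      last = strrchr(buf, '/');
      if (!last || last - buf < 3)
          return false;

      return strncmp(last - 3, ".ko", 3) == 0;
  }

  int main(int argc, char **argv)
  {
      if (argc > 1)
          printf("%s: %s\n", argv[1],
                 link_target_is_kmod(argv[1]) ? "kernel module" : "not a module");
      return 0;
  }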
Signed-off-by: Wang Nan Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Cody P Schafer Cc: He Kuang Cc: Jeremie Galarneau Cc: Jiri Olsa Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1454680939-24963-3-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index b28100e..f1479ee 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -166,6 +166,50 @@ char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size) return build_id__filename(build_id_hex, bf, size); } +bool dso__build_id_is_kmod(const struct dso *dso, char *bf, size_t size) +{ + char *id_name, *ch; + struct stat sb; + + id_name = dso__build_id_filename(dso, bf, size); + if (!id_name) + goto err; + if (access(id_name, F_OK)) + goto err; + if (lstat(id_name, &sb) == -1) + goto err; + if ((size_t)sb.st_size > size - 1) + goto err; + if (readlink(id_name, bf, size - 1) < 0) + goto err; + + bf[sb.st_size] = '\0'; + + /* + * link should be: + * ../../lib/modules/4.4.0-rc4/kernel/net/ipv4/netfilter/nf_nat_ipv4.ko/a09fe3eb3147dafa4e3b31dbd6257e4d696bdc92 + */ + ch = strrchr(bf, '/'); + if (!ch) + goto err; + if (ch - 3 < bf) + goto err; + + return strncmp(".ko", ch - 3, 3) == 0; +err: + /* + * If dso__build_id_filename work, get id_name again, + * because id_name points to bf and is broken. + */ + if (id_name) + id_name = dso__build_id_filename(dso, bf, size); + pr_err("Invalid build id: %s\n", id_name ? : + dso->long_name ? : + dso->short_name ? : + "[unknown]"); + return false; +} + #define dsos__for_each_with_build_id(pos, head) \ list_for_each_entry(pos, head, node) \ if (!pos->has_build_id) \ diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h index 27a14a8..64af3e2 100644 --- a/tools/perf/util/build-id.h +++ b/tools/perf/util/build-id.h @@ -16,6 +16,7 @@ int sysfs__sprintf_build_id(const char *root_dir, char *sbuild_id); int filename__sprintf_build_id(const char *pathname, char *sbuild_id); char *dso__build_id_filename(const struct dso *dso, char *bf, size_t size); +bool dso__build_id_is_kmod(const struct dso *dso, char *bf, size_t size); int build_id__mark_dso_hit(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel, diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 90cedfa..e7588dc 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1529,6 +1529,10 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter) if (!runtime_ss && syms_ss) runtime_ss = syms_ss; + if (syms_ss && syms_ss->type == DSO_BINARY_TYPE__BUILD_ID_CACHE) + if (dso__build_id_is_kmod(dso, name, PATH_MAX)) + kmod = true; + if (syms_ss) ret = dso__load_sym(dso, map, syms_ss, runtime_ss, filter, kmod); else -- cgit v0.10.2 From 37b4e2020a5f4dbecf22ee3efe92de6dbea1c5f0 Mon Sep 17 00:00:00 2001 From: Zubair Lutfullah Kakakhel Date: Tue, 9 Feb 2016 13:33:38 +0000 Subject: perf build: Add EXTRA_LDFLAGS option to makefile To compile for little-endian systems, you need to pass -EL to CC and LD. EXTRA_CFLAGS works to pass -EL to CC. Add EXTRA_LDFLAGS to pass -EL to LD. 
Signed-off-by: Zubair Lutfullah Kakakhel Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1455024818-15842-1-git-send-email-Zubair.Kakakhel@imgtec.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index d404117..4a4fad4 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -139,6 +139,8 @@ $(call allow-override,CC,$(CROSS_COMPILE)gcc) $(call allow-override,AR,$(CROSS_COMPILE)ar) $(call allow-override,LD,$(CROSS_COMPILE)ld) +LD += $(EXTRA_LDFLAGS) + PKG_CONFIG = $(CROSS_COMPILE)pkg-config RM = rm -f -- cgit v0.10.2 From b416e204f88dd91d9e99f6deee3d57fbc90aee40 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Tue, 9 Feb 2016 20:53:10 +0900 Subject: perf python scripting: Append examples to err msg about audit-libs-python To print syscall names, the audit-libs-python package is required.. If not installed, it prints this error string: # perf script syscall-counts Install the audit-libs-python package to get syscall names. But the package name is different in Ubuntu, mention that in the error message, similar to a error message of util/trace-event-scripting.c: # perf script syscall-counts Install the audit-libs-python package to get syscall names. For example: # apt-get install python-audit (Ubuntu) # yum install audit-libs-python (Fedora) etc. Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1455018790-13425-1-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py index 15c8400..1d95009 100644 --- a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py +++ b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py @@ -71,7 +71,10 @@ try: except: if not audit_package_warned: audit_package_warned = True - print "Install the audit-libs-python package to get syscall names" + print "Install the audit-libs-python package to get syscall names.\n" \ + "For example:\n # apt-get install python-audit (Ubuntu)" \ + "\n # yum install audit-libs-python (Fedora)" \ + "\n etc.\n" def syscall_name(id): try: -- cgit v0.10.2 From 37d9bb580aa73c171c51fb93edf67a902bcb186f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 12 Feb 2016 11:27:51 -0300 Subject: perf tools: Add comment explaining the repsep_snprintf function Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-4j67nvlfwbnkg85b969ewnkr@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index de620f7..8b54ede 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -28,7 +28,15 @@ int sort__has_socket = 0; int sort__has_thread = 0; enum sort_mode sort__mode = SORT_MODE__NORMAL; - +/* + * Replaces all occurrences of a char used with the: + * + * -t, --field-separator + * + * option, that uses a special separator character and don't pad with spaces, + * replacing all occurances of this separator in symbol names (and other + * output) with a '.' character, that thus it's the only non valid separator. +*/ static int repsep_snprintf(char *bf, size_t size, const char *fmt, ...) 
{ int n; -- cgit v0.10.2 From 89fee70943232d73e3cc328634e0da253b6de9b5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 11 Feb 2016 17:14:13 -0300 Subject: perf hists: Do column alignment on the format iterator We were doing column alignment in the format function for each cell, returning a string padded with spaces so that when the next column is printed the cursor is at its column alignment. This ends up needlessly printing trailing spaces, do it at the format iterator, that is where we know if it is needed, i.e. if there is more columns to be printed. This eliminates the need for triming lines when doing a dump using 'P' in the TUI browser and also produces far saner results with things like piping 'perf report' to 'less'. Right now only the formatters for sym->name and the 'locked' column (perf mem report), that are the ones that end up at the end of lines in the default 'perf report', 'perf top' and 'perf mem report' tools, the others will be done in a subsequent patch. In the end the 'width' parameter for the formatters now mean, in 'printf' terms, the 'precision', where before it was the field 'width'. Reported-by: Dave Jones Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/n/tip-s7iwl2gj23w92l6tibnrcqzr@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index a5a5390..1819771 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1061,7 +1061,6 @@ static int hist_browser__show_entry(struct hist_browser *browser, struct hist_entry *entry, unsigned short row) { - char s[256]; int printed = 0; int width = browser->b.width; char folded_sign = ' '; @@ -1086,16 +1085,18 @@ static int hist_browser__show_entry(struct hist_browser *browser, .folded_sign = folded_sign, .current_entry = current_entry, }; - struct perf_hpp hpp = { - .buf = s, - .size = sizeof(s), - .ptr = &arg, - }; int column = 0; hist_browser__gotorc(browser, row, 0); hists__for_each_format(browser->hists, fmt) { + char s[2048]; + struct perf_hpp hpp = { + .buf = s, + .size = sizeof(s), + .ptr = &arg, + }; + if (perf_hpp__should_skip(fmt, entry->hists) || column++ < browser->b.horiz_scroll) continue; @@ -1120,11 +1121,18 @@ static int hist_browser__show_entry(struct hist_browser *browser, } if (fmt->color) { - width -= fmt->color(fmt, &hpp, entry); + int ret = fmt->color(fmt, &hpp, entry); + hist_entry__snprintf_alignment(entry, &hpp, fmt, ret); + /* + * fmt->color() already used ui_browser to + * print the non alignment bits, skip it (+ret): + */ + ui_browser__printf(&browser->b, "%s", s + ret); } else { - width -= fmt->entry(fmt, &hpp, entry); + hist_entry__snprintf_alignment(entry, &hpp, fmt, fmt->entry(fmt, &hpp, entry)); ui_browser__printf(&browser->b, "%s", s); } + width -= hpp.buf - s; } /* The scroll bar isn't being used */ @@ -1452,9 +1460,10 @@ static int hist_browser__fprintf_entry(struct hist_browser *browser, first = false; ret = fmt->entry(fmt, &hpp, he); + ret = hist_entry__snprintf_alignment(he, &hpp, fmt, ret); advance_hpp(&hpp, ret); } - printed += fprintf(fp, "%s\n", rtrim(s)); + printed += fprintf(fp, "%s\n", s); if (folded_sign == '-') printed += hist_browser__fprintf_callchain(browser, he, fp); diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 1a6e8f7..87b022f 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -403,6 +403,7 @@ static int hist_entry__snprintf(struct hist_entry *he, struct perf_hpp *hpp) else ret = 
fmt->entry(fmt, hpp, he); + ret = hist_entry__snprintf_alignment(he, hpp, fmt, ret); advance_hpp(hpp, ret); } diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 12f2d79..561e947 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1015,6 +1015,27 @@ void hist_entry__delete(struct hist_entry *he) } /* + * If this is not the last column, then we need to pad it according to the + * pre-calculated max lenght for this column, otherwise don't bother adding + * spaces because that would break viewing this with, for instance, 'less', + * that would show tons of trailing spaces when a long C++ demangled method + * names is sampled. +*/ +int hist_entry__snprintf_alignment(struct hist_entry *he, struct perf_hpp *hpp, + struct perf_hpp_fmt *fmt, int printed) +{ + if (!list_is_last(&fmt->list, &he->hists->hpp_list->fields)) { + const int width = fmt->width(fmt, hpp, hists_to_evsel(he->hists)); + if (printed < width) { + advance_hpp(hpp, printed); + printed = scnprintf(hpp->buf, hpp->size, "%-*s", width - printed, " "); + } + } + + return printed; +} + +/* * collapse the histogram */ diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 1c7544a..840b6d6 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -122,11 +122,16 @@ struct hist_entry *__hists__add_entry(struct hists *hists, int hist_entry_iter__add(struct hist_entry_iter *iter, struct addr_location *al, int max_stack_depth, void *arg); +struct perf_hpp; +struct perf_hpp_fmt; + int64_t hist_entry__cmp(struct hist_entry *left, struct hist_entry *right); int64_t hist_entry__collapse(struct hist_entry *left, struct hist_entry *right); int hist_entry__transaction_len(void); int hist_entry__sort_snprintf(struct hist_entry *he, char *bf, size_t size, struct hists *hists); +int hist_entry__snprintf_alignment(struct hist_entry *he, struct perf_hpp *hpp, + struct perf_hpp_fmt *fmt, int printed); void hist_entry__delete(struct hist_entry *he); void perf_evsel__output_resort(struct perf_evsel *evsel, struct ui_progress *prog); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 8b54ede..de71575 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -255,10 +255,8 @@ static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym, ret += repsep_snprintf(bf + ret, size - ret, "%s", sym->name); ret += repsep_snprintf(bf + ret, size - ret, "+0x%llx", ip - map->unmap_ip(map, sym->start)); - ret += repsep_snprintf(bf + ret, size - ret, "%-*s", - width - ret, ""); } else { - ret += repsep_snprintf(bf + ret, size - ret, "%-*s", + ret += repsep_snprintf(bf + ret, size - ret, "%.*s", width - ret, sym->name); } @@ -266,14 +264,9 @@ static int _hist_entry__sym_snprintf(struct map *map, struct symbol *sym, size_t len = BITS_PER_LONG / 4; ret += repsep_snprintf(bf + ret, size - ret, "%-#.*llx", len, ip); - ret += repsep_snprintf(bf + ret, size - ret, "%-*s", - width - ret, ""); } - if (ret > width) - bf[width] = '\0'; - - return width; + return ret; } static int hist_entry__sym_snprintf(struct hist_entry *he, char *bf, @@ -819,7 +812,7 @@ static int hist_entry__locked_snprintf(struct hist_entry *he, char *bf, else out = "No"; - return repsep_snprintf(bf, size, "%-*s", width, out); + return repsep_snprintf(bf, size, "%.*s", width, out); } static int64_t -- cgit v0.10.2 From 38714fbd299f992080a11b977f609136ec8530c2 Mon Sep 17 00:00:00 2001 From: Han Xu Date: Fri, 12 Feb 2016 05:13:25 +0000 Subject: MAINTAINERS: update Han's email Update my email address from freescale to nxp. 
Signed-off-by: Han Xu Signed-off-by: Brian Norris diff --git a/MAINTAINERS b/MAINTAINERS index 30aca4a..ce4a0d6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4538,7 +4538,7 @@ F: include/linux/platform_data/video-imxfb.h F: drivers/video/fbdev/imxfb.c FREESCALE QUAD SPI DRIVER -M: Han Xu +M: Han Xu L: linux-mtd@lists.infradead.org S: Maintained F: drivers/mtd/spi-nor/fsl-quadspi.c -- cgit v0.10.2 From a8adfceb389a0045e06af22517fa3326797b160a Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 12 Feb 2016 16:31:23 -0300 Subject: perf tools: Unlink entries from terms list We were just freeing them, better unlink and init its nodes to catch bugs faster if we keep dangling references to them. Signed-off-by: Wang Nan Acked-by: Jiri Olsa Cc: Alexei Starovoitov Cc: He Kuang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com [ Spun off from another patch, use list_del_init() instead of list_del() ] Link: http://lkml.kernel.org/r/1454680939-24963-2-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 813d9b2..133c8d2 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2072,8 +2072,10 @@ void parse_events__free_terms(struct list_head *terms) { struct parse_events_term *term, *h; - list_for_each_entry_safe(term, h, terms, list) + list_for_each_entry_safe(term, h, terms, list) { + list_del_init(&term->list); free(term); + } } void parse_events_evlist_error(struct parse_events_evlist *data, -- cgit v0.10.2 From fc0a2c1d59beac70b8738f4ce14431b798837374 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 12 Feb 2016 16:43:02 -0300 Subject: perf tools: Introduce parse_events_terms__purge() Purges 'struct parse_event_term' entries from a list_head. Some users need this because they don't allocate space for the list head, it maybe on the stack or embedded into some other struct. Next patch will convert users that need just purging and then the perf_events__free_terms() routine will free the list head as well, finally being renamed to perf_events_terms__delete(). 
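The distinction can be pictured with a small userspace sketch; the types and function below are illustrative only, not the real list_head machinery. Purging frees the entries but leaves the head itself untouched and reusable, which is exactly what callers with an on-stack or embedded head need.

  #include <stdlib.h>

  struct term {
      struct term *next;
      int val;
  };

  struct term_list {
      struct term *first;     /* the "head"; may live on the stack */
  };

  /* Illustrative purge: free the entries, never the head. */
  static void terms__purge(struct term_list *head)
  {
      struct term *t = head->first, *next;

      while (t) {
          next = t->next;
          free(t);
          t = next;
      }
      head->first = NULL;     /* head stays valid and reusable */
  }

  int main(void)
  {
      struct term_list on_stack = { .first = NULL };  /* head not malloc()ed */
      struct term *t = malloc(sizeof(*t));

      if (t) {
          t->next = NULL;
          t->val = 42;
          on_stack.first = t;
      }

      terms__purge(&on_stack);    /* frees the entry; must not free(&on_stack) */
      return 0;
  }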
Acked-by: Jiri Olsa Cc: Alexei Starovoitov Cc: He Kuang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Wang Nan Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/n/tip-4w3zl4ifcl0ed0j4bu3tckqp@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 133c8d2..668afdc 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2068,7 +2068,7 @@ int parse_events_term__clone(struct parse_events_term **new, term->err_term, term->err_val); } -void parse_events__free_terms(struct list_head *terms) +void parse_events_terms__purge(struct list_head *terms) { struct parse_events_term *term, *h; @@ -2078,6 +2078,11 @@ void parse_events__free_terms(struct list_head *terms) } } +void parse_events__free_terms(struct list_head *terms) +{ + parse_events_terms__purge(terms); +} + void parse_events_evlist_error(struct parse_events_evlist *data, int idx, const char *str) { diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index f1a6db1..f90a04c 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -116,6 +116,7 @@ int parse_events_term__sym_hw(struct parse_events_term **term, int parse_events_term__clone(struct parse_events_term **new, struct parse_events_term *term); void parse_events__free_terms(struct list_head *terms); +void parse_events_terms__purge(struct list_head *terms); int parse_events__modifier_event(struct list_head *list, char *str, bool add); int parse_events__modifier_group(struct list_head *list, char *event_mod); int parse_events_name(struct list_head *list, char *name); -- cgit v0.10.2 From 682dc24c2a0f13d5a16ac8f4303671eb8f11519f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 12 Feb 2016 16:48:00 -0300 Subject: perf tools: Use perf_event_terms__purge() for non-malloced terms In these two cases, a 'perf test' entry and in the PMU code the list_head is on the stack, so we can't use perf_event__free_terms() (soon to be renamed to perf_event_terms__delete()), because it will free the list_head as well. Acked-by: Jiri Olsa Cc: Alexei Starovoitov Cc: He Kuang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Wang Nan Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/n/tip-i956ryjhz97gnnqe8iqe7m7s@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index abe8849..6648274 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -1666,7 +1666,7 @@ static int test_term(struct terms_test *t) } ret = t->check(&terms); - parse_events__free_terms(&terms); + parse_events_terms__purge(&terms); return ret; } diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 41a9c87..cf59fba 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -354,7 +354,7 @@ static int pmu_alias_terms(struct perf_pmu_alias *alias, list_for_each_entry(term, &alias->terms, list) { ret = parse_events_term__clone(&cloned, term); if (ret) { - parse_events__free_terms(&list); + parse_events_terms__purge(&list); return ret; } list_add_tail(&cloned->list, &list); -- cgit v0.10.2 From d20a5f2b277b2f46548fb60f2bb95ad9a601d3fe Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 12 Feb 2016 17:01:17 -0300 Subject: perf tools: Free the terms list_head in parse_events__free_terms() Fixing a leak, since code calling parse_events__free_terms() expect it to free the list_head too. 
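Building on the sketch above, the delete variant for heads that were malloc()ed simply purges the entries and then frees the head as well; again the name is illustrative, not the perf function.

  /* Illustrative delete: purge the entries, then free the head itself.
   * Accepting NULL mirrors the free()-style convention. */
  static void terms__delete(struct term_list *head)
  {
      if (!head)
          return;
      terms__purge(head);
      free(head);
  }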
Signed-off-by: Wang Nan Acked-by: Jiri Olsa Cc: Alexei Starovoitov Cc: He Kuang Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com [ Spun off from another patch ] Link: http://lkml.kernel.org/r/1454680939-24963-2-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 668afdc..d1b49ec 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2081,6 +2081,7 @@ void parse_events_terms__purge(struct list_head *terms) void parse_events__free_terms(struct list_head *terms) { parse_events_terms__purge(terms); + free(terms); } void parse_events_evlist_error(struct parse_events_evlist *data, -- cgit v0.10.2 From 2146afc6b45b3f1b967f5605d4e6d97dd9e31061 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 12 Feb 2016 17:09:17 -0300 Subject: perf tools: Rename parse_events__free_terms() to parse_events_terms__delete() To follow convention used in other tools/perf/ areas. Also remove the need to check if it is NULL before calling the destructor, again, to follow convention that goes back to free(). Cc: Alexei Starovoitov Cc: He Kuang cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Wang Nan Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/n/tip-w6owu7rb8a46gvunlinxaqwx@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c index 6f7d453..a339517 100644 --- a/tools/perf/arch/x86/util/intel-pt.c +++ b/tools/perf/arch/x86/util/intel-pt.c @@ -89,7 +89,7 @@ static int intel_pt_parse_terms_with_default(struct list_head *formats, *config = attr.config; out_free: - parse_events__free_terms(terms); + parse_events_terms__delete(terms); return err; } diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index d1b49ec..e5583fd 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1386,8 +1386,7 @@ int parse_events_terms(struct list_head *terms, const char *str) return 0; } - if (data.terms) - parse_events__free_terms(data.terms); + parse_events_terms__delete(data.terms); return ret; } @@ -2078,8 +2077,10 @@ void parse_events_terms__purge(struct list_head *terms) } } -void parse_events__free_terms(struct list_head *terms) +void parse_events_terms__delete(struct list_head *terms) { + if (!terms) + return; parse_events_terms__purge(terms); free(terms); } diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index f90a04c..53628bf 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -115,7 +115,7 @@ int parse_events_term__sym_hw(struct parse_events_term **term, char *config, unsigned idx); int parse_events_term__clone(struct parse_events_term **new, struct parse_events_term *term); -void parse_events__free_terms(struct list_head *terms); +void parse_events_terms__delete(struct list_head *terms); void parse_events_terms__purge(struct list_head *terms); int parse_events__modifier_event(struct list_head *list, char *str, bool add); int parse_events__modifier_group(struct list_head *list, char *event_mod); diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index ad37996..c0eac88 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -218,7 +218,7 @@ PE_NAME '/' event_config '/' ALLOC_LIST(list); ABORT_ON(parse_events_add_pmu(data, list, $1, $3)); - parse_events__free_terms($3); + 
parse_events_terms__delete($3); $$ = list; } | @@ -246,7 +246,7 @@ PE_KERNEL_PMU_EVENT sep_dc ALLOC_LIST(list); ABORT_ON(parse_events_add_pmu(data, list, "cpu", head)); - parse_events__free_terms(head); + parse_events_terms__delete(head); $$ = list; } | @@ -266,7 +266,7 @@ PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc ALLOC_LIST(list); ABORT_ON(parse_events_add_pmu(data, list, "cpu", head)); - parse_events__free_terms(head); + parse_events_terms__delete(head); $$ = list; } @@ -285,7 +285,7 @@ value_sym '/' event_config '/' ALLOC_LIST(list); ABORT_ON(parse_events_add_numeric(data, list, type, config, $3)); - parse_events__free_terms($3); + parse_events_terms__delete($3); $$ = list; } | -- cgit v0.10.2 From 5141d7350d3d8a12f1f76b1015b937f14d2b97e2 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 5 Feb 2016 14:01:30 +0000 Subject: perf data: Fix releasing event_class A new patch in libbabeltrace [1] reveals a object leak problem in 'perf data' CTF support: perf code never releases the event_class which is allocated in add_event() and stored in evsel's private field. If libbabeltrace has the above patch applied, leaking event_class prevents the writer from being destroyed and flushing metadata. For example: $ perf record ls perf.data [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.012 MB perf.data (12 samples) ] $ perf data convert --to-ctf ./out.ctf [ perf data convert: Converted 'perf.data' into CTF data './out.ctf' ] [ perf data convert: Converted and wrote 0.000 MB (12 samples) ] $ cat ./out.ctf/metadata $ ls -l ./out.ctf/metadata -rw-r----- 1 w00229757 mm 0 Jan 27 10:49 ./out.ctf/metadata The correct result should be: ... $ cat ./out.ctf/metadata /* CTF 1.8 */ trace { [SNIP] $ ls -l ./out.ctf/metadata -rw-r----- 1 w00229757 mm 2446 Jan 27 10:52 ./out.ctf/metadata The full story is: Patch [1] of babeltrace redesigns its reference counting scheme. In that patch: * writer <- trace (bt_ctf_writer_create) * trace <- stream_class (bt_ctf_trace_add_stream_class) * stream_class <- event_class (bt_ctf_stream_class_add_event_class) ('<-' means 'is a parent of') Holding of event_class causes reference count of corresponding 'writer' to increase through parent chain. Perf expects that 'writer' is released (so metadata is flushed) through bt_ctf_writer_put() in ctf_writer__cleanup(). However, since it never releases event_class, the reference of 'writer' won't be dropped, so bt_ctf_writer_put() won't lead to the release of writer. Before this CTF patch, !(writer <- trace). Even with event_class leaking, the writer ends up being released. 
[1] https://github.com/efficios/babeltrace/commit/e6a8e8e4744633807083a077ff9f101eb97d9801 Signed-off-by: Wang Nan Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Cody P Schafer Cc: He Kuang Cc: Jeremie Galarneau Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1454680939-24963-6-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index 34cd1e4..b722e57 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -858,6 +858,23 @@ static int setup_events(struct ctf_writer *cw, struct perf_session *session) return 0; } +static void cleanup_events(struct perf_session *session) +{ + struct perf_evlist *evlist = session->evlist; + struct perf_evsel *evsel; + + evlist__for_each(evlist, evsel) { + struct evsel_priv *priv; + + priv = evsel->priv; + bt_ctf_event_class_put(priv->event_class); + zfree(&evsel->priv); + } + + perf_evlist__delete(evlist); + session->evlist = NULL; +} + static int setup_streams(struct ctf_writer *cw, struct perf_session *session) { struct ctf_stream **stream; @@ -1171,6 +1188,7 @@ int bt_convert__perf2ctf(const char *input, const char *path, bool force) (double) c.events_size / 1024.0 / 1024.0, c.events_count); + cleanup_events(session); perf_session__delete(session); ctf_writer__cleanup(cw); -- cgit v0.10.2 From a947b724069a25eae86d8dfed905374d04c3f93c Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Fri, 12 Feb 2016 09:30:17 -0800 Subject: ASoC: qcom: Don't specify LE device endianness This reverts commit 18560a4e3 (ASoC: qcom: Specify LE device endianness). The commit that caused us to specify LE device endianness here, 29bb45f25ff3 (regmap-mmio: Use native endianness for read/write, 2015-10-29), has been reverted in mainline so now when we specify LE it actively breaks big endian kernels because the byte swapping in regmap-mmio is incorrect. Let's revert this change because it will 1) fix the big endian kernels and 2) be redundant to specify LE because that will become the default soon. Signed-off-by: Stephen Boyd Signed-off-by: Mark Brown diff --git a/sound/soc/qcom/lpass-cpu.c b/sound/soc/qcom/lpass-cpu.c index 00b6c9d..e5101e0 100644 --- a/sound/soc/qcom/lpass-cpu.c +++ b/sound/soc/qcom/lpass-cpu.c @@ -355,7 +355,6 @@ static struct regmap_config lpass_cpu_regmap_config = { .readable_reg = lpass_cpu_regmap_readable, .volatile_reg = lpass_cpu_regmap_volatile, .cache_type = REGCACHE_FLAT, - .val_format_endian = REGMAP_ENDIAN_LITTLE, }; int asoc_qcom_lpass_cpu_platform_probe(struct platform_device *pdev) -- cgit v0.10.2 From 5fd7a09cfb8c6852f596c1f8c891c6158395250e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Aug 2015 18:03:23 +0200 Subject: atomic: Export fetch_or() Export fetch_or() that's implemented and used internally by the scheduler. We are going to use it for NO_HZ so make it generally available. 
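Its semantics can be illustrated in userspace with a compare-and-swap loop, where the GCC __atomic builtins stand in for the kernel's cmpxchg; the important property is that the value *before* the OR is returned, so a caller can tell whether it was the one that set a given bit.

  #include <stdio.h>

  /* Userspace stand-in for fetch_or(): atomically OR in 'mask' and return
   * the previous value. */
  static unsigned long fetch_or_ulong(unsigned long *ptr, unsigned long mask)
  {
      unsigned long old = *ptr;

      while (!__atomic_compare_exchange_n(ptr, &old, old | mask, 0,
                                          __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
          ;   /* on failure, 'old' is refreshed with the current value */

      return old;
  }

  int main(void)
  {
      unsigned long flags = 0x1;
      unsigned long old = fetch_or_ulong(&flags, 0x2);

      /* old == 0x1 (bit 1 was clear before), flags is now 0x3 */
      printf("old=%#lx new=%#lx\n", old, flags);
      return 0;
  }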
Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker diff --git a/include/linux/atomic.h b/include/linux/atomic.h index 301de78..6c502cb 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -548,6 +548,27 @@ static inline int atomic_dec_if_positive(atomic_t *v) } #endif +/** + * fetch_or - perform *ptr |= mask and return old value of *ptr + * @ptr: pointer to value + * @mask: mask to OR on the value + * + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#ifndef fetch_or +#define fetch_or(ptr, mask) \ +({ typeof(*(ptr)) __old, __val = *(ptr); \ + for (;;) { \ + __old = cmpxchg((ptr), __val, __val | (mask)); \ + if (__old == __val) \ + break; \ + __val = __old; \ + } \ + __old; \ +}) +#endif + + #ifdef CONFIG_GENERIC_ATOMIC64 #include #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9503d59..7142feb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -453,20 +453,6 @@ static inline void init_hrtick(void) } #endif /* CONFIG_SCHED_HRTICK */ -/* - * cmpxchg based fetch_or, macro so it works for different integer types - */ -#define fetch_or(ptr, val) \ -({ typeof(*(ptr)) __old, __val = *(ptr); \ - for (;;) { \ - __old = cmpxchg((ptr), __val, __val | (val)); \ - if (__old == __val) \ - break; \ - __val = __old; \ - } \ - __old; \ -}) - #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) /* * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, -- cgit v0.10.2 From 8537bb95a63e4be330359721f0ecd422f4a4c0ca Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 7 Dec 2015 16:55:23 +0100 Subject: nohz: Implement wide kick on top of irq work It simplifies it and allows wide kick to be performed, even when IRQs are disabled, without an asynchronous level in the middle. This comes at a cost of some more overhead on features like perf and posix cpu timers slow-paths, which is probably not much important for nohz full users. Requested-by: Peter Zijlstra Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 0b17424..548a4e2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -234,24 +234,20 @@ void tick_nohz_full_kick_cpu(int cpu) irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } -static void nohz_full_kick_ipi(void *info) -{ - /* Empty, the tick restart happens on tick_nohz_irq_exit() */ -} - /* * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. 
*/ void tick_nohz_full_kick_all(void) { + int cpu; + if (!tick_nohz_full_running) return; preempt_disable(); - smp_call_function_many(tick_nohz_full_mask, - nohz_full_kick_ipi, NULL, false); - tick_nohz_full_kick(); + for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) + tick_nohz_full_kick_cpu(cpu); preempt_enable(); } -- cgit v0.10.2 From e2c7698cd61f11d4077fdb28148b2d31b82ac848 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 15:51:16 +0100 Subject: x86/mm: Fix INVPCID asm constraint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So we want to specify the dependency on both @pcid and @addr so that the compiler doesn't reorder accesses to them *before* the TLB flush. But for that to work, we need to express this properly in the inline asm and deref the whole desc array, not the pointer to it. See clwb() for an example. This fixes the build error on 32-bit: arch/x86/include/asm/tlbflush.h: In function ‘__invpcid’: arch/x86/include/asm/tlbflush.h:26:18: error: memory input 0 is not directly addressable which gcc4.7 caught but 5.x didn't. Which is strange. :-\ Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Michael Matz Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index fc9a2fd..d0cce90 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -10,7 +10,7 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, unsigned long type) { - u64 desc[2] = { pcid, addr }; + struct { u64 d[2]; } desc = { { pcid, addr } }; /* * The memory clobber is because the whole point is to invalidate @@ -22,7 +22,7 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, * invpcid (%rcx), %rax in long mode. */ asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" - : : "m" (desc), "a" (type), "c" (desc) : "memory"); + : : "m" (desc), "a" (type), "c" (&desc) : "memory"); } #define INVPCID_TYPE_INDIV_ADDR 0 -- cgit v0.10.2 From f944b5a7aff05a244a6c8cac297819af09a199e4 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 14 Jan 2016 10:54:13 +0100 Subject: genirq: Use a common macro to go through the actions list The irq code browses the list of actions differently to inspect the element one by one. Even if it is not a problem, for the sake of consistent code, provide a macro similar to for_each_irq_desc in order to have the same loop to go through the actions list and use it in the code. 
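The pattern is easy to show in isolation; the structures below are illustrative stand-ins, not the genirq internals, but the macro has the same shape as the new for_each_action_of_desc().

  #include <stdio.h>

  struct action {
      const char *name;
      struct action *next;
  };

  struct desc {
      struct action *action;
  };

  /* Hide the open-coded "walk ->next until NULL" loop behind a macro. */
  #define for_each_action_of_desc(desc, act) \
      for ((act) = (desc)->action; (act); (act) = (act)->next)

  int main(void)
  {
      struct action a2 = { .name = "second", .next = NULL };
      struct action a1 = { .name = "first",  .next = &a2 };
      struct desc d = { .action = &a1 };
      struct action *act;

      for_each_action_of_desc(&d, act)
          printf("%s\n", act->name);
      return 0;
  }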
[ tglx: Renamed the macro ] Signed-off-by: Daniel Lezcano Link: http://lkml.kernel.org/r/1452765253-31148-1-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Thomas Gleixner diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 57bff78..a15b548 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -136,10 +136,9 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval = IRQ_NONE; unsigned int flags = 0, irq = desc->irq_data.irq; - struct irqaction *action = desc->action; + struct irqaction *action; - /* action might have become NULL since we dropped the lock */ - while (action) { + for_each_action_of_desc(desc, action) { irqreturn_t res; trace_irq_handler_entry(irq, action); @@ -173,7 +172,6 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) } retval |= res; - action = action->next; } add_interrupt_randomness(irq, flags); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fcab63c..eab521f 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -131,6 +131,9 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) #define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK) #define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU) +#define for_each_action_of_desc(desc, act) \ + for (act = desc->act; act; act = act->next) + struct irq_desc * __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, unsigned int check); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8411872..3ddd229 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -144,13 +144,11 @@ int irq_can_set_affinity(unsigned int irq) */ void irq_set_thread_affinity(struct irq_desc *desc) { - struct irqaction *action = desc->action; + struct irqaction *action; - while (action) { + for_each_action_of_desc(desc, action) if (action->thread) set_bit(IRQTF_AFFINITY, &action->thread_flags); - action = action->next; - } } #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -994,7 +992,7 @@ void irq_wake_thread(unsigned int irq, void *dev_id) return; raw_spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action; action; action = action->next) { + for_each_action_of_desc(desc, action) { if (action->dev_id == dev_id) { if (action->thread) __irq_wake_thread(desc, action); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a2c02fd..4e1b947 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -291,7 +291,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) int ret = 1; raw_spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action ; action; action = action->next) { + for_each_action_of_desc(desc, action) { if ((action != new_action) && action->name && !strcmp(new_action->name, action->name)) { ret = 0; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 3214417..5707f97 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -211,14 +211,12 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) * desc->lock here. See synchronize_irq(). 
*/ raw_spin_lock_irqsave(&desc->lock, flags); - action = desc->action; - while (action) { + for_each_action_of_desc(desc, action) { printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); if (action->thread_fn) printk(KERN_CONT " threaded [<%p>] %pf", action->thread_fn, action->thread_fn); printk(KERN_CONT "\n"); - action = action->next; } raw_spin_unlock_irqrestore(&desc->lock, flags); } -- cgit v0.10.2 From b8659adda9e295eca0b10a67f3b15a8644c8ed6f Mon Sep 17 00:00:00 2001 From: Shawn Lin Date: Mon, 15 Feb 2016 16:27:58 +0800 Subject: spi: rockchip: disable runtime pm when in err case Before registering master, driver enables runtime pm. This patch pm_runtime_disable in err case while probing driver to balance pm reference count. Signed-off-by: Shawn Lin Signed-off-by: Mark Brown diff --git a/drivers/spi/spi-rockchip.c b/drivers/spi/spi-rockchip.c index 79a8bc4..c0f9bae 100644 --- a/drivers/spi/spi-rockchip.c +++ b/drivers/spi/spi-rockchip.c @@ -749,6 +749,7 @@ static int rockchip_spi_probe(struct platform_device *pdev) return 0; err_register_master: + pm_runtime_disable(&pdev->dev); if (rs->dma_tx.ch) dma_release_channel(rs->dma_tx.ch); if (rs->dma_rx.ch) -- cgit v0.10.2 From 844c9f476a43db0bdf61df409026a026ce98ec1b Mon Sep 17 00:00:00 2001 From: Shawn Lin Date: Mon, 15 Feb 2016 16:28:12 +0800 Subject: spi: rockchip: add missing spi_master_put Add missing spi_master_put for rockchip_spi_remove since it calls spi_master_get already. Signed-off-by: Shawn Lin Signed-off-by: Mark Brown diff --git a/drivers/spi/spi-rockchip.c b/drivers/spi/spi-rockchip.c index c0f9bae..7cb1b2d 100644 --- a/drivers/spi/spi-rockchip.c +++ b/drivers/spi/spi-rockchip.c @@ -779,6 +779,8 @@ static int rockchip_spi_remove(struct platform_device *pdev) if (rs->dma_rx.ch) dma_release_channel(rs->dma_rx.ch); + spi_master_put(master); + return 0; } -- cgit v0.10.2 From 1ad826bad5bd0b6ccfb203f78c70302b764df0be Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 12 Feb 2016 18:30:01 -0300 Subject: perf tests: Fix build on older systems where 'signal' is reserved MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fixing the following problems, for instance, on RHEL6.7: CC /tmp/build/perf/tests/bp_signal.o cc1: warnings being treated as errors tests/bp_signal.c: In function ‘__event’: tests/bp_signal.c:106: error: declaration of ‘signal’ shadows a global declaration /usr/include/signal.h:101: error: shadowed declaration is here tests/bp_signal.c: In function ‘bp_event’: tests/bp_signal.c:144: error: declaration of ‘signal’ shadows a global declaration /usr/include/signal.h:101: error: shadowed declaration is here tests/bp_signal.c: In function ‘wp_event’: tests/bp_signal.c:149: error: declaration of ‘signal’ shadows a global declaration /usr/include/signal.h:101: error: shadowed declaration is here mv: cannot stat `/tmp/build/perf/tests/.bp_signal.o.tmp': No such file or directory make[3]: *** [/tmp/build/perf/tests/bp_signal.o] Error 1 make[2]: *** [tests] Error 2 make[1]: *** [/tmp/build/perf/perf-in.o] Error 2 make[1]: *** Waiting for unfinished jobs.... 
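The fix is simply to rename the offending parameters so they no longer shadow the libc declaration; a minimal illustration follows, with set_sig() as a made-up stand-in for the test's __event()/bp_event()/wp_event() helpers.

  #include <signal.h>   /* declares signal(), which "int signal" would shadow */
  #include <stdio.h>

  /* Parameter renamed from "signal" to "sig" to avoid -Wshadow/-Werror
   * breakage on older toolchains (made-up helper, not the test code). */
  static int set_sig(int sig)
  {
      printf("would arm F_SETSIG with signal %d\n", sig);
      return 0;
  }

  int main(void)
  {
      return set_sig(SIGIO);
  }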
Reported-by: Vinson Lee Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Daniel Borkmann Cc: He Kuang Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Cc: Will Deacon Cc: pi3orama@163.com Fixes: 8fd34e1cce18 ("perf test: Improve bp_signal") Link: http://lkml.kernel.org/n/tip-wlpx6tik1b0jirlkw64bv400@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/tests/bp_signal.c b/tools/perf/tests/bp_signal.c index 1d1bb48..e7664fe 100644 --- a/tools/perf/tests/bp_signal.c +++ b/tools/perf/tests/bp_signal.c @@ -103,7 +103,7 @@ static void sig_handler(int signum __maybe_unused, } } -static int __event(bool is_x, void *addr, int signal) +static int __event(bool is_x, void *addr, int sig) { struct perf_event_attr pe; int fd; @@ -133,7 +133,7 @@ static int __event(bool is_x, void *addr, int signal) } fcntl(fd, F_SETFL, O_RDWR|O_NONBLOCK|O_ASYNC); - fcntl(fd, F_SETSIG, signal); + fcntl(fd, F_SETSIG, sig); fcntl(fd, F_SETOWN, getpid()); ioctl(fd, PERF_EVENT_IOC_RESET, 0); @@ -141,14 +141,14 @@ static int __event(bool is_x, void *addr, int signal) return fd; } -static int bp_event(void *addr, int signal) +static int bp_event(void *addr, int sig) { - return __event(true, addr, signal); + return __event(true, addr, sig); } -static int wp_event(void *addr, int signal) +static int wp_event(void *addr, int sig) { - return __event(false, addr, signal); + return __event(false, addr, sig); } static long long bp_count(int fd) -- cgit v0.10.2 From f2cc8e0791c70833758101d9756609a08dd601ec Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 16 Feb 2016 00:19:18 +0100 Subject: x86/cpufeature: Speed up cpu_feature_enabled() When GCC cannot do constant folding for this macro, it falls back to cpu_has(). But static_cpu_has() is optimal and it works at all times now. So use it and speedup the fallback case. Before we had this: mov 0x99d674(%rip),%rdx # ffffffff81b0d9f4 shr $0x2e,%rdx and $0x1,%edx jne ffffffff811704e9 After alternatives patching, it turns into: jmp 0xffffffff81170390 nopl (%rax) ... callq ffffffff81056e00 ffffffff81170390: mov 0x170(%r12),%rdi Signed-off-by: Borislav Petkov Cc: Joerg Roedel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455578358-28347-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 9fba7a5..68e4e82 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -88,8 +88,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; * is not relevant. */ #define cpu_feature_enabled(bit) \ - (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : \ - cpu_has(&boot_cpu_data, bit)) + (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit)) #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) -- cgit v0.10.2 From 7f5301b7e66a1fd096b5d10dbb0bb2a8832516b4 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 14 Feb 2016 18:09:52 -0500 Subject: x86/platform: Make platform/intel-quark/imr.c explicitly non-modular The Kconfig currently controlling compilation of this code is: drivers/platform/x86/Kconfig:config INTEL_IMR drivers/platform/x86/Kconfig: bool "Intel Isolated Memory Region support" ...meaning that it currently is not being built as a module by anyone. Lets remove the modular code that is essentially orphaned, so that when reading the driver there is no doubt it is builtin-only. 
Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. Also note that MODULE_DEVICE_TABLE is a no-op for non-modular code. We also delete the MODULE_LICENSE tag etc. since all that information was (or is now) contained at the top of the file in the comments. Signed-off-by: Paul Gortmaker Reviewed-by: Bryan O'Donoghue Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455491396-30977-2-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index c61b6c3..0a3736f 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -1,5 +1,5 @@ /** - * imr.c + * imr.c -- Intel Isolated Memory Region driver * * Copyright(c) 2013 Intel Corporation. * Copyright(c) 2015 Bryan O'Donoghue @@ -31,7 +31,6 @@ #include #include #include -#include #include struct imr_device { @@ -270,17 +269,6 @@ static int imr_debugfs_register(struct imr_device *idev) } /** - * imr_debugfs_unregister - unregister debugfs hooks. - * - * @idev: pointer to imr_device structure. - * @return: - */ -static void imr_debugfs_unregister(struct imr_device *idev) -{ - debugfs_remove(idev->file); -} - -/** * imr_check_params - check passed address range IMR alignment and non-zero size * * @base: base address of intended IMR. @@ -614,7 +602,6 @@ static const struct x86_cpu_id imr_ids[] __initconst = { { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */ {} }; -MODULE_DEVICE_TABLE(x86cpu, imr_ids); /** * imr_init - entry point for IMR driver. @@ -640,22 +627,4 @@ static int __init imr_init(void) imr_fixup_memmap(idev); return 0; } - -/** - * imr_exit - exit point for IMR code. - * - * Deregisters debugfs, leave IMR state as-is. - * - * return: - */ -static void __exit imr_exit(void) -{ - imr_debugfs_unregister(&imr_dev); -} - -module_init(imr_init); -module_exit(imr_exit); - -MODULE_AUTHOR("Bryan O'Donoghue "); -MODULE_DESCRIPTION("Intel Isolated Memory Region driver"); -MODULE_LICENSE("Dual BSD/GPL"); +device_initcall(imr_init); -- cgit v0.10.2 From 32ed42ad6c42d6e0e78914a25c216fbbd595e74d Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 14 Feb 2016 18:09:53 -0500 Subject: x86/platform: Make platform/intel-quark/imr_selftest.c explicitly non-modular The Kconfig currently controlling compilation of this code is: arch/x86/Kconfig.debug:config DEBUG_IMR_SELFTEST arch/x86/Kconfig.debug: bool "Isolated Memory Region self test" ...meaning that it currently is not being built as a module by anyone. Lets remove the modular code that is essentially orphaned, so that when reading the driver there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. Also note that MODULE_DEVICE_TABLE is a no-op for non-modular code. We also delete the MODULE_LICENSE tag etc. since all that information was (or is now) contained at the top of the file in the comments. 
Signed-off-by: Paul Gortmaker Reviewed-by: Bryan O'Donoghue Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455491396-30977-3-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c index 278e4da..0381343 100644 --- a/arch/x86/platform/intel-quark/imr_selftest.c +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -1,5 +1,5 @@ /** - * imr_selftest.c + * imr_selftest.c -- Intel Isolated Memory Region self-test driver * * Copyright(c) 2013 Intel Corporation. * Copyright(c) 2015 Bryan O'Donoghue @@ -15,7 +15,6 @@ #include #include #include -#include #include #define SELFTEST KBUILD_MODNAME ": " @@ -106,7 +105,6 @@ static const struct x86_cpu_id imr_ids[] __initconst = { { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */ {} }; -MODULE_DEVICE_TABLE(x86cpu, imr_ids); /** * imr_self_test_init - entry point for IMR driver. @@ -125,13 +123,4 @@ static int __init imr_self_test_init(void) * * return: */ -static void __exit imr_self_test_exit(void) -{ -} - -module_init(imr_self_test_init); -module_exit(imr_self_test_exit); - -MODULE_AUTHOR("Bryan O'Donoghue "); -MODULE_DESCRIPTION("Intel Isolated Memory Region self-test driver"); -MODULE_LICENSE("Dual BSD/GPL"); +device_initcall(imr_self_test_init); -- cgit v0.10.2 From eb61aee743ec647f22071479a5137241a7dcab17 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 14 Feb 2016 18:09:54 -0500 Subject: x86/platform: Make platform/geode/geos.c explicitly non-modular The Kconfig currently controlling compilation of this code is: arch/x86/Kconfig:config GEOS arch/x86/Kconfig: bool "Traverse Technologies GEOS System Support (LEDS, GPIO, etc)" ...meaning that it currently is not being built as a module by anyone. Lets remove the couple traces of modularity, so that when reading the code there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We also delete the MODULE_LICENSE tag etc. since all that information is already contained at the top of the file in the comments. Signed-off-by: Paul Gortmaker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Philip Prindeville Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455491396-30977-4-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c index aa733fb..4fcdb91 100644 --- a/arch/x86/platform/geode/geos.c +++ b/arch/x86/platform/geode/geos.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -120,9 +119,4 @@ static int __init geos_init(void) return 0; } - -module_init(geos_init); - -MODULE_AUTHOR("Philip Prindeville "); -MODULE_DESCRIPTION("Traverse Technologies Geos System Setup"); -MODULE_LICENSE("GPL"); +device_initcall(geos_init); -- cgit v0.10.2 From 52d856e88171e1ec000d0479363c1e4e81991130 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 14 Feb 2016 18:09:55 -0500 Subject: x86/platform: Make platform/geode/alix.c explicitly non-modular The Kconfig currently controlling compilation of this code is: arch/x86/Kconfig:config ALIX arch/x86/Kconfig: bool "PCEngines ALIX System Support (LED setup)" ...meaning that it currently is not being built as a module by anyone. Lets remove the modular code that is essentially orphaned, so that when reading the driver there is no doubt it is builtin-only. 
Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We replace module.h with moduleparam.h since the file does declare some module parameters, and leaving them as such is currently the easiest way to remain compatible with existing boot arg use cases. We also delete the MODULE_LICENSE tag etc. since all that information is already contained at the top of the file in the comments. Signed-off-by: Paul Gortmaker Cc: Ed Wildgoose Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455491396-30977-5-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c index 76b6632..1865c19 100644 --- a/arch/x86/platform/geode/alix.c +++ b/arch/x86/platform/geode/alix.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -35,6 +35,11 @@ #define BIOS_SIGNATURE_COREBOOT 0x500 #define BIOS_REGION_SIZE 0x10000 +/* + * This driver is not modular, but to keep back compatibility + * with existing use cases, continuing with module_param is + * the easiest way forward. + */ static bool force = 0; module_param(force, bool, 0444); /* FIXME: Award bios is not automatically detected as Alix platform */ @@ -192,9 +197,4 @@ static int __init alix_init(void) return 0; } - -module_init(alix_init); - -MODULE_AUTHOR("Ed Wildgoose "); -MODULE_DESCRIPTION("PCEngines ALIX System Setup"); -MODULE_LICENSE("GPL"); +device_initcall(alix_init); -- cgit v0.10.2 From 605a46ee8353e8292e93baa5dc13e6be98bbec43 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 14 Feb 2016 18:09:56 -0500 Subject: x86/platform: Make platform/geode/net5501.c explicitly non-modular The Kconfig currently controlling compilation of this code is: arch/x86/Kconfig:config NET5501 arch/x86/Kconfig: bool "Soekris Engineering net5501 System Support (LEDS, GPIO, etc)" ...meaning that it currently is not being built as a module by anyone. Lets remove the couple traces of modularity, so that when reading the driver there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We also delete the MODULE_LICENSE tag etc. since all that information is already contained at the top of the file in the comments. Signed-off-by: Paul Gortmaker Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Philip Prindeville Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455491396-30977-6-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c index 927e38c..a2f6b98 100644 --- a/arch/x86/platform/geode/net5501.c +++ b/arch/x86/platform/geode/net5501.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -146,9 +145,4 @@ static int __init net5501_init(void) return 0; } - -module_init(net5501_init); - -MODULE_AUTHOR("Philip Prindeville "); -MODULE_DESCRIPTION("Soekris net5501 System Setup"); -MODULE_LICENSE("GPL"); +device_initcall(net5501_init); -- cgit v0.10.2 From 25b4caf7c50e8c501310e8c515d8518b1850c948 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Sun, 14 Feb 2016 13:35:58 +0100 Subject: x86/boot: Remove unused 'is_big_kernel' variable Variable is_big_kernel is defined in arch/x86/boot/tools/build.c but never used anywhere. 
Boris noted that its usage went away 7 years ago, as of: 5e47c478b0b6 ("x86: remove zImage support") Signed-off-by: Nicolas Iooss Reviewed-by: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455453358-4088-1-git-send-email-nicolas.iooss_linux@m4x.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index a7661c4..0702d25 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -49,7 +49,6 @@ typedef unsigned int u32; /* This must be large enough to hold the entire setup */ u8 buf[SETUP_SECT_MAX*512]; -int is_big_kernel; #define PECOFF_RELOC_RESERVE 0x20 -- cgit v0.10.2 From fb896c44f88a75843a072cd6961b1615732f7811 Mon Sep 17 00:00:00 2001 From: Liad Kaufman Date: Sun, 14 Feb 2016 15:32:58 +0200 Subject: iwlwifi: mvm: inc pending frames counter also when txing non-sta Until this patch, when TXing non-sta the pending_frames counter wasn't increased, but it WAS decreased in iwl_mvm_rx_tx_cmd_single(), which makes it negative under certain conditions. This in turn caused a lot of trouble when we needed to remove the station, since we keep waiting for pending_frames to reach 0, which a negative counter never does. In certain cases, we were exhausting the station table even in BSS mode, because we had a lot of stale stations. Increase the counter also in iwl_mvm_tx_skb_non_sta() after a successful TX to avoid this outcome. CC: [3.18+] Signed-off-by: Liad Kaufman Signed-off-by: Emmanuel Grumbach diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c index 0914ec2..a040edc 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c @@ -423,6 +423,15 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb) return -1; } + /* + * Increase the pending frames counter, so that later when a reply comes + * in and the counter is decreased - we don't start getting negative + * values. + * Note that we don't need to make sure it isn't agg'd, since we're + * TXing non-sta + */ + atomic_inc(&mvm->pending_frames[sta_id]); + return 0; } -- cgit v0.10.2 From 1f4522400e7e49464da5e584a463bd315283790f Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Thu, 11 Feb 2016 12:28:55 -0200 Subject: [media] [for,v4.5] media.h: increase the spacing between function ranges Each function range is quite narrow, and especially for connectors this will pose a problem. Increase the function ranges while we still can and move the connector range to the end so that its range is practically limitless.
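To make the renumbering concrete, the sketch below lines up one old and one new value per range; the _OLD/_NEW names are invented here purely for side-by-side comparison, and the authoritative values are in the media.h hunk below.

/* Illustration only; see the include/uapi/linux/media.h hunk below. */
#define MEDIA_ENT_F_IO_DTV_OLD   (MEDIA_ENT_F_BASE + 31)    /* cramped I/O range        */
#define MEDIA_ENT_F_IO_DTV_NEW   (MEDIA_ENT_F_BASE + 1001)  /* widened I/O range        */
#define MEDIA_ENT_F_CONN_RF_OLD  (MEDIA_ENT_F_BASE + 21)    /* old connector range      */
#define MEDIA_ENT_F_CONN_RF_NEW  (MEDIA_ENT_F_BASE + 10001) /* connectors now come last */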
[mchehab@osg.samsung.com: Rebased to apply at Linus tree] Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h index 1e3c8cb..8b87bdd 100644 --- a/include/uapi/linux/media.h +++ b/include/uapi/linux/media.h @@ -72,21 +72,21 @@ struct media_device_info { #define MEDIA_ENT_F_DTV_NET_DECAP (MEDIA_ENT_F_BASE + 4) /* - * Connectors + * I/O entities */ -/* It is a responsibility of the entity drivers to add connectors and links */ -#define MEDIA_ENT_F_CONN_RF (MEDIA_ENT_F_BASE + 21) -#define MEDIA_ENT_F_CONN_SVIDEO (MEDIA_ENT_F_BASE + 22) -#define MEDIA_ENT_F_CONN_COMPOSITE (MEDIA_ENT_F_BASE + 23) -/* For internal test signal generators and other debug connectors */ -#define MEDIA_ENT_F_CONN_TEST (MEDIA_ENT_F_BASE + 24) +#define MEDIA_ENT_F_IO_DTV (MEDIA_ENT_F_BASE + 1001) +#define MEDIA_ENT_F_IO_VBI (MEDIA_ENT_F_BASE + 1002) +#define MEDIA_ENT_F_IO_SWRADIO (MEDIA_ENT_F_BASE + 1003) /* - * I/O entities + * Connectors */ -#define MEDIA_ENT_F_IO_DTV (MEDIA_ENT_F_BASE + 31) -#define MEDIA_ENT_F_IO_VBI (MEDIA_ENT_F_BASE + 32) -#define MEDIA_ENT_F_IO_SWRADIO (MEDIA_ENT_F_BASE + 33) +/* It is a responsibility of the entity drivers to add connectors and links */ +#define MEDIA_ENT_F_CONN_RF (MEDIA_ENT_F_BASE + 10001) +#define MEDIA_ENT_F_CONN_SVIDEO (MEDIA_ENT_F_BASE + 10002) +#define MEDIA_ENT_F_CONN_COMPOSITE (MEDIA_ENT_F_BASE + 10003) +/* For internal test signal generators and other debug connectors */ +#define MEDIA_ENT_F_CONN_TEST (MEDIA_ENT_F_BASE + 10004) /* * Don't touch on those. The ranges MEDIA_ENT_F_OLD_BASE and -- cgit v0.10.2 From 9727a9545adec59f4bccb83d1a709711f4acf242 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 12 Feb 2016 15:44:31 -0200 Subject: [media] media.h: get rid of MEDIA_ENT_F_CONN_TEST Defining it as a connector was a bad idea. Remove it while it is not too late. Signed-off-by: Mauro Carvalho Chehab diff --git a/Documentation/DocBook/media/v4l/media-types.xml b/Documentation/DocBook/media/v4l/media-types.xml index 1af3842..0ee0f33 100644 --- a/Documentation/DocBook/media/v4l/media-types.xml +++ b/Documentation/DocBook/media/v4l/media-types.xml @@ -57,10 +57,6 @@ Connector for a RGB composite signal. - MEDIA_ENT_F_CONN_TEST - Connector for a test generator. - - MEDIA_ENT_F_CAM_SENSOR Camera video sensor entity. diff --git a/drivers/media/usb/au0828/au0828-video.c b/drivers/media/usb/au0828/au0828-video.c index 8c54fd2..a136257 100644 --- a/drivers/media/usb/au0828/au0828-video.c +++ b/drivers/media/usb/au0828/au0828-video.c @@ -1843,8 +1843,7 @@ static void au0828_analog_create_entities(struct au0828_dev *dev) ent->function = MEDIA_ENT_F_CONN_RF; break; default: /* AU0828_VMUX_DEBUG */ - ent->function = MEDIA_ENT_F_CONN_TEST; - break; + continue; } ret = media_entity_pads_init(ent, 1, &dev->input_pad[i]); diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h index 8b87bdd..f032852 100644 --- a/include/uapi/linux/media.h +++ b/include/uapi/linux/media.h @@ -85,8 +85,6 @@ struct media_device_info { #define MEDIA_ENT_F_CONN_RF (MEDIA_ENT_F_BASE + 10001) #define MEDIA_ENT_F_CONN_SVIDEO (MEDIA_ENT_F_BASE + 10002) #define MEDIA_ENT_F_CONN_COMPOSITE (MEDIA_ENT_F_BASE + 10003) -/* For internal test signal generators and other debug connectors */ -#define MEDIA_ENT_F_CONN_TEST (MEDIA_ENT_F_BASE + 10004) /* * Don't touch on those. 
The ranges MEDIA_ENT_F_OLD_BASE and -- cgit v0.10.2 From 0ba4581c84cfb39fd527f6b3457f1c97f6356c04 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 10 Feb 2016 08:09:10 -0200 Subject: [media] adv7604: fix tx 5v detect regression The 5 volt detect functionality broke in 3.14: the code reads IO register 0x70 again after it has already been cleared. Instead it should use the cached irq_reg_0x70 value and the io_write to 0x71 to clear 0x70 can be dropped since this has already been done. Signed-off-by: Hans Verkuil Cc: # for v3.14 and up Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/i2c/adv7604.c b/drivers/media/i2c/adv7604.c index f8dd750..e1719ff 100644 --- a/drivers/media/i2c/adv7604.c +++ b/drivers/media/i2c/adv7604.c @@ -1960,10 +1960,9 @@ static int adv76xx_isr(struct v4l2_subdev *sd, u32 status, bool *handled) } /* tx 5v detect */ - tx_5v = io_read(sd, 0x70) & info->cable_det_mask; + tx_5v = irq_reg_0x70 & info->cable_det_mask; if (tx_5v) { v4l2_dbg(1, debug, sd, "%s: tx_5v: 0x%x\n", __func__, tx_5v); - io_write(sd, 0x71, tx_5v); adv76xx_s_detect_tx_5v_ctrl(sd); if (handled) *handled = true; -- cgit v0.10.2 From fed6d3363182fd499cd7f70cf424cd3cba7aef26 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 10 Feb 2016 15:46:56 +0100 Subject: irqchip/armada-370-xp: Add Kconfig option for the driver Instead of building the irq-armada-370-xp driver directly when CONFIG_ARCH_MVEBU is enabled, this commit introduces an intermediate CONFIG_ARMADA_370_XP_IRQ hidden Kconfig option. This allows this option to select other interrupt-related Kconfig options (which will be needed in follow-up commits) rather than having such selects done from arch/arm/mach-/. Signed-off-by: Thomas Petazzoni Acked-by: Gregory CLEMENT Link: https://lkml.kernel.org/r/1455115621-22846-2-git-send-email-thomas.petazzoni@free-electrons.com Signed-off-by: Jason Cooper diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index fb50911..dfe4ed8 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -60,6 +60,11 @@ config ARM_VIC_NR The maximum number of VICs available in the system, for power management. 
+config ARMADA_370_XP_IRQ + bool + default y if ARCH_MVEBU + select GENERIC_IRQ_CHIP + config ATMEL_AIC_IRQ bool select GENERIC_IRQ_CHIP diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 18caacb..30dba04 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -5,7 +5,6 @@ obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2836.o obj-$(CONFIG_ARCH_EXYNOS) += exynos-combiner.o obj-$(CONFIG_ARCH_HIP04) += irq-hip04.o obj-$(CONFIG_ARCH_MMP) += irq-mmp.o -obj-$(CONFIG_ARCH_MVEBU) += irq-armada-370-xp.o obj-$(CONFIG_IRQ_MXS) += irq-mxs.o obj-$(CONFIG_ARCH_TEGRA) += irq-tegra.o obj-$(CONFIG_ARCH_S3C24XX) += irq-s3c24xx.o @@ -28,6 +27,7 @@ obj-$(CONFIG_ARM_GIC_V3_ITS) += irq-gic-v3-its.o irq-gic-v3-its-pci-msi.o irq-g obj-$(CONFIG_HISILICON_IRQ_MBIGEN) += irq-mbigen.o obj-$(CONFIG_ARM_NVIC) += irq-nvic.o obj-$(CONFIG_ARM_VIC) += irq-vic.o +obj-$(CONFIG_ARMADA_370_XP_IRQ) += irq-armada-370-xp.o obj-$(CONFIG_ATMEL_AIC_IRQ) += irq-atmel-aic-common.o irq-atmel-aic.o obj-$(CONFIG_ATMEL_AIC5_IRQ) += irq-atmel-aic-common.o irq-atmel-aic5.o obj-$(CONFIG_I8259) += irq-i8259.o -- cgit v0.10.2 From fcc392d501bd2befdf35180abcf07b4849499ed6 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 10 Feb 2016 15:46:57 +0100 Subject: irqchip/armada-370-xp: Use the generic MSI infrastructure This commit moves the irq-armada-370-xp driver from using the PCI-specific MSI infrastructure to the generic MSI infrastructure, to which drivers are progressively converted. In this hardware, the MSI controller is directly bundled inside the interrupt controller, so we have a single Device Tree node to which multiple IRQ domains are attached: the wired interrupt domain and the MSI interrupt domain. In order to ensure that they can be differentiated, we have to force the bus_token of the wired interrupt domain to be DOMAIN_BUS_WIRED. The MSI domain bus_token is automatically set to the appropriate value by pci_msi_create_irq_domain().
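Condensed, the registration sequence described above looks roughly like the sketch below. Names follow the driver as changed in the hunk further down; error handling and unrelated setup are omitted, so treat this as a sketch of the idea rather than the actual code.

#include <linux/irqdomain.h>
#include <linux/msi.h>
#include <linux/of.h>

static struct irq_domain *inner_domain, *msi_domain, *wired_domain;

static int armada_msi_sketch_init(struct device_node *node, unsigned int nr_irqs)
{
	/* the inner domain owns the MSI doorbell hwirqs... */
	inner_domain = irq_domain_add_linear(NULL, PCI_MSI_DOORBELL_NR,
					     &armada_370_xp_msi_domain_ops, NULL);

	/* ...and the generic PCI/MSI layer stacks its domain on top of it */
	msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(node),
					       &armada_370_xp_msi_domain_info,
					       inner_domain);

	/* the wired domain shares the same DT node, so tag it explicitly */
	wired_domain = irq_domain_add_linear(node, nr_irqs,
					     &armada_370_xp_mpic_irq_ops, NULL);
	wired_domain->bus_token = DOMAIN_BUS_WIRED;

	return 0;
}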
Signed-off-by: Thomas Petazzoni Suggested-by: Marc Zyngier Reviewed-by: Marc Zyngier Link: https://lkml.kernel.org/r/1455115621-22846-3-git-send-email-thomas.petazzoni@free-electrons.com Signed-off-by: Jason Cooper diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index dfe4ed8..ba6a084 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -64,6 +64,7 @@ config ARMADA_370_XP_IRQ bool default y if ARCH_MVEBU select GENERIC_IRQ_CHIP + select PCI_MSI_IRQ_DOMAIN if PCI_MSI config ATMEL_AIC_IRQ bool diff --git a/drivers/irqchip/irq-armada-370-xp.c b/drivers/irqchip/irq-armada-370-xp.c index 3f3a8c3..e5738c5 100644 --- a/drivers/irqchip/irq-armada-370-xp.c +++ b/drivers/irqchip/irq-armada-370-xp.c @@ -71,6 +71,7 @@ static u32 doorbell_mask_reg; static int parent_irq; #ifdef CONFIG_PCI_MSI static struct irq_domain *armada_370_xp_msi_domain; +static struct irq_domain *armada_370_xp_msi_inner_domain; static DECLARE_BITMAP(msi_used, PCI_MSI_DOORBELL_NR); static DEFINE_MUTEX(msi_used_lock); static phys_addr_t msi_doorbell_addr; @@ -115,127 +116,99 @@ static void armada_370_xp_irq_unmask(struct irq_data *d) #ifdef CONFIG_PCI_MSI -static int armada_370_xp_alloc_msi(void) -{ - int hwirq; - - mutex_lock(&msi_used_lock); - hwirq = find_first_zero_bit(&msi_used, PCI_MSI_DOORBELL_NR); - if (hwirq >= PCI_MSI_DOORBELL_NR) - hwirq = -ENOSPC; - else - set_bit(hwirq, msi_used); - mutex_unlock(&msi_used_lock); +static struct irq_chip armada_370_xp_msi_irq_chip = { + .name = "armada_370_xp_msi_irq", + .irq_mask = pci_msi_mask_irq, + .irq_unmask = pci_msi_unmask_irq, +}; - return hwirq; -} +static struct msi_domain_info armada_370_xp_msi_domain_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS), + .chip = &armada_370_xp_msi_irq_chip, +}; -static void armada_370_xp_free_msi(int hwirq) +static void armada_370_xp_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { - mutex_lock(&msi_used_lock); - if (!test_bit(hwirq, msi_used)) - pr_err("trying to free unused MSI#%d\n", hwirq); - else - clear_bit(hwirq, msi_used); - mutex_unlock(&msi_used_lock); + msg->address_lo = lower_32_bits(msi_doorbell_addr); + msg->address_hi = upper_32_bits(msi_doorbell_addr); + msg->data = 0xf00 | (data->hwirq + PCI_MSI_DOORBELL_START); } -static int armada_370_xp_setup_msi_irq(struct msi_controller *chip, - struct pci_dev *pdev, - struct msi_desc *desc) +static int armada_370_xp_msi_set_affinity(struct irq_data *irq_data, + const struct cpumask *mask, bool force) { - struct msi_msg msg; - int virq, hwirq; + return -EINVAL; +} - /* We support MSI, but not MSI-X */ - if (desc->msi_attrib.is_msix) - return -EINVAL; +static struct irq_chip armada_370_xp_msi_bottom_irq_chip = { + .name = "armada_370_xp_msi_irq", + .irq_compose_msi_msg = armada_370_xp_compose_msi_msg, + .irq_set_affinity = armada_370_xp_msi_set_affinity, +}; - hwirq = armada_370_xp_alloc_msi(); - if (hwirq < 0) - return hwirq; +static int armada_370_xp_msi_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *args) +{ + int hwirq; - virq = irq_create_mapping(armada_370_xp_msi_domain, hwirq); - if (!virq) { - armada_370_xp_free_msi(hwirq); - return -EINVAL; + mutex_lock(&msi_used_lock); + hwirq = find_first_zero_bit(&msi_used, PCI_MSI_DOORBELL_NR); + if (hwirq >= PCI_MSI_DOORBELL_NR) { + mutex_unlock(&msi_used_lock); + return -ENOSPC; } - irq_set_msi_desc(virq, desc); + set_bit(hwirq, msi_used); + mutex_unlock(&msi_used_lock); - msg.address_lo = msi_doorbell_addr; - msg.address_hi = 0; - 
msg.data = 0xf00 | (hwirq + 16); + irq_domain_set_info(domain, virq, hwirq, &armada_370_xp_msi_bottom_irq_chip, + domain->host_data, handle_simple_irq, + NULL, NULL); - pci_write_msi_msg(virq, &msg); - return 0; + return hwirq; } -static void armada_370_xp_teardown_msi_irq(struct msi_controller *chip, - unsigned int irq) +static void armada_370_xp_msi_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) { - struct irq_data *d = irq_get_irq_data(irq); - unsigned long hwirq = d->hwirq; - - irq_dispose_mapping(irq); - armada_370_xp_free_msi(hwirq); -} - -static struct irq_chip armada_370_xp_msi_irq_chip = { - .name = "armada_370_xp_msi_irq", - .irq_enable = pci_msi_unmask_irq, - .irq_disable = pci_msi_mask_irq, - .irq_mask = pci_msi_mask_irq, - .irq_unmask = pci_msi_unmask_irq, -}; + struct irq_data *d = irq_domain_get_irq_data(domain, virq); -static int armada_370_xp_msi_map(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hw) -{ - irq_set_chip_and_handler(virq, &armada_370_xp_msi_irq_chip, - handle_simple_irq); - - return 0; + mutex_lock(&msi_used_lock); + if (!test_bit(d->hwirq, msi_used)) + pr_err("trying to free unused MSI#%lu\n", d->hwirq); + else + clear_bit(d->hwirq, msi_used); + mutex_unlock(&msi_used_lock); } -static const struct irq_domain_ops armada_370_xp_msi_irq_ops = { - .map = armada_370_xp_msi_map, +static const struct irq_domain_ops armada_370_xp_msi_domain_ops = { + .alloc = armada_370_xp_msi_alloc, + .free = armada_370_xp_msi_free, }; static int armada_370_xp_msi_init(struct device_node *node, phys_addr_t main_int_phys_base) { - struct msi_controller *msi_chip; u32 reg; - int ret; msi_doorbell_addr = main_int_phys_base + ARMADA_370_XP_SW_TRIG_INT_OFFS; - msi_chip = kzalloc(sizeof(*msi_chip), GFP_KERNEL); - if (!msi_chip) + armada_370_xp_msi_inner_domain = + irq_domain_add_linear(NULL, PCI_MSI_DOORBELL_NR, + &armada_370_xp_msi_domain_ops, NULL); + if (!armada_370_xp_msi_inner_domain) return -ENOMEM; - msi_chip->setup_irq = armada_370_xp_setup_msi_irq; - msi_chip->teardown_irq = armada_370_xp_teardown_msi_irq; - msi_chip->of_node = node; - armada_370_xp_msi_domain = - irq_domain_add_linear(NULL, PCI_MSI_DOORBELL_NR, - &armada_370_xp_msi_irq_ops, - NULL); + pci_msi_create_irq_domain(of_node_to_fwnode(node), + &armada_370_xp_msi_domain_info, + armada_370_xp_msi_inner_domain); if (!armada_370_xp_msi_domain) { - kfree(msi_chip); + irq_domain_remove(armada_370_xp_msi_inner_domain); return -ENOMEM; } - ret = of_pci_msi_chip_add(msi_chip); - if (ret < 0) { - irq_domain_remove(armada_370_xp_msi_domain); - kfree(msi_chip); - return ret; - } - reg = readl(per_cpu_int_base + ARMADA_370_XP_IN_DRBEL_MSK_OFFS) | PCI_MSI_DOORBELL_MASK; @@ -427,12 +400,12 @@ static void armada_370_xp_handle_msi_irq(struct pt_regs *regs, bool is_chained) continue; if (is_chained) { - irq = irq_find_mapping(armada_370_xp_msi_domain, + irq = irq_find_mapping(armada_370_xp_msi_inner_domain, msinr - 16); generic_handle_irq(irq); } else { irq = msinr - 16; - handle_domain_irq(armada_370_xp_msi_domain, + handle_domain_irq(armada_370_xp_msi_inner_domain, irq, regs); } } @@ -604,8 +577,8 @@ static int __init armada_370_xp_mpic_of_init(struct device_node *node, armada_370_xp_mpic_domain = irq_domain_add_linear(node, nr_irqs, &armada_370_xp_mpic_irq_ops, NULL); - BUG_ON(!armada_370_xp_mpic_domain); + armada_370_xp_mpic_domain->bus_token = DOMAIN_BUS_WIRED; /* Setup for the boot CPU */ armada_xp_mpic_perf_init(); -- cgit v0.10.2 From 0636bab67fa4df68c85b28ba345596a72986fa9e Mon Sep 
17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 10 Feb 2016 15:46:58 +0100 Subject: irqchip/armada-370-xp: Use PCI_MSI_DOORBELL_START where appropriate As suggested by Gregory Clement, this commit adjusts the irq-armada-370-xp driver to use the PCI_MSI_DOORBELL_START define in the armada_370_xp_handle_msi_irq() function, rather than hardcoding its value. Suggested-by: Gregory CLEMENT Signed-off-by: Thomas Petazzoni Acked-by: Gregory CLEMENT Link: https://lkml.kernel.org/r/1455115621-22846-4-git-send-email-thomas.petazzoni@free-electrons.com Signed-off-by: Jason Cooper diff --git a/drivers/irqchip/irq-armada-370-xp.c b/drivers/irqchip/irq-armada-370-xp.c index e5738c5..f53eb71 100644 --- a/drivers/irqchip/irq-armada-370-xp.c +++ b/drivers/irqchip/irq-armada-370-xp.c @@ -401,10 +401,10 @@ static void armada_370_xp_handle_msi_irq(struct pt_regs *regs, bool is_chained) if (is_chained) { irq = irq_find_mapping(armada_370_xp_msi_inner_domain, - msinr - 16); + msinr - PCI_MSI_DOORBELL_START); generic_handle_irq(irq); } else { - irq = msinr - 16; + irq = msinr - PCI_MSI_DOORBELL_START; handle_domain_irq(armada_370_xp_msi_inner_domain, irq, regs); } -- cgit v0.10.2 From f692a172deb5cda8874a44782401901597c526a2 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 10 Feb 2016 15:46:59 +0100 Subject: irqchip/armada-370-xp: Use shorter names for irq_chip In order to make the output of /proc/interrupts, use shorter names for the irq_chip registered by the irq-armada-370-xp driver. Using capital letters also matches better what is done for the GIC driver, which uses just "GIC" as the irq_chip->name. Signed-off-by: Thomas Petazzoni Acked-by: Gregory CLEMENT Link: https://lkml.kernel.org/r/1455115621-22846-5-git-send-email-thomas.petazzoni@free-electrons.com Signed-off-by: Jason Cooper diff --git a/drivers/irqchip/irq-armada-370-xp.c b/drivers/irqchip/irq-armada-370-xp.c index f53eb71..c99ae5f 100644 --- a/drivers/irqchip/irq-armada-370-xp.c +++ b/drivers/irqchip/irq-armada-370-xp.c @@ -117,7 +117,7 @@ static void armada_370_xp_irq_unmask(struct irq_data *d) #ifdef CONFIG_PCI_MSI static struct irq_chip armada_370_xp_msi_irq_chip = { - .name = "armada_370_xp_msi_irq", + .name = "MPIC MSI", .irq_mask = pci_msi_mask_irq, .irq_unmask = pci_msi_unmask_irq, }; @@ -141,7 +141,7 @@ static int armada_370_xp_msi_set_affinity(struct irq_data *irq_data, } static struct irq_chip armada_370_xp_msi_bottom_irq_chip = { - .name = "armada_370_xp_msi_irq", + .name = "MPIC MSI", .irq_compose_msi_msg = armada_370_xp_compose_msi_msg, .irq_set_affinity = armada_370_xp_msi_set_affinity, }; @@ -253,7 +253,7 @@ static int armada_xp_set_affinity(struct irq_data *d, #endif static struct irq_chip armada_370_xp_irq_chip = { - .name = "armada_370_xp_irq", + .name = "MPIC", .irq_mask = armada_370_xp_irq_mask, .irq_mask_ack = armada_370_xp_irq_mask, .irq_unmask = armada_370_xp_irq_unmask, -- cgit v0.10.2 From a71b9412c90ca784ae68d32ec307cc527b5962a9 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 10 Feb 2016 15:47:00 +0100 Subject: irqchip/armada-370-xp: Allow allocation of multiple MSIs Add support for allocating multiple MSIs at the same time, so that the MSI_FLAG_MULTI_PCI_MSI flag can be added to the msi_domain_info structure. 
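The heart of that change is replacing the single-bit allocator with a contiguous-range one. A minimal sketch of the allocation step, mirroring the hunk below (PCI_MSI_DOORBELL_NR and the msi_used bitmap are the driver's own; locking and the per-IRQ irq_domain_set_info() loop are trimmed):

#include <linux/bitmap.h>
#include <linux/errno.h>

/* Sketch: reserve nr_irqs consecutive doorbell slots instead of a single bit. */
static int alloc_doorbell_range(unsigned long *msi_used, unsigned int nr_irqs)
{
	int hwirq;

	hwirq = bitmap_find_next_zero_area(msi_used, PCI_MSI_DOORBELL_NR,
					   0, nr_irqs, 0);
	if (hwirq >= PCI_MSI_DOORBELL_NR)
		return -ENOSPC;

	bitmap_set(msi_used, hwirq, nr_irqs);	/* claim the whole run */
	return hwirq;				/* first hwirq of the block */
}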
Signed-off-by: Thomas Petazzoni Reviewed-by: Gregory CLEMENT Link: https://lkml.kernel.org/r/1455115621-22846-6-git-send-email-thomas.petazzoni@free-electrons.com Signed-off-by: Jason Cooper diff --git a/drivers/irqchip/irq-armada-370-xp.c b/drivers/irqchip/irq-armada-370-xp.c index c99ae5f..e7dc6cb 100644 --- a/drivers/irqchip/irq-armada-370-xp.c +++ b/drivers/irqchip/irq-armada-370-xp.c @@ -123,7 +123,8 @@ static struct irq_chip armada_370_xp_msi_irq_chip = { }; static struct msi_domain_info armada_370_xp_msi_domain_info = { - .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS), + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI), .chip = &armada_370_xp_msi_irq_chip, }; @@ -149,21 +150,26 @@ static struct irq_chip armada_370_xp_msi_bottom_irq_chip = { static int armada_370_xp_msi_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *args) { - int hwirq; + int hwirq, i; mutex_lock(&msi_used_lock); - hwirq = find_first_zero_bit(&msi_used, PCI_MSI_DOORBELL_NR); + + hwirq = bitmap_find_next_zero_area(msi_used, PCI_MSI_DOORBELL_NR, + 0, nr_irqs, 0); if (hwirq >= PCI_MSI_DOORBELL_NR) { mutex_unlock(&msi_used_lock); return -ENOSPC; } - set_bit(hwirq, msi_used); + bitmap_set(msi_used, hwirq, nr_irqs); mutex_unlock(&msi_used_lock); - irq_domain_set_info(domain, virq, hwirq, &armada_370_xp_msi_bottom_irq_chip, - domain->host_data, handle_simple_irq, - NULL, NULL); + for (i = 0; i < nr_irqs; i++) { + irq_domain_set_info(domain, virq + i, hwirq + i, + &armada_370_xp_msi_bottom_irq_chip, + domain->host_data, handle_simple_irq, + NULL, NULL); + } return hwirq; } @@ -174,10 +180,7 @@ static void armada_370_xp_msi_free(struct irq_domain *domain, struct irq_data *d = irq_domain_get_irq_data(domain, virq); mutex_lock(&msi_used_lock); - if (!test_bit(d->hwirq, msi_used)) - pr_err("trying to free unused MSI#%lu\n", d->hwirq); - else - clear_bit(d->hwirq, msi_used); + bitmap_clear(msi_used, d->hwirq, nr_irqs); mutex_unlock(&msi_used_lock); } -- cgit v0.10.2 From cb49d86dbe7a3277fb329c30206d4d95470c879c Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 10 Feb 2016 15:47:01 +0100 Subject: ARM: mvebu: Use the ARMADA_370_XP_IRQ option Now that there is a ARMADA_370_XP_IRQ option to enable the irqchip driver for Armada 370, XP, 375, 38x and 39x, let's select this option when needed. Note that this selection is currently not mandatory because ARMADA_370_XP_IRQ is for now always enabled when ARCH_MVEBU=y, but this is something that we will change in the future, and therefore we should make the relevant platforms select ARMADA_370_XP_IRQ when needed. Due to this, selecting GENERIC_IRQ_CHIP is no longer needed. 
Signed-off-by: Thomas Petazzoni Acked-by: Gregory CLEMENT Link: https://lkml.kernel.org/r/1455115621-22846-7-git-send-email-thomas.petazzoni@free-electrons.com Signed-off-by: Jason Cooper diff --git a/arch/arm/mach-mvebu/Kconfig b/arch/arm/mach-mvebu/Kconfig index 64e3d2c..b003e3a 100644 --- a/arch/arm/mach-mvebu/Kconfig +++ b/arch/arm/mach-mvebu/Kconfig @@ -3,7 +3,6 @@ menuconfig ARCH_MVEBU depends on ARCH_MULTI_V7 || ARCH_MULTI_V5 select ARCH_SUPPORTS_BIG_ENDIAN select CLKSRC_MMIO - select GENERIC_IRQ_CHIP select PINCTRL select PLAT_ORION select SOC_BUS @@ -29,6 +28,7 @@ config MACH_ARMADA_370 bool "Marvell Armada 370 boards" depends on ARCH_MULTI_V7 select ARMADA_370_CLK + select ARMADA_370_XP_IRQ select CPU_PJ4B select MACH_MVEBU_V7 select PINCTRL_ARMADA_370 @@ -39,6 +39,7 @@ config MACH_ARMADA_375 bool "Marvell Armada 375 boards" depends on ARCH_MULTI_V7 + select ARMADA_370_XP_IRQ select ARM_ERRATA_720789 select ARM_ERRATA_753970 select ARM_GIC @@ -58,6 +59,7 @@ config MACH_ARMADA_38X select ARM_ERRATA_720789 select ARM_ERRATA_753970 select ARM_GIC + select ARMADA_370_XP_IRQ select ARMADA_38X_CLK select HAVE_ARM_SCU select HAVE_ARM_TWD if SMP @@ -72,6 +74,7 @@ config MACH_ARMADA_39X bool "Marvell Armada 39x boards" depends on ARCH_MULTI_V7 select ARM_GIC + select ARMADA_370_XP_IRQ select ARMADA_39X_CLK select CACHE_L2X0 select HAVE_ARM_SCU @@ -86,6 +89,7 @@ config MACH_ARMADA_XP bool "Marvell Armada XP boards" depends on ARCH_MULTI_V7 + select ARMADA_370_XP_IRQ select ARMADA_XP_CLK select CPU_PJ4B select MACH_MVEBU_V7 -- cgit v0.10.2 From 63131b636a0ec969e8b98122c1a928b5a2649d3b Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Mon, 8 Feb 2016 18:14:10 +0100 Subject: irqchip/armada-370-xp: Do not enable it by default when ARCH_MVEBU is selected The irq-armada-370-xp driver can only be built for ARM 32 bits. The mvebu family has grown with a new ARM64 SoC which will also select the ARCH_MVEBU configuration. Since "ARM: mvebu: use the ARMADA_370_XP_IRQ option", the ARM32 mvebu SoCs directly select this new option. Selecting it by default when ARCH_MVEBU is selected is no longer needed. This patch removes this dependency; thanks to this, a kernel for an ARM64 mvebu SoC can be built without errors due to this driver. Signed-off-by: Gregory CLEMENT Link: https://lkml.kernel.org/r/1454951660-13289-3-git-send-email-gregory.clement@free-electrons.com Signed-off-by: Jason Cooper diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index ba6a084..8115a32 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -62,7 +62,6 @@ config ARM_VIC_NR config ARMADA_370_XP_IRQ bool - default y if ARCH_MVEBU select GENERIC_IRQ_CHIP select PCI_MSI_IRQ_DOMAIN if PCI_MSI -- cgit v0.10.2 From d646ae0a73deb0d80792a6a9c0757317ad8049c5 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 16 Feb 2016 07:37:41 +0100 Subject: perf jvmti: Add check for java alternatives cmd in Makefile This patch modifies the jvmti Makefile to check if the /usr/sbin/update-java-alternatives utility is present. If so, then use it; if not, then fall back to the alternatives command. This helps handle the difference between Ubuntu and Fedora Linux distributions.
Signed-off-by: Stephane Eranian Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1455604661-9357-1-git-send-email-eranian@google.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/jvmti/Makefile b/tools/perf/jvmti/Makefile index 5968f83..0277a64 100644 --- a/tools/perf/jvmti/Makefile +++ b/tools/perf/jvmti/Makefile @@ -35,8 +35,12 @@ SOLIBEXT=so # The following works at least on fedora 23, you may need the next # line for other distros. +ifeq (,$(wildcard /usr/sbin/update-java-alternatives)) JDIR=$(shell alternatives --display java | tail -1 | cut -d' ' -f 5 | sed 's%/jre/bin/java.%%g') -#JDIR=$(shell /usr/sbin/update-java-alternatives -l | head -1 | cut -d ' ' -f 3) +else +JDIR=$(shell /usr/sbin/update-java-alternatives -l | head -1 | cut -d ' ' -f 3) +endif + # -lrt required in 32-bit mode for clock_gettime() LIBS=-lelf -lrt INCDIR=-I $(JDIR)/include -I $(JDIR)/include/linux -- cgit v0.10.2 From 975f14fa8f2996604f248552eee4403def34bf5e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 14 Feb 2016 17:03:42 +0100 Subject: tools lib api: Add debug output support Adding support for warning/info/debug output within libapi code. Adding following macros: pr_warning(fmt, ...) pr_info(fmt, ...) pr_debug(fmt, ...) Also adding libapi_set_print function to set above functions. This will be used in perf to set standard debug handlers for libapi. Adding 2 header files: debug.h - to be used outside libapi, contains libapi_set_print interface debug-internal.h - to be used within libapi, contains pr_warning/pr_info/pr_debug definitions Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1455465826-8426-2-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/api/Build b/tools/lib/api/Build index e8b8a23..954c644 100644 --- a/tools/lib/api/Build +++ b/tools/lib/api/Build @@ -1,3 +1,4 @@ libapi-y += fd/ libapi-y += fs/ libapi-y += cpu.o +libapi-y += debug.o diff --git a/tools/lib/api/Makefile b/tools/lib/api/Makefile index d85904d..bbc82c6 100644 --- a/tools/lib/api/Makefile +++ b/tools/lib/api/Makefile @@ -18,6 +18,7 @@ LIBFILE = $(OUTPUT)libapi.a CFLAGS := $(EXTRA_WARNINGS) $(EXTRA_CFLAGS) CFLAGS += -ggdb3 -Wall -Wextra -std=gnu99 -Werror -O6 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -fPIC CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 +CFLAGS += -I$(srctree)/tools/lib/api RM = rm -f diff --git a/tools/lib/api/debug-internal.h b/tools/lib/api/debug-internal.h new file mode 100644 index 0000000..188f788 --- /dev/null +++ b/tools/lib/api/debug-internal.h @@ -0,0 +1,20 @@ +#ifndef __API_DEBUG_INTERNAL_H__ +#define __API_DEBUG_INTERNAL_H__ + +#include "debug.h" + +#define __pr(func, fmt, ...) \ +do { \ + if ((func)) \ + (func)("libapi: " fmt, ##__VA_ARGS__); \ +} while (0) + +extern libapi_print_fn_t __pr_warning; +extern libapi_print_fn_t __pr_info; +extern libapi_print_fn_t __pr_debug; + +#define pr_warning(fmt, ...) __pr(__pr_warning, fmt, ##__VA_ARGS__) +#define pr_info(fmt, ...) __pr(__pr_info, fmt, ##__VA_ARGS__) +#define pr_debug(fmt, ...) 
__pr(__pr_debug, fmt, ##__VA_ARGS__) + +#endif /* __API_DEBUG_INTERNAL_H__ */ diff --git a/tools/lib/api/debug.c b/tools/lib/api/debug.c new file mode 100644 index 0000000..5fa5cf5 --- /dev/null +++ b/tools/lib/api/debug.c @@ -0,0 +1,28 @@ +#include +#include +#include "debug.h" +#include "debug-internal.h" + +static int __base_pr(const char *format, ...) +{ + va_list args; + int err; + + va_start(args, format); + err = vfprintf(stderr, format, args); + va_end(args); + return err; +} + +libapi_print_fn_t __pr_warning = __base_pr; +libapi_print_fn_t __pr_info = __base_pr; +libapi_print_fn_t __pr_debug; + +void libapi_set_print(libapi_print_fn_t warn, + libapi_print_fn_t info, + libapi_print_fn_t debug) +{ + __pr_warning = warn; + __pr_info = info; + __pr_debug = debug; +} diff --git a/tools/lib/api/debug.h b/tools/lib/api/debug.h new file mode 100644 index 0000000..a0872f6 --- /dev/null +++ b/tools/lib/api/debug.h @@ -0,0 +1,10 @@ +#ifndef __API_DEBUG_H__ +#define __API_DEBUG_H__ + +typedef int (*libapi_print_fn_t)(const char *, ...); + +void libapi_set_print(libapi_print_fn_t warn, + libapi_print_fn_t info, + libapi_print_fn_t debug); + +#endif /* __API_DEBUG_H__ */ -- cgit v0.10.2 From 607bfbd7ffc60156ae0831c917497dc91a57dd8d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 14 Feb 2016 17:03:43 +0100 Subject: tools lib api fs: Adopt filename__read_str from perf We already moved similar functions in here, also it'll be useful for sysfs__read_str addition in following patch. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1455465826-8426-3-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c index 459599d..2cbf677 100644 --- a/tools/lib/api/fs/fs.c +++ b/tools/lib/api/fs/fs.c @@ -13,6 +13,7 @@ #include #include "fs.h" +#include "debug-internal.h" #define _STR(x) #x #define STR(x) _STR(x) @@ -300,6 +301,56 @@ int filename__read_ull(const char *filename, unsigned long long *value) return err; } +#define STRERR_BUFSIZE 128 /* For the buffer size of strerror_r */ + +int filename__read_str(const char *filename, char **buf, size_t *sizep) +{ + size_t size = 0, alloc_size = 0; + void *bf = NULL, *nbf; + int fd, n, err = 0; + char sbuf[STRERR_BUFSIZE]; + + fd = open(filename, O_RDONLY); + if (fd < 0) + return -errno; + + do { + if (size == alloc_size) { + alloc_size += BUFSIZ; + nbf = realloc(bf, alloc_size); + if (!nbf) { + err = -ENOMEM; + break; + } + + bf = nbf; + } + + n = read(fd, bf + size, alloc_size - size); + if (n < 0) { + if (size) { + pr_warning("read failed %d: %s\n", errno, + strerror_r(errno, sbuf, sizeof(sbuf))); + err = 0; + } else + err = -errno; + + break; + } + + size += n; + } while (n > 0); + + if (!err) { + *sizep = size; + *buf = bf; + } else + free(bf); + + close(fd); + return err; +} + int sysfs__read_ull(const char *entry, unsigned long long *value) { char path[PATH_MAX]; diff --git a/tools/lib/api/fs/fs.h b/tools/lib/api/fs/fs.h index d024a7f..858922b 100644 --- a/tools/lib/api/fs/fs.h +++ b/tools/lib/api/fs/fs.h @@ -2,6 +2,7 @@ #define __API_FS__ #include +#include /* * On most systems would have given us this, but not on some systems @@ -26,6 +27,7 @@ FS(tracefs) int filename__read_int(const char *filename, int *value); int filename__read_ull(const char *filename, unsigned long long *value); +int filename__read_str(const char *filename, char **buf, size_t *sizep); int sysctl__read_int(const char *sysctl, int *value); int 
sysfs__read_int(const char *entry, int *value); diff --git a/tools/perf/util/trace-event.c b/tools/perf/util/trace-event.c index 802bb86..8ae051e 100644 --- a/tools/perf/util/trace-event.c +++ b/tools/perf/util/trace-event.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "trace-event.h" #include "machine.h" #include "util.h" diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index b9e2843..35b20dd 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -507,54 +507,6 @@ int parse_callchain_record(const char *arg, struct callchain_param *param) return ret; } -int filename__read_str(const char *filename, char **buf, size_t *sizep) -{ - size_t size = 0, alloc_size = 0; - void *bf = NULL, *nbf; - int fd, n, err = 0; - char sbuf[STRERR_BUFSIZE]; - - fd = open(filename, O_RDONLY); - if (fd < 0) - return -errno; - - do { - if (size == alloc_size) { - alloc_size += BUFSIZ; - nbf = realloc(bf, alloc_size); - if (!nbf) { - err = -ENOMEM; - break; - } - - bf = nbf; - } - - n = read(fd, bf + size, alloc_size - size); - if (n < 0) { - if (size) { - pr_warning("read failed %d: %s\n", errno, - strerror_r(errno, sbuf, sizeof(sbuf))); - err = 0; - } else - err = -errno; - - break; - } - - size += n; - } while (n > 0); - - if (!err) { - *sizep = size; - *buf = bf; - } else - free(bf); - - close(fd); - return err; -} - const char *get_filename_for_perf_kvm(void) { const char *filename; diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index a861581..3dd0408 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -303,7 +303,6 @@ char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym, bool show_sym, bool unwind_inlines); void free_srcline(char *srcline); -int filename__read_str(const char *filename, char **buf, size_t *sizep); int perf_event_paranoid(void); void mem_bswap_64(void *src, int byte_size); -- cgit v0.10.2 From 51c0396c600f49de9c3ff8f6fd83055de3709c3d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 14 Feb 2016 17:03:44 +0100 Subject: tools lib api fs: Add sysfs__read_str function Adding sysfs__read_str function to ease up reading string files from sysfs. 
New interface is: int sysfs__read_str(const char *entry, char **buf, size_t *sizep); Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1455465826-8426-4-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c index 2cbf677..ef78c22 100644 --- a/tools/lib/api/fs/fs.c +++ b/tools/lib/api/fs/fs.c @@ -377,6 +377,19 @@ int sysfs__read_int(const char *entry, int *value) return filename__read_int(path, value); } +int sysfs__read_str(const char *entry, char **buf, size_t *sizep) +{ + char path[PATH_MAX]; + const char *sysfs = sysfs__mountpoint(); + + if (!sysfs) + return -1; + + snprintf(path, sizeof(path), "%s/%s", sysfs, entry); + + return filename__read_str(path, buf, sizep); +} + int sysctl__read_int(const char *sysctl, int *value) { char path[PATH_MAX]; diff --git a/tools/lib/api/fs/fs.h b/tools/lib/api/fs/fs.h index 858922b..9f65980 100644 --- a/tools/lib/api/fs/fs.h +++ b/tools/lib/api/fs/fs.h @@ -32,4 +32,5 @@ int filename__read_str(const char *filename, char **buf, size_t *sizep); int sysctl__read_int(const char *sysctl, int *value); int sysfs__read_int(const char *entry, int *value); int sysfs__read_ull(const char *entry, unsigned long long *value); +int sysfs__read_str(const char *entry, char **buf, size_t *sizep); #endif /* __API_FS__ */ -- cgit v0.10.2 From bedbdd4297224efcd7d668198e32fab14b76b98b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 16 Feb 2016 11:48:38 -0300 Subject: perf debug: Rename __eprintf(va_list args) to veprintf Adhering to the naming convention used when va_args is in a printf like function, e.g. stdio.h. Cc: Jiri Olsa Cc: Adrian Hunter Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-b5l3wt77ct28dcnriguxtvn6@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c index 86d9c73..d6c8d2b 100644 --- a/tools/perf/util/debug.c +++ b/tools/perf/util/debug.c @@ -22,7 +22,7 @@ int debug_ordered_events; static int redirect_to_stderr; int debug_data_convert; -static int _eprintf(int level, int var, const char *fmt, va_list args) +int veprintf(int level, int var, const char *fmt, va_list args) { int ret = 0; @@ -36,24 +36,19 @@ static int _eprintf(int level, int var, const char *fmt, va_list args) return ret; } -int veprintf(int level, int var, const char *fmt, va_list args) -{ - return _eprintf(level, var, fmt, args); -} - int eprintf(int level, int var, const char *fmt, ...) { va_list args; int ret; va_start(args, fmt); - ret = _eprintf(level, var, fmt, args); + ret = veprintf(level, var, fmt, args); va_end(args); return ret; } -static int __eprintf_time(u64 t, const char *fmt, va_list args) +static int veprintf_time(u64 t, const char *fmt, va_list args) { int ret = 0; u64 secs, usecs, nsecs = t; @@ -75,7 +70,7 @@ int eprintf_time(int level, int var, u64 t, const char *fmt, ...) if (var >= level) { va_start(args, fmt); - ret = __eprintf_time(t, fmt, args); + ret = veprintf_time(t, fmt, args); va_end(args); } @@ -91,7 +86,7 @@ void pr_stat(const char *fmt, ...) 
va_list args; va_start(args, fmt); - _eprintf(1, verbose, fmt, args); + veprintf(1, verbose, fmt, args); va_end(args); eprintf(1, verbose, "\n"); } -- cgit v0.10.2 From dd629cc0975349c99d5483402bca1ef16313c209 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 14 Feb 2016 17:03:45 +0100 Subject: perf tools: Initialize libapi debug output Setting libapi debug output functions to use perf functions. Signed-off-by: Jiri Olsa Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1455465826-8426-5-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/perf.c b/tools/perf/perf.c index a929618..144047c 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -613,6 +613,8 @@ int main(int argc, const char **argv) */ pthread__block_sigwinch(); + perf_debug_setup(); + while (1) { static int done_help; int was_alias = run_argv(&argc, &argv); diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c index d6c8d2b..ff7e86a 100644 --- a/tools/perf/util/debug.c +++ b/tools/perf/util/debug.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "cache.h" #include "color.h" @@ -187,3 +188,23 @@ int perf_debug_option(const char *str) free(s); return 0; } + +#define DEBUG_WRAPPER(__n, __l) \ +static int pr_ ## __n ## _wrapper(const char *fmt, ...) \ +{ \ + va_list args; \ + int ret; \ + \ + va_start(args, fmt); \ + ret = veprintf(__l, verbose, fmt, args); \ + va_end(args); \ + return ret; \ +} + +DEBUG_WRAPPER(warning, 0); +DEBUG_WRAPPER(debug, 1); + +void perf_debug_setup(void) +{ + libapi_set_print(pr_warning_wrapper, pr_warning_wrapper, pr_debug_wrapper); +} diff --git a/tools/perf/util/debug.h b/tools/perf/util/debug.h index 8b9a088..14bafda 100644 --- a/tools/perf/util/debug.h +++ b/tools/perf/util/debug.h @@ -53,5 +53,6 @@ int eprintf_time(int level, int var, u64 t, const char *fmt, ...) __attribute__( int veprintf(int level, int var, const char *fmt, va_list args); int perf_debug_option(const char *str); +void perf_debug_setup(void); #endif /* __PERF_DEBUG_H */ -- cgit v0.10.2 From 720e98b5faf10cfd12b7821dbdcc41c9747bd13e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 16 Feb 2016 16:01:43 +0100 Subject: perf tools: Add perf data cache feature Storing CPU cache details under perf data. It's stored as new HEADER_CACHE feature and it's displayed under header info with -I option: $ perf report --header-only -I ... # CPU cache info: # L1 Data 32K [0-1] # L1 Instruction 32K [0-1] # L1 Data 32K [2-3] # L1 Instruction 32K [2-3] # L2 Unified 256K [0-1] # L2 Unified 256K [2-3] # L3 Unified 4096K [0-3] ... All distinct caches are stored/displayed. 
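On the gathering side, here is a minimal hedged sketch of how a single cache attribute is read, condensing the cpu_cache_level__read() helper added in the hunk below (the read_cache_type() name and the simplified error handling are invented for illustration):

#include <limits.h>
#include <stdio.h>
#include <api/fs/fs.h>	/* sysfs__read_str(), added earlier in this series */

/* Sketch: fetch the human-readable type of cache index `idx` on `cpu`. */
static int read_cache_type(unsigned int cpu, unsigned int idx, char **type)
{
	char path[PATH_MAX];
	size_t len;

	snprintf(path, sizeof(path),
		 "devices/system/cpu/cpu%u/cache/index%u/type", cpu, idx);
	if (sysfs__read_str(path, type, &len))
		return -1;

	(*type)[len] = 0;	/* the real helper also rtrim()s the trailing newline */
	return 0;
}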
Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20160216150143.GA7119@krava.brq.redhat.com [ Fixed leak on process_caches(), s/cache_level/cpu_cache_level/g ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index 7dd5939..49a11d9 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -6,6 +6,8 @@ struct perf_env perf_env; void perf_env__exit(struct perf_env *env) { + int i; + zfree(&env->hostname); zfree(&env->os_release); zfree(&env->version); @@ -19,6 +21,10 @@ void perf_env__exit(struct perf_env *env) zfree(&env->numa_nodes); zfree(&env->pmu_mappings); zfree(&env->cpu); + + for (i = 0; i < env->caches_cnt; i++) + cpu_cache_level__free(&env->caches[i]); + zfree(&env->caches); } int perf_env__set_cmdline(struct perf_env *env, int argc, const char *argv[]) @@ -75,3 +81,10 @@ int perf_env__read_cpu_topology_map(struct perf_env *env) env->nr_cpus_avail = nr_cpus; return 0; } + +void cpu_cache_level__free(struct cpu_cache_level *cache) +{ + free(cache->type); + free(cache->map); + free(cache->size); +} diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index 0132b95..56cffb6 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -1,11 +1,23 @@ #ifndef __PERF_ENV_H #define __PERF_ENV_H +#include + struct cpu_topology_map { int socket_id; int core_id; }; +struct cpu_cache_level { + u32 level; + u32 line_size; + u32 sets; + u32 ways; + char *type; + char *size; + char *map; +}; + struct perf_env { char *hostname; char *os_release; @@ -31,6 +43,8 @@ struct perf_env { char *numa_nodes; char *pmu_mappings; struct cpu_topology_map *cpu; + struct cpu_cache_level *caches; + int caches_cnt; }; extern struct perf_env perf_env; @@ -41,4 +55,5 @@ int perf_env__set_cmdline(struct perf_env *env, int argc, const char *argv[]); int perf_env__read_cpu_topology_map(struct perf_env *env); +void cpu_cache_level__free(struct cpu_cache_level *cache); #endif /* __PERF_ENV_H */ diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index f50b723..73e38e4 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -23,6 +23,8 @@ #include "strbuf.h" #include "build-id.h" #include "data.h" +#include +#include "asm/bug.h" /* * magic2 = "PERFILE2" @@ -868,6 +870,199 @@ static int write_auxtrace(int fd, struct perf_header *h, return err; } +static int cpu_cache_level__sort(const void *a, const void *b) +{ + struct cpu_cache_level *cache_a = (struct cpu_cache_level *)a; + struct cpu_cache_level *cache_b = (struct cpu_cache_level *)b; + + return cache_a->level - cache_b->level; +} + +static bool cpu_cache_level__cmp(struct cpu_cache_level *a, struct cpu_cache_level *b) +{ + if (a->level != b->level) + return false; + + if (a->line_size != b->line_size) + return false; + + if (a->sets != b->sets) + return false; + + if (a->ways != b->ways) + return false; + + if (strcmp(a->type, b->type)) + return false; + + if (strcmp(a->size, b->size)) + return false; + + if (strcmp(a->map, b->map)) + return false; + + return true; +} + +static int cpu_cache_level__read(struct cpu_cache_level *cache, u32 cpu, u16 level) +{ + char path[PATH_MAX], file[PATH_MAX]; + struct stat st; + size_t len; + + scnprintf(path, PATH_MAX, "devices/system/cpu/cpu%d/cache/index%d/", cpu, level); + scnprintf(file, PATH_MAX, "%s/%s", sysfs__mountpoint(), path); + + if (stat(file, &st)) + return 1; + + scnprintf(file, PATH_MAX, "%s/level", path); + if 
(sysfs__read_int(file, (int *) &cache->level)) + return -1; + + scnprintf(file, PATH_MAX, "%s/coherency_line_size", path); + if (sysfs__read_int(file, (int *) &cache->line_size)) + return -1; + + scnprintf(file, PATH_MAX, "%s/number_of_sets", path); + if (sysfs__read_int(file, (int *) &cache->sets)) + return -1; + + scnprintf(file, PATH_MAX, "%s/ways_of_associativity", path); + if (sysfs__read_int(file, (int *) &cache->ways)) + return -1; + + scnprintf(file, PATH_MAX, "%s/type", path); + if (sysfs__read_str(file, &cache->type, &len)) + return -1; + + cache->type[len] = 0; + cache->type = rtrim(cache->type); + + scnprintf(file, PATH_MAX, "%s/size", path); + if (sysfs__read_str(file, &cache->size, &len)) { + free(cache->type); + return -1; + } + + cache->size[len] = 0; + cache->size = rtrim(cache->size); + + scnprintf(file, PATH_MAX, "%s/shared_cpu_list", path); + if (sysfs__read_str(file, &cache->map, &len)) { + free(cache->map); + free(cache->type); + return -1; + } + + cache->map[len] = 0; + cache->map = rtrim(cache->map); + return 0; +} + +static void cpu_cache_level__fprintf(FILE *out, struct cpu_cache_level *c) +{ + fprintf(out, "L%d %-15s %8s [%s]\n", c->level, c->type, c->size, c->map); +} + +static int build_caches(struct cpu_cache_level caches[], u32 size, u32 *cntp) +{ + u32 i, cnt = 0; + long ncpus; + u32 nr, cpu; + u16 level; + + ncpus = sysconf(_SC_NPROCESSORS_CONF); + if (ncpus < 0) + return -1; + + nr = (u32)(ncpus & UINT_MAX); + + for (cpu = 0; cpu < nr; cpu++) { + for (level = 0; level < 10; level++) { + struct cpu_cache_level c; + int err; + + err = cpu_cache_level__read(&c, cpu, level); + if (err < 0) + return err; + + if (err == 1) + break; + + for (i = 0; i < cnt; i++) { + if (cpu_cache_level__cmp(&c, &caches[i])) + break; + } + + if (i == cnt) + caches[cnt++] = c; + else + cpu_cache_level__free(&c); + + if (WARN_ONCE(cnt == size, "way too many cpu caches..")) + goto out; + } + } + out: + *cntp = cnt; + return 0; +} + +#define MAX_CACHES 2000 + +static int write_cache(int fd, struct perf_header *h __maybe_unused, + struct perf_evlist *evlist __maybe_unused) +{ + struct cpu_cache_level caches[MAX_CACHES]; + u32 cnt = 0, i, version = 1; + int ret; + + ret = build_caches(caches, MAX_CACHES, &cnt); + if (ret) + goto out; + + qsort(&caches, cnt, sizeof(struct cpu_cache_level), cpu_cache_level__sort); + + ret = do_write(fd, &version, sizeof(u32)); + if (ret < 0) + goto out; + + ret = do_write(fd, &cnt, sizeof(u32)); + if (ret < 0) + goto out; + + for (i = 0; i < cnt; i++) { + struct cpu_cache_level *c = &caches[i]; + + #define _W(v) \ + ret = do_write(fd, &c->v, sizeof(u32)); \ + if (ret < 0) \ + goto out; + + _W(level) + _W(line_size) + _W(sets) + _W(ways) + #undef _W + + #define _W(v) \ + ret = do_write_string(fd, (const char *) c->v); \ + if (ret < 0) \ + goto out; + + _W(type) + _W(size) + _W(map) + #undef _W + } + +out: + for (i = 0; i < cnt; i++) + cpu_cache_level__free(&caches[i]); + return ret; +} + static int write_stat(int fd __maybe_unused, struct perf_header *h __maybe_unused, struct perf_evlist *evlist __maybe_unused) @@ -1172,6 +1367,18 @@ static void print_stat(struct perf_header *ph __maybe_unused, fprintf(fp, "# contains stat data\n"); } +static void print_cache(struct perf_header *ph __maybe_unused, + int fd __maybe_unused, FILE *fp __maybe_unused) +{ + int i; + + fprintf(fp, "# CPU cache info:\n"); + for (i = 0; i < ph->env.caches_cnt; i++) { + fprintf(fp, "# "); + cpu_cache_level__fprintf(fp, &ph->env.caches[i]); + } +} + static void 
print_pmu_mappings(struct perf_header *ph, int fd __maybe_unused, FILE *fp) { @@ -1920,6 +2127,68 @@ static int process_auxtrace(struct perf_file_section *section, return err; } +static int process_cache(struct perf_file_section *section __maybe_unused, + struct perf_header *ph __maybe_unused, int fd __maybe_unused, + void *data __maybe_unused) +{ + struct cpu_cache_level *caches; + u32 cnt, i, version; + + if (readn(fd, &version, sizeof(version)) != sizeof(version)) + return -1; + + if (ph->needs_swap) + version = bswap_32(version); + + if (version != 1) + return -1; + + if (readn(fd, &cnt, sizeof(cnt)) != sizeof(cnt)) + return -1; + + if (ph->needs_swap) + cnt = bswap_32(cnt); + + caches = zalloc(sizeof(*caches) * cnt); + if (!caches) + return -1; + + for (i = 0; i < cnt; i++) { + struct cpu_cache_level c; + + #define _R(v) \ + if (readn(fd, &c.v, sizeof(u32)) != sizeof(u32))\ + goto out_free_caches; \ + if (ph->needs_swap) \ + c.v = bswap_32(c.v); \ + + _R(level) + _R(line_size) + _R(sets) + _R(ways) + #undef _R + + #define _R(v) \ + c.v = do_read_string(fd, ph); \ + if (!c.v) \ + goto out_free_caches; + + _R(type) + _R(size) + _R(map) + #undef _R + + caches[i] = c; + } + + ph->env.caches = caches; + ph->env.caches_cnt = cnt; + return 0; +out_free_caches: + free(caches); + return -1; +} + struct feature_ops { int (*write)(int fd, struct perf_header *h, struct perf_evlist *evlist); void (*print)(struct perf_header *h, int fd, FILE *fp); @@ -1962,6 +2231,7 @@ static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = { FEAT_OPP(HEADER_GROUP_DESC, group_desc), FEAT_OPP(HEADER_AUXTRACE, auxtrace), FEAT_OPA(HEADER_STAT, stat), + FEAT_OPF(HEADER_CACHE, cache), }; struct header_print_data { diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index cff9892..3d87ca8 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -32,6 +32,7 @@ enum { HEADER_GROUP_DESC, HEADER_AUXTRACE, HEADER_STAT, + HEADER_CACHE, HEADER_LAST_FEATURE, HEADER_FEAT_BITS = 256, }; -- cgit v0.10.2 From 140aeadc1fb51d38130fd06a272a381f22e3070c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 30 Jan 2016 09:06:49 -0800 Subject: perf stat: Abstract stat metrics printing Abstract the printing of shadow metrics. Instead of every metric calling fprintf directly and taking care of indentation, use two call backs: one to print metrics and another to start a new line. This will allow adding metrics to CSV mode and also using them for other purposes. The computation of padding is now done in the central callback, instead of every metric doing it manually. This makes it easier to add new metrics. v2: Refactor functions, printout now does more. Move shadow printing. Improve fallback callbacks. Don't use void * callback data. v3: Remove unnecessary hunk. Add typedef for new_line v4: Remove unnecessary hunk. Don't print metrics for CSV/interval mode yet. Move printout change to separate patch. v5: Fix bisect bugs. Avoid bogus frontend cycles printing. Fix indentation in different aggregation modes. 
v6: Delay newline handling Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1454173616-17710-2-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 038e877..fabcadb 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -735,6 +735,58 @@ static void aggr_printout(struct perf_evsel *evsel, int id, int nr) } } +struct outstate { + FILE *fh; + bool newline; +}; + +#define METRIC_LEN 35 + +static void new_line_std(void *ctx) +{ + struct outstate *os = ctx; + + os->newline = true; +} + +static void do_new_line_std(struct outstate *os) +{ + fputc('\n', os->fh); + if (stat_config.aggr_mode == AGGR_NONE) + fprintf(os->fh, " "); + if (stat_config.aggr_mode == AGGR_CORE) + fprintf(os->fh, " "); + if (stat_config.aggr_mode == AGGR_SOCKET) + fprintf(os->fh, " "); + fprintf(os->fh, " "); +} + +static void print_metric_std(void *ctx, const char *color, const char *fmt, + const char *unit, double val) +{ + struct outstate *os = ctx; + FILE *out = os->fh; + int n; + bool newline = os->newline; + + os->newline = false; + + if (unit == NULL || fmt == NULL) { + fprintf(out, "%-*s", METRIC_LEN, ""); + return; + } + + if (newline) + do_new_line_std(os); + + n = fprintf(out, " # "); + if (color) + n += color_fprintf(out, color, fmt, val); + else + n += fprintf(out, fmt, val); + fprintf(out, " %-*s", METRIC_LEN - n - 1, unit); +} + static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg) { FILE *output = stat_config.output; @@ -795,20 +847,27 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg) static void printout(int id, int nr, struct perf_evsel *counter, double uval) { - int cpu = cpu_map__id_to_cpu(id); + struct outstate os = { .fh = stat_config.output }; + struct perf_stat_output_ctx out; + print_metric_t pm = print_metric_std; + void (*nl)(void *); - if (stat_config.aggr_mode == AGGR_GLOBAL) - cpu = 0; + nl = new_line_std; if (nsec_counter(counter)) nsec_printout(id, nr, counter, uval); else abs_printout(id, nr, counter, uval); + out.print_metric = pm; + out.new_line = nl; + out.ctx = &os; + if (!csv_output && !stat_config.interval) - perf_stat__print_shadow_stats(stat_config.output, counter, - uval, cpu, - stat_config.aggr_mode); + perf_stat__print_shadow_stats(counter, uval, + stat_config.aggr_mode == AGGR_GLOBAL ? 
0 : + cpu_map__id_to_cpu(id), + &out); } static void print_aggr(char *prefix) diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 6ac0314..4d8f185 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -137,9 +137,10 @@ static const char *get_ratio_color(enum grc_type type, double ratio) return color; } -static void print_stalled_cycles_frontend(FILE *out, int cpu, +static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel - __maybe_unused, double avg) + __maybe_unused, double avg, + struct perf_stat_output_ctx *out) { double total, ratio = 0.0; const char *color; @@ -152,14 +153,17 @@ static void print_stalled_cycles_frontend(FILE *out, int cpu, color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio); - fprintf(out, " # "); - color_fprintf(out, color, "%6.2f%%", ratio); - fprintf(out, " frontend cycles idle "); + if (ratio) + out->print_metric(out->ctx, color, "%7.2f%%", "frontend cycles idle", + ratio); + else + out->print_metric(out->ctx, NULL, NULL, "frontend cycles idle", 0); } -static void print_stalled_cycles_backend(FILE *out, int cpu, +static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel - __maybe_unused, double avg) + __maybe_unused, double avg, + struct perf_stat_output_ctx *out) { double total, ratio = 0.0; const char *color; @@ -172,14 +176,13 @@ static void print_stalled_cycles_backend(FILE *out, int cpu, color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio); - fprintf(out, " # "); - color_fprintf(out, color, "%6.2f%%", ratio); - fprintf(out, " backend cycles idle "); + out->print_metric(out->ctx, color, "%6.2f%%", "backend cycles idle", ratio); } -static void print_branch_misses(FILE *out, int cpu, +static void print_branch_misses(int cpu, struct perf_evsel *evsel __maybe_unused, - double avg) + double avg, + struct perf_stat_output_ctx *out) { double total, ratio = 0.0; const char *color; @@ -192,14 +195,13 @@ static void print_branch_misses(FILE *out, int cpu, color = get_ratio_color(GRC_CACHE_MISSES, ratio); - fprintf(out, " # "); - color_fprintf(out, color, "%6.2f%%", ratio); - fprintf(out, " of all branches "); + out->print_metric(out->ctx, color, "%7.2f%%", "of all branches", ratio); } -static void print_l1_dcache_misses(FILE *out, int cpu, +static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __maybe_unused, - double avg) + double avg, + struct perf_stat_output_ctx *out) { double total, ratio = 0.0; const char *color; @@ -212,14 +214,13 @@ static void print_l1_dcache_misses(FILE *out, int cpu, color = get_ratio_color(GRC_CACHE_MISSES, ratio); - fprintf(out, " # "); - color_fprintf(out, color, "%6.2f%%", ratio); - fprintf(out, " of all L1-dcache hits "); + out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-dcache hits", ratio); } -static void print_l1_icache_misses(FILE *out, int cpu, +static void print_l1_icache_misses(int cpu, struct perf_evsel *evsel __maybe_unused, - double avg) + double avg, + struct perf_stat_output_ctx *out) { double total, ratio = 0.0; const char *color; @@ -231,15 +232,13 @@ static void print_l1_icache_misses(FILE *out, int cpu, ratio = avg / total * 100.0; color = get_ratio_color(GRC_CACHE_MISSES, ratio); - - fprintf(out, " # "); - color_fprintf(out, color, "%6.2f%%", ratio); - fprintf(out, " of all L1-icache hits "); + out->print_metric(out->ctx, color, "%7.2f%%", "of all L1-icache hits", ratio); } -static void print_dtlb_cache_misses(FILE *out, int cpu, +static void print_dtlb_cache_misses(int cpu, struct perf_evsel *evsel 
__maybe_unused, - double avg) + double avg, + struct perf_stat_output_ctx *out) { double total, ratio = 0.0; const char *color; @@ -251,15 +250,13 @@ static void print_dtlb_cache_misses(FILE *out, int cpu, ratio = avg / total * 100.0; color = get_ratio_color(GRC_CACHE_MISSES, ratio); - - fprintf(out, " # "); - color_fprintf(out, color, "%6.2f%%", ratio); - fprintf(out, " of all dTLB cache hits "); + out->print_metric(out->ctx, color, "%7.2f%%", "of all dTLB cache hits", ratio); } -static void print_itlb_cache_misses(FILE *out, int cpu, +static void print_itlb_cache_misses(int cpu, struct perf_evsel *evsel __maybe_unused, - double avg) + double avg, + struct perf_stat_output_ctx *out) { double total, ratio = 0.0; const char *color; @@ -271,15 +268,13 @@ static void print_itlb_cache_misses(FILE *out, int cpu, ratio = avg / total * 100.0; color = get_ratio_color(GRC_CACHE_MISSES, ratio); - - fprintf(out, " # "); - color_fprintf(out, color, "%6.2f%%", ratio); - fprintf(out, " of all iTLB cache hits "); + out->print_metric(out->ctx, color, "%7.2f%%", "of all iTLB cache hits", ratio); } -static void print_ll_cache_misses(FILE *out, int cpu, +static void print_ll_cache_misses(int cpu, struct perf_evsel *evsel __maybe_unused, - double avg) + double avg, + struct perf_stat_output_ctx *out) { double total, ratio = 0.0; const char *color; @@ -291,15 +286,15 @@ static void print_ll_cache_misses(FILE *out, int cpu, ratio = avg / total * 100.0; color = get_ratio_color(GRC_CACHE_MISSES, ratio); - - fprintf(out, " # "); - color_fprintf(out, color, "%6.2f%%", ratio); - fprintf(out, " of all LL-cache hits "); + out->print_metric(out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio); } -void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel, - double avg, int cpu, enum aggr_mode aggr) +void perf_stat__print_shadow_stats(struct perf_evsel *evsel, + double avg, int cpu, + struct perf_stat_output_ctx *out) { + void *ctxp = out->ctx; + print_metric_t print_metric = out->print_metric; double total, ratio = 0.0, total2; int ctx = evsel_context(evsel); @@ -307,119 +302,145 @@ void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel, total = avg_stats(&runtime_cycles_stats[ctx][cpu]); if (total) { ratio = avg / total; - fprintf(out, " # %5.2f insns per cycle ", ratio); + print_metric(ctxp, NULL, "%7.2f ", + "insn per cycle", ratio); } else { - fprintf(out, " "); + print_metric(ctxp, NULL, NULL, "insn per cycle", 0); } total = avg_stats(&runtime_stalled_cycles_front_stats[ctx][cpu]); total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[ctx][cpu])); + out->new_line(ctxp); if (total && avg) { ratio = total / avg; - fprintf(out, "\n"); - if (aggr == AGGR_NONE) - fprintf(out, " "); - fprintf(out, " # %5.2f stalled cycles per insn", ratio); + print_metric(ctxp, NULL, "%7.2f ", + "stalled cycles per insn", + ratio); + } else { + print_metric(ctxp, NULL, NULL, + "stalled cycles per insn", 0); } - - } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && - runtime_branches_stats[ctx][cpu].n != 0) { - print_branch_misses(out, cpu, evsel, avg); + } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) { + if (runtime_branches_stats[ctx][cpu].n != 0) + print_branch_misses(cpu, evsel, avg, out); + else + print_metric(ctxp, NULL, NULL, "of all branches", 0); } else if ( evsel->attr.type == PERF_TYPE_HW_CACHE && evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D | ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | - ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && - 
runtime_l1_dcache_stats[ctx][cpu].n != 0) { - print_l1_dcache_misses(out, cpu, evsel, avg); + ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { + if (runtime_l1_dcache_stats[ctx][cpu].n != 0) + print_l1_dcache_misses(cpu, evsel, avg, out); + else + print_metric(ctxp, NULL, NULL, "of all L1-dcache hits", 0); } else if ( evsel->attr.type == PERF_TYPE_HW_CACHE && evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1I | ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | - ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && - runtime_l1_icache_stats[ctx][cpu].n != 0) { - print_l1_icache_misses(out, cpu, evsel, avg); + ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { + if (runtime_l1_icache_stats[ctx][cpu].n != 0) + print_l1_icache_misses(cpu, evsel, avg, out); + else + print_metric(ctxp, NULL, NULL, "of all L1-icache hits", 0); } else if ( evsel->attr.type == PERF_TYPE_HW_CACHE && evsel->attr.config == ( PERF_COUNT_HW_CACHE_DTLB | ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | - ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && - runtime_dtlb_cache_stats[ctx][cpu].n != 0) { - print_dtlb_cache_misses(out, cpu, evsel, avg); + ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { + if (runtime_dtlb_cache_stats[ctx][cpu].n != 0) + print_dtlb_cache_misses(cpu, evsel, avg, out); + else + print_metric(ctxp, NULL, NULL, "of all dTLB cache hits", 0); } else if ( evsel->attr.type == PERF_TYPE_HW_CACHE && evsel->attr.config == ( PERF_COUNT_HW_CACHE_ITLB | ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | - ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && - runtime_itlb_cache_stats[ctx][cpu].n != 0) { - print_itlb_cache_misses(out, cpu, evsel, avg); + ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { + if (runtime_itlb_cache_stats[ctx][cpu].n != 0) + print_itlb_cache_misses(cpu, evsel, avg, out); + else + print_metric(ctxp, NULL, NULL, "of all iTLB cache hits", 0); } else if ( evsel->attr.type == PERF_TYPE_HW_CACHE && evsel->attr.config == ( PERF_COUNT_HW_CACHE_LL | ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | - ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && - runtime_ll_cache_stats[ctx][cpu].n != 0) { - print_ll_cache_misses(out, cpu, evsel, avg); - } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && - runtime_cacherefs_stats[ctx][cpu].n != 0) { + ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { + if (runtime_ll_cache_stats[ctx][cpu].n != 0) + print_ll_cache_misses(cpu, evsel, avg, out); + else + print_metric(ctxp, NULL, NULL, "of all LL-cache hits", 0); + } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) { total = avg_stats(&runtime_cacherefs_stats[ctx][cpu]); if (total) ratio = avg * 100 / total; - fprintf(out, " # %8.3f %% of all cache refs ", ratio); - + if (runtime_cacherefs_stats[ctx][cpu].n != 0) + print_metric(ctxp, NULL, "%8.3f %%", + "of all cache refs", ratio); + else + print_metric(ctxp, NULL, NULL, "of all cache refs", 0); } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) { - print_stalled_cycles_frontend(out, cpu, evsel, avg); + print_stalled_cycles_frontend(cpu, evsel, avg, out); } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) { - print_stalled_cycles_backend(out, cpu, evsel, avg); + print_stalled_cycles_backend(cpu, evsel, avg, out); } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { total = avg_stats(&runtime_nsecs_stats[cpu]); if (total) { ratio = avg / total; - fprintf(out, " # %8.3f GHz ", ratio); + print_metric(ctxp, NULL, "%8.3f", "GHz", ratio); } else { - fprintf(out, " "); + print_metric(ctxp, NULL, NULL, "Ghz", 0); } } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX)) 
{ total = avg_stats(&runtime_cycles_stats[ctx][cpu]); if (total) - fprintf(out, - " # %5.2f%% transactional cycles ", - 100.0 * (avg / total)); + print_metric(ctxp, NULL, + "%7.2f%%", "transactional cycles", + 100.0 * (avg / total)); + else + print_metric(ctxp, NULL, NULL, "transactional cycles", + 0); } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX_CP)) { total = avg_stats(&runtime_cycles_stats[ctx][cpu]); total2 = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]); if (total2 < avg) total2 = avg; if (total) - fprintf(out, - " # %5.2f%% aborted cycles ", + print_metric(ctxp, NULL, "%7.2f%%", "aborted cycles", 100.0 * ((total2-avg) / total)); - } else if (perf_stat_evsel__is(evsel, TRANSACTION_START) && - runtime_cycles_in_tx_stats[ctx][cpu].n != 0) { + else + print_metric(ctxp, NULL, NULL, "aborted cycles", 0); + } else if (perf_stat_evsel__is(evsel, TRANSACTION_START)) { total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]); if (avg) ratio = total / avg; - fprintf(out, " # %8.0f cycles / transaction ", ratio); - } else if (perf_stat_evsel__is(evsel, ELISION_START) && - runtime_cycles_in_tx_stats[ctx][cpu].n != 0) { + if (runtime_cycles_in_tx_stats[ctx][cpu].n != 0) + print_metric(ctxp, NULL, "%8.0f", + "cycles / transaction", ratio); + else + print_metric(ctxp, NULL, NULL, "cycles / transaction", + 0); + } else if (perf_stat_evsel__is(evsel, ELISION_START)) { total = avg_stats(&runtime_cycles_in_tx_stats[ctx][cpu]); if (avg) ratio = total / avg; - fprintf(out, " # %8.0f cycles / elision ", ratio); + print_metric(ctxp, NULL, "%8.0f", "cycles / elision", ratio); } else if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) { if ((ratio = avg_stats(&walltime_nsecs_stats)) != 0) - fprintf(out, " # %8.3f CPUs utilized ", avg / ratio); + print_metric(ctxp, NULL, "%8.3f", "CPUs utilized", + avg / ratio); else - fprintf(out, " "); + print_metric(ctxp, NULL, NULL, "CPUs utilized", 0); } else if (runtime_nsecs_stats[cpu].n != 0) { char unit = 'M'; + char unit_buf[10]; total = avg_stats(&runtime_nsecs_stats[cpu]); @@ -429,9 +450,9 @@ void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel, ratio *= 1000; unit = 'K'; } - - fprintf(out, " # %8.3f %c/sec ", ratio, unit); + snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit); + print_metric(ctxp, NULL, "%8.3f", unit_buf, ratio); } else { - fprintf(out, " "); + print_metric(ctxp, NULL, NULL, NULL, 0); } } diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 2af63c9..f02af68 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -68,11 +68,22 @@ void perf_stat_evsel_id_init(struct perf_evsel *evsel); extern struct stats walltime_nsecs_stats; +typedef void (*print_metric_t)(void *ctx, const char *color, const char *unit, + const char *fmt, double val); +typedef void (*new_line_t )(void *ctx); + void perf_stat__reset_shadow_stats(void); void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count, int cpu); -void perf_stat__print_shadow_stats(FILE *out, struct perf_evsel *evsel, - double avg, int cpu, enum aggr_mode aggr); +struct perf_stat_output_ctx { + void *ctx; + print_metric_t print_metric; + new_line_t new_line; +}; + +void perf_stat__print_shadow_stats(struct perf_evsel *evsel, + double avg, int cpu, + struct perf_stat_output_ctx *out); int perf_evlist__alloc_stats(struct perf_evlist *evlist, bool alloc_raw); void perf_evlist__free_stats(struct perf_evlist *evlist); -- cgit v0.10.2 From f94833929032ad23412d3970beed6769a2fdbc19 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 30 Jan 
2016 09:06:50 -0800 Subject: perf stat: Add support for metrics in interval mode Now that we can modify the metrics printout functions easily, it's straight forward to support metric printing for interval mode. All that is needed is to print the time stamp on every new line. Pass the prefix into the context and print it out. v2: Move wrong hunk to here. Committer note: Before: [root@jouet ~]# perf stat -I 1000 -e instructions,cycles sleep 1 # time counts unit events 1.000168216 538,913 instructions 1.000168216 748,765 cycles 1.000660048 153,741 instructions 1.000660048 214,066 cycles After: # perf stat -I 1000 -e instructions,cycles sleep 1 # time counts unit events 1.000215928 519,620 instructions # 0.69 insn per cycle 1.000215928 752,003 cycles 1.000946033 148,502 instructions # 0.33 insn per cycle 1.000946033 160,104 cycles Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1454173616-17710-3-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index fabcadb..5710bdb 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -738,6 +738,7 @@ static void aggr_printout(struct perf_evsel *evsel, int id, int nr) struct outstate { FILE *fh; bool newline; + const char *prefix; }; #define METRIC_LEN 35 @@ -752,6 +753,7 @@ static void new_line_std(void *ctx) static void do_new_line_std(struct outstate *os) { fputc('\n', os->fh); + fputs(os->prefix, os->fh); if (stat_config.aggr_mode == AGGR_NONE) fprintf(os->fh, " "); if (stat_config.aggr_mode == AGGR_CORE) @@ -845,10 +847,14 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg) fprintf(output, "%s%s", csv_sep, evsel->cgrp->name); } -static void printout(int id, int nr, struct perf_evsel *counter, double uval) +static void printout(int id, int nr, struct perf_evsel *counter, double uval, + char *prefix) { - struct outstate os = { .fh = stat_config.output }; struct perf_stat_output_ctx out; + struct outstate os = { + .fh = stat_config.output, + .prefix = prefix ? prefix : "" + }; print_metric_t pm = print_metric_std; void (*nl)(void *); @@ -863,7 +869,7 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval) out.new_line = nl; out.ctx = &os; - if (!csv_output && !stat_config.interval) + if (!csv_output) perf_stat__print_shadow_stats(counter, uval, stat_config.aggr_mode == AGGR_GLOBAL ? 
0 : cpu_map__id_to_cpu(id), @@ -923,7 +929,7 @@ static void print_aggr(char *prefix) continue; } uval = val * counter->scale; - printout(id, nr, counter, uval); + printout(id, nr, counter, uval, prefix); if (!csv_output) print_noise(counter, 1.0); @@ -954,7 +960,7 @@ static void print_aggr_thread(struct perf_evsel *counter, char *prefix) fprintf(output, "%s", prefix); uval = val * counter->scale; - printout(thread, 0, counter, uval); + printout(thread, 0, counter, uval, prefix); if (!csv_output) print_noise(counter, 1.0); @@ -1004,7 +1010,7 @@ static void print_counter_aggr(struct perf_evsel *counter, char *prefix) } uval = avg * counter->scale; - printout(-1, 0, counter, uval); + printout(-1, 0, counter, uval, prefix); print_noise(counter, avg); @@ -1057,7 +1063,7 @@ static void print_counter(struct perf_evsel *counter, char *prefix) } uval = val * counter->scale; - printout(cpu, 0, counter, uval); + printout(cpu, 0, counter, uval, prefix); if (!csv_output) print_noise(counter, 1.0); print_running(run, ena); -- cgit v0.10.2 From cb110f471025f3278978aaccb18f3164ea2b8189 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 30 Jan 2016 09:06:51 -0800 Subject: perf stat: Move noise/running printing into printout Move the running/noise printing into printout to avoid duplicated code in the callers. v2: Merged with other patches. Remove unnecessary hunk. Readd hunk that ended in earlier patch. v3: Fix noise/running output in CSV mode v4: Merge with later patch that also moves not supported printing. Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1454173616-17710-4-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 5710bdb..15e4fcf 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -848,7 +848,7 @@ static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg) } static void printout(int id, int nr, struct perf_evsel *counter, double uval, - char *prefix) + char *prefix, u64 run, u64 ena, double noise) { struct perf_stat_output_ctx out; struct outstate os = { @@ -860,6 +860,30 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval, nl = new_line_std; + if (run == 0 || ena == 0) { + aggr_printout(counter, id, nr); + + fprintf(stat_config.output, "%*s%s", + csv_output ? 0 : 18, + counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, + csv_sep); + + fprintf(stat_config.output, "%-*s%s", + csv_output ? 0 : unit_width, + counter->unit, csv_sep); + + fprintf(stat_config.output, "%*s", + csv_output ? 0 : -25, + perf_evsel__name(counter)); + + if (counter->cgrp) + fprintf(stat_config.output, "%s%s", + csv_sep, counter->cgrp->name); + + print_running(run, ena); + return; + } + if (nsec_counter(counter)) nsec_printout(id, nr, counter, uval); else @@ -874,6 +898,9 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval, stat_config.aggr_mode == AGGR_GLOBAL ? 0 : cpu_map__id_to_cpu(id), &out); + + print_noise(counter, noise); + print_running(run, ena); } static void print_aggr(char *prefix) @@ -904,36 +931,8 @@ static void print_aggr(char *prefix) if (prefix) fprintf(output, "%s", prefix); - if (run == 0 || ena == 0) { - aggr_printout(counter, id, nr); - - fprintf(output, "%*s%s", - csv_output ? 0 : 18, - counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, - csv_sep); - - fprintf(output, "%-*s%s", - csv_output ? 
0 : unit_width, - counter->unit, csv_sep); - - fprintf(output, "%*s", - csv_output ? 0 : -25, - perf_evsel__name(counter)); - - if (counter->cgrp) - fprintf(output, "%s%s", - csv_sep, counter->cgrp->name); - - print_running(run, ena); - fputc('\n', output); - continue; - } uval = val * counter->scale; - printout(id, nr, counter, uval, prefix); - if (!csv_output) - print_noise(counter, 1.0); - - print_running(run, ena); + printout(id, nr, counter, uval, prefix, run, ena, 1.0); fputc('\n', output); } } @@ -960,12 +959,7 @@ static void print_aggr_thread(struct perf_evsel *counter, char *prefix) fprintf(output, "%s", prefix); uval = val * counter->scale; - printout(thread, 0, counter, uval, prefix); - - if (!csv_output) - print_noise(counter, 1.0); - - print_running(run, ena); + printout(thread, 0, counter, uval, prefix, run, ena, 1.0); fputc('\n', output); } } @@ -979,7 +973,6 @@ static void print_counter_aggr(struct perf_evsel *counter, char *prefix) FILE *output = stat_config.output; struct perf_stat_evsel *ps = counter->priv; double avg = avg_stats(&ps->res_stats[0]); - int scaled = counter->counts->scaled; double uval; double avg_enabled, avg_running; @@ -989,32 +982,8 @@ static void print_counter_aggr(struct perf_evsel *counter, char *prefix) if (prefix) fprintf(output, "%s", prefix); - if (scaled == -1 || !counter->supported) { - fprintf(output, "%*s%s", - csv_output ? 0 : 18, - counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, - csv_sep); - fprintf(output, "%-*s%s", - csv_output ? 0 : unit_width, - counter->unit, csv_sep); - fprintf(output, "%*s", - csv_output ? 0 : -25, - perf_evsel__name(counter)); - - if (counter->cgrp) - fprintf(output, "%s%s", csv_sep, counter->cgrp->name); - - print_running(avg_running, avg_enabled); - fputc('\n', output); - return; - } - uval = avg * counter->scale; - printout(-1, 0, counter, uval, prefix); - - print_noise(counter, avg); - - print_running(avg_running, avg_enabled); + printout(-1, 0, counter, uval, prefix, avg_running, avg_enabled, avg); fprintf(output, "\n"); } @@ -1037,36 +1006,8 @@ static void print_counter(struct perf_evsel *counter, char *prefix) if (prefix) fprintf(output, "%s", prefix); - if (run == 0 || ena == 0) { - fprintf(output, "CPU%*d%s%*s%s", - csv_output ? 0 : -4, - perf_evsel__cpus(counter)->map[cpu], csv_sep, - csv_output ? 0 : 18, - counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED, - csv_sep); - - fprintf(output, "%-*s%s", - csv_output ? 0 : unit_width, - counter->unit, csv_sep); - - fprintf(output, "%*s", - csv_output ? 0 : -25, - perf_evsel__name(counter)); - - if (counter->cgrp) - fprintf(output, "%s%s", - csv_sep, counter->cgrp->name); - - print_running(run, ena); - fputc('\n', output); - continue; - } - uval = val * counter->scale; - printout(cpu, 0, counter, uval, prefix); - if (!csv_output) - print_noise(counter, 1.0); - print_running(run, ena); + printout(cpu, 0, counter, uval, prefix, run, ena, 1.0); fputc('\n', output); } -- cgit v0.10.2 From e54fdcca70a33a7e447e526b305db85e978c0563 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 16 Feb 2016 15:09:01 -0800 Subject: x86/signal/64: Add a comment about sigcontext->fs and gs These fields have a strange history. This tries to document it. This borrows from 9a036b93a344 ("x86/signal/64: Remove 'fs' and 'gs' from sigcontext"), which was reverted by ed596cde9425 ("Revert x86 sigcontext cleanups"). Signed-off-by: Andy Lutomirski Acked-by: Borislav Petkov Cc: Al Viro Cc: Andy Lutomirski Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Peter Zijlstra Cc: Stas Sergeev Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/baa78f3c84106fa5acbc319377b1850602f5deec.1455664054.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index d485232..702c404 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -341,6 +341,31 @@ struct sigcontext { __u64 rip; __u64 eflags; /* RFLAGS */ __u16 cs; + + /* + * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"), + * Linux saved and restored fs and gs in these slots. This + * was counterproductive, as fsbase and gsbase were never + * saved, so arch_prctl was presumably unreliable. + * + * These slots should never be reused without extreme caution: + * + * - Some DOSEMU versions stash fs and gs in these slots manually, + * thus overwriting anything the kernel expects to be preserved + * in these slots. + * + * - If these slots are ever needed for any other purpose, + * there is some risk that very old 64-bit binaries could get + * confused. I doubt that many such binaries still work, + * though, since the same patch in 2.5.64 also removed the + * 64-bit set_thread_area syscall, so it appears that there + * is no TLS API beyond modify_ldt that works in both pre- + * and post-2.5.64 kernels. + * + * If the kernel ever adds explicit fs, gs, fsbase, and gsbase + * save/restore, it will most likely need to be opt-in and use + * different context slots. + */ __u16 gs; __u16 fs; __u16 __pad0; -- cgit v0.10.2 From 8ff5bd2e1e2767fbf737f84d5f92668dafe7e7b0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 16 Feb 2016 15:09:02 -0800 Subject: x86/signal/64: Fix SS if needed when delivering a 64-bit signal Signals are always delivered to 64-bit tasks with CS set to a long mode segment. In long mode, SS doesn't matter as long as it's a present writable segment. If SS starts out invalid (this can happen if the signal was caused by an IRET fault or was delivered on the way out of set_thread_area or modify_ldt), then IRET to the signal handler can fail, eventually killing the task. The straightforward fix would be to simply reset SS when delivering a signal. That breaks DOSEMU, though: 64-bit builds of DOSEMU rely on SS being set to the faulting SS when signals are delivered. As a compromise, this patch leaves SS alone so long as it's valid. The net effect should be that the behavior of successfully delivered signals is unchanged. Some signals that would previously have failed to be delivered will now be delivered successfully. This has no effect for x32 or 32-bit tasks: their signal handlers were already called with SS == __USER_DS. (On Xen, there's a slight hole: if a task sets SS to a writable *kernel* data segment, then we will fail to identify it as invalid and we'll still kill the task. If anyone cares, this could be fixed with a new paravirt hook.) Signed-off-by: Andy Lutomirski Acked-by: Borislav Petkov Cc: Al Viro Cc: Andy Lutomirski Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Peter Zijlstra Cc: Stas Sergeev Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/163c6e1eacde41388f3ff4d2fe6769be651d7b6e.1455664054.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h index 278441f..eb5deb4 100644 --- a/arch/x86/include/asm/desc_defs.h +++ b/arch/x86/include/asm/desc_defs.h @@ -98,4 +98,27 @@ struct desc_ptr { #endif /* !__ASSEMBLY__ */ +/* Access rights as returned by LAR */ +#define AR_TYPE_RODATA (0 * (1 << 9)) +#define AR_TYPE_RWDATA (1 * (1 << 9)) +#define AR_TYPE_RODATA_EXPDOWN (2 * (1 << 9)) +#define AR_TYPE_RWDATA_EXPDOWN (3 * (1 << 9)) +#define AR_TYPE_XOCODE (4 * (1 << 9)) +#define AR_TYPE_XRCODE (5 * (1 << 9)) +#define AR_TYPE_XOCODE_CONF (6 * (1 << 9)) +#define AR_TYPE_XRCODE_CONF (7 * (1 << 9)) +#define AR_TYPE_MASK (7 * (1 << 9)) + +#define AR_DPL0 (0 * (1 << 13)) +#define AR_DPL3 (3 * (1 << 13)) +#define AR_DPL_MASK (3 * (1 << 13)) + +#define AR_A (1 << 8) /* "Accessed" */ +#define AR_S (1 << 12) /* If clear, "System" segment */ +#define AR_P (1 << 15) /* "Present" */ +#define AR_AVL (1 << 20) /* "AVaiLable" (no HW effect) */ +#define AR_L (1 << 21) /* "Long mode" for code segments */ +#define AR_DB (1 << 22) /* D/B, effect depends on type */ +#define AR_G (1 << 23) /* "Granularity" (limit in pages) */ + #endif /* _ASM_X86_DESC_DEFS_H */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index c07ff5d..52f82c7 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -61,6 +61,35 @@ regs->seg = GET_SEG(seg) | 3; \ } while (0) +#ifdef CONFIG_X86_64 +/* + * If regs->ss will cause an IRET fault, change it. Otherwise leave it + * alone. Using this generally makes no sense unless + * user_64bit_mode(regs) would return true. + */ +static void force_valid_ss(struct pt_regs *regs) +{ + u32 ar; + asm volatile ("lar %[old_ss], %[ar]\n\t" + "jz 1f\n\t" /* If invalid: */ + "xorl %[ar], %[ar]\n\t" /* set ar = 0 */ + "1:" + : [ar] "=r" (ar) + : [old_ss] "rm" ((u16)regs->ss)); + + /* + * For a valid 64-bit user context, we need DPL 3, type + * read-write data or read-write exp-down data, and S and P + * set. We can't use VERW because VERW doesn't check the + * P bit. + */ + ar &= AR_DPL_MASK | AR_S | AR_P | AR_TYPE_MASK; + if (ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA) && + ar != (AR_DPL3 | AR_S | AR_P | AR_TYPE_RWDATA_EXPDOWN)) + regs->ss = __USER_DS; +} +#endif + int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) { unsigned long buf_val; @@ -459,10 +488,28 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, regs->sp = (unsigned long)frame; - /* Set up the CS register to run signal handlers in 64-bit mode, - even if the handler happens to be interrupting 32-bit code. */ + /* + * Set up the CS and SS registers to run signal handlers in + * 64-bit mode, even if the handler happens to be interrupting + * 32-bit or 16-bit code. + * + * SS is subtle. In 64-bit mode, we don't need any particular + * SS descriptor, but we do need SS to be valid. It's possible + * that the old SS is entirely bogus -- this can happen if the + * signal we're trying to deliver is #GP or #SS caused by a bad + * SS value. We also have a compatbility issue here: DOSEMU + * relies on the contents of the SS register indicating the + * SS value at the time of the signal, even though that code in + * DOSEMU predates sigreturn's ability to restore SS. 
(DOSEMU + * avoids relying on sigreturn to restore SS; instead it uses + * a trampoline.) So we do our best: if the old SS was valid, + * we keep it. Otherwise we replace it. + */ regs->cs = __USER_CS; + if (unlikely(regs->ss != __USER_DS)) + force_valid_ss(regs); + return 0; } #endif /* CONFIG_X86_32 */ -- cgit v0.10.2 From 6c25da5ad55d48c41b8909bc1f4e3cd5d85bb499 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 16 Feb 2016 15:09:03 -0800 Subject: x86/signal/64: Re-add support for SS in the 64-bit signal context This is a second attempt to make the improvements from c6f2062935c8 ("x86/signal/64: Fix SS handling for signals delivered to 64-bit programs"), which was reverted by 51adbfbba5c6 ("x86/signal/64: Add support for SS in the 64-bit signal context"). This adds two new uc_flags flags. UC_SIGCONTEXT_SS will be set for all 64-bit signals (including x32). It indicates that the saved SS field is valid and that the kernel supports the new behavior. The goal is to fix a problems with signal handling in 64-bit tasks: SS wasn't saved in the 64-bit signal context, making it awkward to determine what SS was at the time of signal delivery and making it impossible to return to a non-flat SS (as calling sigreturn clobbers SS). This also made it extremely difficult for 64-bit tasks to return to fully-defined 16-bit contexts, because only the kernel can easily do espfix64, but sigreturn was unable to set a non-flag SS:ESP. (DOSEMU has a monstrous hack to partially work around this limitation.) If we could go back in time, the correct fix would be to make 64-bit signals work just like 32-bit signals with respect to SS: save it in signal context, reset it when delivering a signal, and restore it in sigreturn. Unfortunately, doing that (as I tried originally) breaks DOSEMU: DOSEMU wouldn't reset the signal context's SS when clearing the LDT and changing the saved CS to 64-bit mode, since it predates the SS context field existing in the first place. This patch is a bit more complicated, and it tries to balance a bunch of goals. It makes most cases of changing ucontext->ss during signal handling work as expected. I do this by special-casing the interesting case. On sigreturn, ucontext->ss will be honored by default, unless the ucontext was created from scratch by an old program and had a 64-bit CS (unfortunately, CRIU can do this) or was the result of changing a 32-bit signal context to 64-bit without resetting SS (as DOSEMU does). For the benefit of new 64-bit software that uses segmentation (new versions of DOSEMU might), the new behavior can be detected with a new ucontext flag UC_SIGCONTEXT_SS. To avoid compilation issues, __pad0 is left as an alias for ss in ucontext. The nitty-gritty details are documented in the header file. This patch also re-enables the sigreturn_64 and ldt_gdt_64 selftests, as the kernel change allows both of them to pass. Tested-by: Stas Sergeev Signed-off-by: Andy Lutomirski Acked-by: Borislav Petkov Cc: Al Viro Cc: Andy Lutomirski Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/749149cbfc3e75cd7fcdad69a854b399d792cc6f.1455664054.git.luto@kernel.org [ Small readability edit. 
] Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 89db467..452c88b 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -13,7 +13,6 @@ X86_EFLAGS_CF | X86_EFLAGS_RF) void signal_fault(struct pt_regs *regs, void __user *frame, char *where); -int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc); int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask); diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 702c404..62d4111 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -256,7 +256,7 @@ struct sigcontext_64 { __u16 cs; __u16 gs; __u16 fs; - __u16 __pad0; + __u16 ss; __u64 err; __u64 trapno; __u64 oldmask; @@ -368,7 +368,10 @@ struct sigcontext { */ __u16 gs; __u16 fs; - __u16 __pad0; + union { + __u16 ss; /* If UC_SIGCONTEXT_SS */ + __u16 __pad0; /* Alias name for old (!UC_SIGCONTEXT_SS) user-space */ + }; __u64 err; __u64 trapno; __u64 oldmask; diff --git a/arch/x86/include/uapi/asm/ucontext.h b/arch/x86/include/uapi/asm/ucontext.h index b7c29c8..e3d1ec9 100644 --- a/arch/x86/include/uapi/asm/ucontext.h +++ b/arch/x86/include/uapi/asm/ucontext.h @@ -1,11 +1,54 @@ #ifndef _ASM_X86_UCONTEXT_H #define _ASM_X86_UCONTEXT_H -#define UC_FP_XSTATE 0x1 /* indicates the presence of extended state - * information in the memory layout pointed - * by the fpstate pointer in the ucontext's - * sigcontext struct (uc_mcontext). - */ +/* + * Indicates the presence of extended state information in the memory + * layout pointed by the fpstate pointer in the ucontext's sigcontext + * struct (uc_mcontext). + */ +#define UC_FP_XSTATE 0x1 + +#ifdef __x86_64__ +/* + * UC_SIGCONTEXT_SS will be set when delivering 64-bit or x32 signals on + * kernels that save SS in the sigcontext. All kernels that set + * UC_SIGCONTEXT_SS will correctly restore at least the low 32 bits of esp + * regardless of SS (i.e. they implement espfix). + * + * Kernels that set UC_SIGCONTEXT_SS will also set UC_STRICT_RESTORE_SS + * when delivering a signal that came from 64-bit code. + * + * Sigreturn restores SS as follows: + * + * if (saved SS is valid || UC_STRICT_RESTORE_SS is set || + * saved CS is not 64-bit) + * new SS = saved SS (will fail IRET and signal if invalid) + * else + * new SS = a flat 32-bit data segment + * + * This behavior serves three purposes: + * + * - Legacy programs that construct a 64-bit sigcontext from scratch + * with zero or garbage in the SS slot (e.g. old CRIU) and call + * sigreturn will still work. + * + * - Old DOSEMU versions sometimes catch a signal from a segmented + * context, delete the old SS segment (with modify_ldt), and change + * the saved CS to a 64-bit segment. These DOSEMU versions expect + * sigreturn to send them back to 64-bit mode without killing them, + * despite the fact that the SS selector when the signal was raised is + * no longer valid. UC_STRICT_RESTORE_SS will be clear, so the kernel + * will fix up SS for these DOSEMU versions. + * + * - Old and new programs that catch a signal and return without + * modifying the saved context will end up in exactly the state they + * started in, even if they were running in a segmented context when + * the signal was raised.. Old kernels would lose track of the + * previous SS value. 
+ */ +#define UC_SIGCONTEXT_SS 0x2 +#define UC_STRICT_RESTORE_SS 0x4 +#endif #include diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 52f82c7..548ddf7 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -90,7 +90,9 @@ static void force_valid_ss(struct pt_regs *regs) } #endif -int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) +static int restore_sigcontext(struct pt_regs *regs, + struct sigcontext __user *sc, + unsigned long uc_flags) { unsigned long buf_val; void __user *buf; @@ -123,15 +125,18 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) COPY(r15); #endif /* CONFIG_X86_64 */ -#ifdef CONFIG_X86_32 COPY_SEG_CPL3(cs); COPY_SEG_CPL3(ss); -#else /* !CONFIG_X86_32 */ - /* Kernel saves and restores only the CS segment register on signals, - * which is the bare minimum needed to allow mixed 32/64-bit code. - * App's signal handler can save/restore other segments if needed. */ - COPY_SEG_CPL3(cs); -#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_X86_64 + /* + * Fix up SS if needed for the benefit of old DOSEMU and + * CRIU. + */ + if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && + user_64bit_mode(regs))) + force_valid_ss(regs); +#endif get_user_ex(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); @@ -194,6 +199,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, put_user_ex(regs->cs, &sc->cs); put_user_ex(0, &sc->gs); put_user_ex(0, &sc->fs); + put_user_ex(regs->ss, &sc->ss); #endif /* CONFIG_X86_32 */ put_user_ex(fpstate, &sc->fpstate); @@ -432,6 +438,21 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, return 0; } #else /* !CONFIG_X86_32 */ +static unsigned long frame_uc_flags(struct pt_regs *regs) +{ + unsigned long flags; + + if (cpu_has_xsave) + flags = UC_FP_XSTATE | UC_SIGCONTEXT_SS; + else + flags = UC_SIGCONTEXT_SS; + + if (likely(user_64bit_mode(regs))) + flags |= UC_STRICT_RESTORE_SS; + + return flags; +} + static int __setup_rt_frame(int sig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { @@ -451,10 +472,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, put_user_try { /* Create the ucontext. */ - if (cpu_has_xsave) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); save_altstack_ex(&frame->uc.uc_stack, regs->sp); @@ -536,10 +554,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, put_user_try { /* Create the ucontext. */ - if (cpu_has_xsave) - put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); - else - put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); put_user_ex(0, &frame->uc.uc__pad0); @@ -601,7 +616,11 @@ asmlinkage unsigned long sys_sigreturn(void) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->sc)) + /* + * x86_32 has no uc_flags bits relevant to restore_sigcontext. + * Save a few cycles by skipping the __get_user. 
+ */ + if (restore_sigcontext(regs, &frame->sc, 0)) goto badframe; return regs->ax; @@ -617,16 +636,19 @@ asmlinkage long sys_rt_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; sigset_t set; + unsigned long uc_flags; frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) @@ -813,6 +835,7 @@ asmlinkage long sys32_x32_rt_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_x32 __user *frame; sigset_t set; + unsigned long uc_flags; frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); @@ -820,10 +843,12 @@ asmlinkage long sys32_x32_rt_sigreturn(void) goto badframe; if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) goto badframe; + if (__get_user(uc_flags, &frame->uc.uc_flags)) + goto badframe; set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index df4f767..d5ce7d7 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -5,10 +5,9 @@ include ../lib.mk .PHONY: all all_32 all_64 warn_32bit_failure clean TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall \ - check_initial_reg_state -TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso \ + check_initial_reg_state sigreturn ldt_gdt +TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ - ldt_gdt \ vdso_restorer TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) -- cgit v0.10.2 From 4f6c893822932622fad2b46cc30467be9f20ee5d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 16 Feb 2016 15:09:04 -0800 Subject: selftests/x86: Add tests for UC_SIGCONTEXT_SS and UC_STRICT_RESTORE_SS This tests the two ABI-preserving cases that DOSEMU cares about, and it also explicitly tests the new UC_SIGCONTEXT_SS and UC_STRICT_RESTORE_SS flags. Signed-off-by: Andy Lutomirski Acked-by: Borislav Petkov Cc: Al Viro Cc: Andy Lutomirski Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Peter Zijlstra Cc: Shuah Khan Cc: Stas Sergeev Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f3d08f98541d0bd3030ceb35e05e21f59e30232c.1455664054.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c index b5aa1ba..8a577e7 100644 --- a/tools/testing/selftests/x86/sigreturn.c +++ b/tools/testing/selftests/x86/sigreturn.c @@ -54,6 +54,37 @@ #include #include +/* Pull in AR_xyz defines. */ +typedef unsigned int u32; +typedef unsigned short u16; +#include "../../../../arch/x86/include/asm/desc_defs.h" + +/* + * Copied from asm/ucontext.h, as asm/ucontext.h conflicts badly with the glibc + * headers. 
+ */ +#ifdef __x86_64__ +/* + * UC_SIGCONTEXT_SS will be set when delivering 64-bit or x32 signals on + * kernels that save SS in the sigcontext. All kernels that set + * UC_SIGCONTEXT_SS will correctly restore at least the low 32 bits of esp + * regardless of SS (i.e. they implement espfix). + * + * Kernels that set UC_SIGCONTEXT_SS will also set UC_STRICT_RESTORE_SS + * when delivering a signal that came from 64-bit code. + * + * Sigreturn restores SS as follows: + * + * if (saved SS is valid || UC_STRICT_RESTORE_SS is set || + * saved CS is not 64-bit) + * new SS = saved SS (will fail IRET and signal if invalid) + * else + * new SS = a flat 32-bit data segment + */ +#define UC_SIGCONTEXT_SS 0x2 +#define UC_STRICT_RESTORE_SS 0x4 +#endif + /* * In principle, this test can run on Linux emulation layers (e.g. * Illumos "LX branded zones"). Solaris-based kernels reserve LDT @@ -267,6 +298,9 @@ static gregset_t initial_regs, requested_regs, resulting_regs; /* Instructions for the SIGUSR1 handler. */ static volatile unsigned short sig_cs, sig_ss; static volatile sig_atomic_t sig_trapped, sig_err, sig_trapno; +#ifdef __x86_64__ +static volatile sig_atomic_t sig_corrupt_final_ss; +#endif /* Abstractions for some 32-bit vs 64-bit differences. */ #ifdef __x86_64__ @@ -305,9 +339,105 @@ static greg_t *csptr(ucontext_t *ctx) } #endif +/* + * Checks a given selector for its code bitness or returns -1 if it's not + * a usable code segment selector. + */ +int cs_bitness(unsigned short cs) +{ + uint32_t valid = 0, ar; + asm ("lar %[cs], %[ar]\n\t" + "jnz 1f\n\t" + "mov $1, %[valid]\n\t" + "1:" + : [ar] "=r" (ar), [valid] "+rm" (valid) + : [cs] "r" (cs)); + + if (!valid) + return -1; + + bool db = (ar & (1 << 22)); + bool l = (ar & (1 << 21)); + + if (!(ar & (1<<11))) + return -1; /* Not code. */ + + if (l && !db) + return 64; + else if (!l && db) + return 32; + else if (!l && !db) + return 16; + else + return -1; /* Unknown bitness. */ +} + +/* + * Checks a given selector for its code bitness or returns -1 if it's not + * a usable code segment selector. + */ +bool is_valid_ss(unsigned short cs) +{ + uint32_t valid = 0, ar; + asm ("lar %[cs], %[ar]\n\t" + "jnz 1f\n\t" + "mov $1, %[valid]\n\t" + "1:" + : [ar] "=r" (ar), [valid] "+rm" (valid) + : [cs] "r" (cs)); + + if (!valid) + return false; + + if ((ar & AR_TYPE_MASK) != AR_TYPE_RWDATA && + (ar & AR_TYPE_MASK) != AR_TYPE_RWDATA_EXPDOWN) + return false; + + return (ar & AR_P); +} + /* Number of errors in the current test case. */ static volatile sig_atomic_t nerrs; +static void validate_signal_ss(int sig, ucontext_t *ctx) +{ +#ifdef __x86_64__ + bool was_64bit = (cs_bitness(*csptr(ctx)) == 64); + + if (!(ctx->uc_flags & UC_SIGCONTEXT_SS)) { + printf("[FAIL]\tUC_SIGCONTEXT_SS was not set\n"); + nerrs++; + + /* + * This happens on Linux 4.1. The rest will fail, too, so + * return now to reduce the noise. + */ + return; + } + + /* UC_STRICT_RESTORE_SS is set iff we came from 64-bit mode. */ + if (!!(ctx->uc_flags & UC_STRICT_RESTORE_SS) != was_64bit) { + printf("[FAIL]\tUC_STRICT_RESTORE_SS was wrong in signal %d\n", + sig); + nerrs++; + } + + if (is_valid_ss(*ssptr(ctx))) { + /* + * DOSEMU was written before 64-bit sigcontext had SS, and + * it tries to figure out the signal source SS by looking at + * the physical register. Make sure that keeps working. + */ + unsigned short hw_ss; + asm ("mov %%ss, %0" : "=rm" (hw_ss)); + if (hw_ss != *ssptr(ctx)) { + printf("[FAIL]\tHW SS didn't match saved SS\n"); + nerrs++; + } + } +#endif +} + /* * SIGUSR1 handler. 
Sets CS and SS as requested and points IP to the * int3 trampoline. Sets SP to a large known value so that we can see @@ -317,6 +447,8 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void) { ucontext_t *ctx = (ucontext_t*)ctx_void; + validate_signal_ss(sig, ctx); + memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); *csptr(ctx) = sig_cs; @@ -334,13 +466,16 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void) } /* - * Called after a successful sigreturn. Restores our state so that - * the original raise(SIGUSR1) returns. + * Called after a successful sigreturn (via int3) or from a failed + * sigreturn (directly by kernel). Restores our state so that the + * original raise(SIGUSR1) returns. */ static void sigtrap(int sig, siginfo_t *info, void *ctx_void) { ucontext_t *ctx = (ucontext_t*)ctx_void; + validate_signal_ss(sig, ctx); + sig_err = ctx->uc_mcontext.gregs[REG_ERR]; sig_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO]; @@ -358,41 +493,62 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void) memcpy(&resulting_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); memcpy(&ctx->uc_mcontext.gregs, &initial_regs, sizeof(gregset_t)); +#ifdef __x86_64__ + if (sig_corrupt_final_ss) { + if (ctx->uc_flags & UC_STRICT_RESTORE_SS) { + printf("[FAIL]\tUC_STRICT_RESTORE_SS was set inappropriately\n"); + nerrs++; + } else { + /* + * DOSEMU transitions from 32-bit to 64-bit mode by + * adjusting sigcontext, and it requires that this work + * even if the saved SS is bogus. + */ + printf("\tCorrupting SS on return to 64-bit mode\n"); + *ssptr(ctx) = 0; + } + } +#endif + sig_trapped = sig; } -/* - * Checks a given selector for its code bitness or returns -1 if it's not - * a usable code segment selector. - */ -int cs_bitness(unsigned short cs) +#ifdef __x86_64__ +/* Tests recovery if !UC_STRICT_RESTORE_SS */ +static void sigusr2(int sig, siginfo_t *info, void *ctx_void) { - uint32_t valid = 0, ar; - asm ("lar %[cs], %[ar]\n\t" - "jnz 1f\n\t" - "mov $1, %[valid]\n\t" - "1:" - : [ar] "=r" (ar), [valid] "+rm" (valid) - : [cs] "r" (cs)); + ucontext_t *ctx = (ucontext_t*)ctx_void; - if (!valid) - return -1; + if (!(ctx->uc_flags & UC_STRICT_RESTORE_SS)) { + printf("[FAIL]\traise(2) didn't set UC_STRICT_RESTORE_SS\n"); + nerrs++; + return; /* We can't do the rest. */ + } - bool db = (ar & (1 << 22)); - bool l = (ar & (1 << 21)); + ctx->uc_flags &= ~UC_STRICT_RESTORE_SS; + *ssptr(ctx) = 0; - if (!(ar & (1<<11))) - return -1; /* Not code. */ + /* Return. The kernel should recover without sending another signal. */ +} - if (l && !db) - return 64; - else if (!l && db) - return 32; - else if (!l && !db) - return 16; - else - return -1; /* Unknown bitness. */ +static int test_nonstrict_ss(void) +{ + clearhandler(SIGUSR1); + clearhandler(SIGTRAP); + clearhandler(SIGSEGV); + clearhandler(SIGILL); + sethandler(SIGUSR2, sigusr2, 0); + + nerrs = 0; + + printf("[RUN]\tClear UC_STRICT_RESTORE_SS and corrupt SS\n"); + raise(SIGUSR2); + if (!nerrs) + printf("[OK]\tIt worked\n"); + + return nerrs; } +#endif /* Finds a usable code segment of the requested bitness. */ int find_cs(int bitness) @@ -576,6 +732,12 @@ static int test_bad_iret(int cs_bits, unsigned short ss, int force_cs) errdesc, strsignal(sig_trapped)); return 0; } else { + /* + * This also implicitly tests UC_STRICT_RESTORE_SS: + * We check that these signals set UC_STRICT_RESTORE_SS and, + * if UC_STRICT_RESTORE_SS doesn't cause strict behavior, + * then we won't get SIGSEGV. 
+ */ printf("[FAIL]\tDid not get SIGSEGV\n"); return 1; } @@ -632,6 +794,14 @@ int main() GDT3(gdt_data16_idx)); } +#ifdef __x86_64__ + /* Nasty ABI case: check SS corruption handling. */ + sig_corrupt_final_ss = 1; + total_nerrs += test_valid_sigreturn(32, false, -1); + total_nerrs += test_valid_sigreturn(32, true, -1); + sig_corrupt_final_ss = 0; +#endif + /* * We're done testing valid sigreturn cases. Now we test states * for which sigreturn itself will succeed but the subsequent @@ -680,5 +850,9 @@ int main() if (gdt_npdata32_idx) test_bad_iret(32, GDT3(gdt_npdata32_idx), -1); +#ifdef __x86_64__ + total_nerrs += test_nonstrict_ss(); +#endif + return total_nerrs ? 1 : 0; } -- cgit v0.10.2 From 84aba677f009e20185aea322563389ad56e0ef7e Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Tue, 16 Feb 2016 09:43:19 +0100 Subject: x86/microcode: Remove unnecessary paravirt_enabled check Commit: a18a0f6850d4 ("x86, microcode: Don't initialize microcode code on paravirt") added a paravirt test in microcode_init(), primarily to avoid making mc_bp_resume()->load_ucode_ap()->check_loader_disabled_ap() calls because on 32-bit kernels this callchain ends up using __pa_nodebug() macro which is invalid for Xen PV guests. A subsequent commit: fbae4ba8c4a3 ("x86, microcode: Reload microcode on resume") eliminated this callchain thus making a18a0f6850d4 unnecessary. Signed-off-by: Boris Ostrovsky Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: david.vrabel@citrix.com Cc: konrad.wilk@oracle.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1455612202-14414-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index cea8552..ac360bf 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -623,7 +623,7 @@ int __init microcode_init(void) struct cpuinfo_x86 *c = &boot_cpu_data; int error; - if (paravirt_enabled() || dis_ucode_ldr) + if (dis_ucode_ldr) return -EINVAL; if (c->x86_vendor == X86_VENDOR_INTEL) -- cgit v0.10.2 From 9cc6f743c7724eb9abaf27904194c169db85dd31 Mon Sep 17 00:00:00 2001 From: Andrzej Hajda Date: Tue, 16 Feb 2016 09:43:20 +0100 Subject: x86/microcode: Use kmemdup() rather than duplicating its implementation The patch was generated using fixed coccinelle semantic patch scripts/coccinelle/api/memdup.cocci. Signed-off-by: Andrzej Hajda Signed-off-by: Borislav Petkov Cc: Bartlomiej Zolnierkiewicz Cc: Linus Torvalds Cc: Marek Szyprowski Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455612202-14414-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index f66cbfe..e397fc1 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -788,15 +788,13 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover) return -EINVAL; } - patch->data = kzalloc(patch_size, GFP_KERNEL); + patch->data = kmemdup(fw + SECTION_HDR_SIZE, patch_size, GFP_KERNEL); if (!patch->data) { pr_err("Patch data allocation failure.\n"); kfree(patch); return -EINVAL; } - /* All looks ok, copy patch... 
*/ - memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size); INIT_LIST_HEAD(&patch->plist); patch->patch_id = mc_hdr->patch_id; patch->equiv_cpu = proc_id; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index cb397947..cbb3cf0 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -210,13 +210,11 @@ save_microcode(struct mc_saved_data *mcs, mc_hdr = &mc->hdr; size = get_totalsize(mc_hdr); - saved_ptr[i] = kmalloc(size, GFP_KERNEL); + saved_ptr[i] = kmemdup(mc, size, GFP_KERNEL); if (!saved_ptr[i]) { ret = -ENOMEM; goto err; } - - memcpy(saved_ptr[i], mc, size); } /* -- cgit v0.10.2 From f1b92bb6b5a4e17b508f128b084fa00e0eda590c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 16 Feb 2016 09:43:21 +0100 Subject: x86/ftrace, x86/asm: Kill ftrace_caller_end label One of ftrace_caller_end and ftrace_return is redundant so unify them. Rename ftrace_return to ftrace_epilogue to mean that everything after that label represents, like an afterword, work which happens *after* the ftrace call, e.g., the function graph tracer for one. Steve wants this to rather mean "[a]n event which reflects meaningfully on a recently ended conflict or struggle." I can imagine that ftrace can be a struggle sometimes. Anyway, beef up the comment about the code contents and layout before ftrace_epilogue label. Signed-off-by: Borislav Petkov Reviewed-by: Steven Rostedt Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455612202-14414-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 29408d6..04f9641e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -697,9 +697,8 @@ static inline void tramp_free(void *tramp) { } #endif /* Defined as markers to the end of the ftrace default trampolines */ -extern void ftrace_caller_end(void); extern void ftrace_regs_caller_end(void); -extern void ftrace_return(void); +extern void ftrace_epilogue(void); extern void ftrace_caller_op_ptr(void); extern void ftrace_regs_caller_op_ptr(void); @@ -746,7 +745,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) op_offset = (unsigned long)ftrace_regs_caller_op_ptr; } else { start_offset = (unsigned long)ftrace_caller; - end_offset = (unsigned long)ftrace_caller_end; + end_offset = (unsigned long)ftrace_epilogue; op_offset = (unsigned long)ftrace_caller_op_ptr; } @@ -754,7 +753,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* * Allocate enough size to store the ftrace_caller code, - * the jmp to ftrace_return, as well as the address of + * the jmp to ftrace_epilogue, as well as the address of * the ftrace_ops this trampoline is used for. 
*/ trampoline = alloc_tramp(size + MCOUNT_INSN_SIZE + sizeof(void *)); @@ -772,8 +771,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = (unsigned long)trampoline + size; - /* The trampoline ends with a jmp to ftrace_return */ - jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_return); + /* The trampoline ends with a jmp to ftrace_epilogue */ + jmp = ftrace_jmp_replace(ip, (unsigned long)ftrace_epilogue); memcpy(trampoline + size, jmp, MCOUNT_INSN_SIZE); /* diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index 87e1762..ed48a9f 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -168,12 +168,14 @@ GLOBAL(ftrace_call) restore_mcount_regs /* - * The copied trampoline must call ftrace_return as it + * The copied trampoline must call ftrace_epilogue as it * still may need to call the function graph tracer. + * + * The code up to this label is copied into trampolines so + * think twice before adding any new code or changing the + * layout here. */ -GLOBAL(ftrace_caller_end) - -GLOBAL(ftrace_return) +GLOBAL(ftrace_epilogue) #ifdef CONFIG_FUNCTION_GRAPH_TRACER GLOBAL(ftrace_graph_call) @@ -244,14 +246,14 @@ GLOBAL(ftrace_regs_call) popfq /* - * As this jmp to ftrace_return can be a short jump + * As this jmp to ftrace_epilogue can be a short jump * it must not be copied into the trampoline. * The trampoline will add the code to jump * to the return. */ GLOBAL(ftrace_regs_caller_end) - jmp ftrace_return + jmp ftrace_epilogue END(ftrace_regs_caller) -- cgit v0.10.2 From 053080a9d1c8cf1950115ad92ce94242ebc5f25c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 16 Feb 2016 09:43:22 +0100 Subject: x86/msr: Document msr-index.h rule for addition In order to keep this file's size sensible and not cause too much unnecessary churn, make the rule explicit - similar to pci_ids.h - that only MSRs which are used in multiple compilation units, should get added to it. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: alex.williamson@redhat.com Cc: gleb@kernel.org Cc: joro@8bytes.org Cc: kvm@vger.kernel.org Cc: sherry.hurwitz@amd.com Cc: wei@redhat.com Link: http://lkml.kernel.org/r/1455612202-14414-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b05402e..984ab75 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1,7 +1,12 @@ #ifndef _ASM_X86_MSR_INDEX_H #define _ASM_X86_MSR_INDEX_H -/* CPU model specific register (MSR) numbers */ +/* + * CPU model specific register (MSR) numbers. + * + * Do not add new entries to this file unless the definitions are shared + * between multiple compilation units. + */ /* x86-64 specific MSRs */ #define MSR_EFER 0xc0000080 /* extended feature register */ -- cgit v0.10.2 From 3223d052b79eb9b620d170584c417d60a8bfd649 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 11 Feb 2016 11:59:38 +0900 Subject: sched/core: Remove dead statement in __schedule() Remove an unnecessary assignment of variable not used any more. ( This has no runtime effects as GCC is smart enough to optimize this out. 
) Signed-off-by: Byungchul Park Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455159578-17256-1-git-send-email-byungchul.park@lge.com [ Edited the changelog. ] Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7e548bd..87ca0be 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3346,7 +3346,6 @@ static void __sched notrace __schedule(bool preempt) trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next); /* unlocks the rq */ - cpu = cpu_of(rq); } else { lockdep_unpin_lock(&rq->lock); raw_spin_unlock_irq(&rq->lock); -- cgit v0.10.2 From adcfd23ead69965e3ac3e69f56451dab5e39157a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Feb 2016 14:15:25 -0800 Subject: selftests/x86: Fix some error messages in ptrace_syscall I had some obvious typos. Signed-off-by: Andy Lutomirski Cc: Al Viro Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Robert O'Callahan Cc: Shuah Khan Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e5e6772d4802986cf7df702e646fa24ac14f2204.1455142412.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/ptrace_syscall.c b/tools/testing/selftests/x86/ptrace_syscall.c index 5105b49..c6a0acc 100644 --- a/tools/testing/selftests/x86/ptrace_syscall.c +++ b/tools/testing/selftests/x86/ptrace_syscall.c @@ -187,7 +187,7 @@ static void test_ptrace_syscall_restart(void) printf("[RUN]\tSYSEMU\n"); if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0) - err(1, "PTRACE_SYSCALL"); + err(1, "PTRACE_SYSEMU"); wait_trap(chld); if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0) @@ -218,7 +218,7 @@ static void test_ptrace_syscall_restart(void) err(1, "PTRACE_SETREGS"); if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0) - err(1, "PTRACE_SYSCALL"); + err(1, "PTRACE_SYSEMU"); wait_trap(chld); if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0) @@ -250,7 +250,7 @@ static void test_ptrace_syscall_restart(void) err(1, "PTRACE_SETREGS"); if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0) - err(1, "PTRACE_SYSCALL"); + err(1, "PTRACE_SYSEMU"); wait_trap(chld); if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0) -- cgit v0.10.2
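[ Editorial aside, not part of the patch series: the selftest patch above and the one below both drive the ptrace syscall-stop machinery from userspace. As a minimal, hypothetical sketch of the tracer pattern they rely on (PTRACE_SYSEMU stops the tracee at the next syscall entry without executing the syscall, after which PTRACE_GETREGS exposes the pending syscall number), a standalone toy might look like the code below; the file name and the choice of SYS_getpid are illustrative assumptions, x86-64 Linux is assumed, and error strings name the ptrace request that actually failed, which is exactly what the typo fix above corrects. ]

/* toy_sysemu.c: hypothetical sketch, not selftest or kernel code. */
#include <sys/ptrace.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <err.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        pid_t chld = fork();
        if (chld < 0)
                err(1, "fork");

        if (chld == 0) {
                if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0)
                        err(1, "PTRACE_TRACEME");
                raise(SIGSTOP);                 /* hand control to the tracer */
                syscall(SYS_getpid);            /* emulated away by PTRACE_SYSEMU */
                _exit(0);
        }

        int status;

        /* Wait for the child's SIGSTOP. */
        if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status))
                err(1, "waitpid");

        /* Resume until the next syscall entry, suppressing the syscall itself. */
        if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
                err(1, "PTRACE_SYSEMU");        /* report the request that failed */
        if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status))
                err(1, "waitpid");

        struct user_regs_struct regs;
        if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
                err(1, "PTRACE_GETREGS");
#ifdef __x86_64__
        printf("child stopped at syscall %llu\n",
               (unsigned long long)regs.orig_rax);
#endif

        kill(chld, SIGKILL);
        waitpid(chld, &status, 0);
        return 0;
}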
From 403613432222549727e5d70cf44da8987653202a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Feb 2016 14:15:26 -0800 Subject: selftests/x86: Add a test for syscall restart under ptrace This catches a regression from the compat syscall rework. The 32-bit variant of this test currently fails. The issue is that, for a 32-bit tracer and a 32-bit tracee, GETREGS+SETREGS with no changes should be a no-op. It currently isn't a no-op if RAX indicates signal restart, because the high bits get cleared and the kernel loses track of the restart state. Reported-by: Robert O'Callahan Signed-off-by: Andy Lutomirski Cc: Al Viro Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c4040b40b5b4a37ed31375a69b683f753ec6788a.1455142412.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/ptrace_syscall.c b/tools/testing/selftests/x86/ptrace_syscall.c index c6a0acc..4214567 100644 --- a/tools/testing/selftests/x86/ptrace_syscall.c +++ b/tools/testing/selftests/x86/ptrace_syscall.c @@ -103,6 +103,17 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), err(1, "sigaction"); } +static void setsigign(int sig, int flags) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = (void *)SIG_IGN; + sa.sa_flags = flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + static void clearhandler(int sig) { struct sigaction sa; @@ -277,6 +288,119 @@ static void test_ptrace_syscall_restart(void) } } +static void test_restart_under_ptrace(void) +{ + printf("[RUN]\tkernel syscall restart under ptrace\n"); + pid_t chld = fork(); + if (chld < 0) + err(1, "fork"); + + if (chld == 0) { + if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0) + err(1, "PTRACE_TRACEME"); + + printf("\tChild will take a nap until signaled\n"); + setsigign(SIGUSR1, SA_RESTART); + raise(SIGSTOP); + + syscall(SYS_pause, 0, 0, 0, 0, 0, 0); + _exit(0); + } + + int status; + + /* Wait for SIGSTOP. */ + if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status)) + err(1, "waitpid"); + + struct user_regs_struct regs; + + printf("[RUN]\tSYSCALL\n"); + if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0) + err(1, "PTRACE_SYSCALL"); + wait_trap(chld); + + /* We should be stopped at pause(2) entry. */ + + if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0) + err(1, "PTRACE_GETREGS"); + + if (regs.user_syscall_nr != SYS_pause || + regs.user_arg0 != 0 || regs.user_arg1 != 0 || + regs.user_arg2 != 0 || regs.user_arg3 != 0 || + regs.user_arg4 != 0 || regs.user_arg5 != 0) { + printf("[FAIL]\tInitial args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5); + nerrs++; + } else { + printf("[OK]\tInitial nr and args are correct\n"); + } + + /* Interrupt it. */ + kill(chld, SIGUSR1); + + /* Advance. We should be stopped at exit. */ + printf("[RUN]\tSYSCALL\n"); + if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0) + err(1, "PTRACE_SYSCALL"); + wait_trap(chld); + + if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0) + err(1, "PTRACE_GETREGS"); + + if (regs.user_syscall_nr != SYS_pause || + regs.user_arg0 != 0 || regs.user_arg1 != 0 || + regs.user_arg2 != 0 || regs.user_arg3 != 0 || + regs.user_arg4 != 0 || regs.user_arg5 != 0) { + printf("[FAIL]\tArgs after SIGUSR1 are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5); + nerrs++; + } else { + printf("[OK]\tArgs after SIGUSR1 are correct (ax = %ld)\n", + (long)regs.user_ax); + } + + /* Poke the regs back in. This must not break anything. */ + if (ptrace(PTRACE_SETREGS, chld, 0, &regs) != 0) + err(1, "PTRACE_SETREGS"); + + /* Catch the (ignored) SIGUSR1. 
*/ + if (ptrace(PTRACE_CONT, chld, 0, 0) != 0) + err(1, "PTRACE_CONT"); + if (waitpid(chld, &status, 0) != chld) + err(1, "waitpid"); + if (!WIFSTOPPED(status)) { + printf("[FAIL]\tChild was stopped for SIGUSR1 (status = 0x%x)\n", status); + nerrs++; + } else { + printf("[OK]\tChild got SIGUSR1\n"); + } + + /* The next event should be pause(2) again. */ + printf("[RUN]\tStep again\n"); + if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0) + err(1, "PTRACE_SYSCALL"); + wait_trap(chld); + + /* We should be stopped at pause(2) entry. */ + + if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0) + err(1, "PTRACE_GETREGS"); + + if (regs.user_syscall_nr != SYS_pause || + regs.user_arg0 != 0 || regs.user_arg1 != 0 || + regs.user_arg2 != 0 || regs.user_arg3 != 0 || + regs.user_arg4 != 0 || regs.user_arg5 != 0) { + printf("[FAIL]\tpause did not restart (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5); + nerrs++; + } else { + printf("[OK]\tpause(2) restarted correctly\n"); + } + + /* Kill it. */ + kill(chld, SIGKILL); + if (waitpid(chld, &status, 0) != chld) + err(1, "waitpid"); +} + int main() { printf("[RUN]\tCheck int80 return regs\n"); @@ -290,5 +414,7 @@ int main() test_ptrace_syscall_restart(); + test_restart_under_ptrace(); + return 0; } -- cgit v0.10.2 From 4e79e182b419172e35936a47f098509092d69817 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 10 Feb 2016 14:15:27 -0800 Subject: x86/entry/compat: Keep TS_COMPAT set during signal delivery Signal delivery needs to know the sign of an interrupted syscall's return value in order to detect -ERESTART variants. Normally this works independently of bitness because syscalls internally return long. Under ptrace, however, this can break, and syscall_get_error is supposed to sign-extend regs->ax if needed. We were clearing TS_COMPAT too early, though, and this prevented sign extension, which subtly broke syscall restart under ptrace. Reported-by: Robert O'Callahan Signed-off-by: Andy Lutomirski Cc: Al Viro Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Cc: stable@vger.kernel.org # 4.3.x- Fixes: c5c46f59e4e7 ("x86/entry: Add new, comprehensible entry and exit handlers written in C") Link: http://lkml.kernel.org/r/cbce3cf545522f64eb37f5478cb59746230db3b5.1455142412.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index c6ab2eb..1a000f5 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -269,6 +269,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) /* Called with IRQs disabled. */ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) { + struct thread_info *ti = pt_regs_to_thread_info(regs); u32 cached_flags; if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) @@ -276,12 +277,22 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) lockdep_sys_exit(); - cached_flags = - READ_ONCE(pt_regs_to_thread_info(regs)->flags); + cached_flags = READ_ONCE(ti->flags); if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) exit_to_usermode_loop(regs, cached_flags); +#ifdef CONFIG_COMPAT + /* + * Compat syscalls set TS_COMPAT. Make sure we clear it before + * returning to user mode. 
We need to clear it *after* signal + * handling, because syscall restart has a fixup for compat + * syscalls. The fixup is exercised by the ptrace_syscall_32 + * selftest. + */ + ti->status &= ~TS_COMPAT; +#endif + user_enter(); } @@ -333,14 +344,6 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) syscall_slow_exit_work(regs, cached_flags); -#ifdef CONFIG_COMPAT - /* - * Compat syscalls set TS_COMPAT. Make sure we clear it before - * returning to user mode. - */ - ti->status &= ~TS_COMPAT; -#endif - local_irq_disable(); prepare_exit_to_usermode(regs); } -- cgit v0.10.2 From af5d3aabc04a4b7732b1d3404feebadfe5ae9362 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:07 +0100 Subject: perf/x86: Move perf_event_intel_bts.c ........ => x86/events/intel/bts.c Start moving the Intel bits. Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 7d1ecff..bcd1e21 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif +obj-$(CONFIG_CPU_SUP_INTEL) += intel/bts.o diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c new file mode 100644 index 0000000..2bd4833 --- /dev/null +++ b/arch/x86/events/intel/bts.c @@ -0,0 +1,544 @@ +/* + * BTS PMU driver for perf + * Copyright (c) 2013-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#undef DEBUG + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "../../kernel/cpu/perf_event.h" + +struct bts_ctx { + struct perf_output_handle handle; + struct debug_store ds_back; + int started; +}; + +static DEFINE_PER_CPU(struct bts_ctx, bts_ctx); + +#define BTS_RECORD_SIZE 24 +#define BTS_SAFETY_MARGIN 4080 + +struct bts_phys { + struct page *page; + unsigned long size; + unsigned long offset; + unsigned long displacement; +}; + +struct bts_buffer { + size_t real_size; /* multiple of BTS_RECORD_SIZE */ + unsigned int nr_pages; + unsigned int nr_bufs; + unsigned int cur_buf; + bool snapshot; + local_t data_size; + local_t lost; + local_t head; + unsigned long end; + void **data_pages; + struct bts_phys buf[0]; +}; + +struct pmu bts_pmu; + +static size_t buf_size(struct page *page) +{ + return 1 << (PAGE_SHIFT + page_private(page)); +} + +static void * +bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite) +{ + struct bts_buffer *buf; + struct page *page; + int node = (cpu == -1) ? 
cpu : cpu_to_node(cpu); + unsigned long offset; + size_t size = nr_pages << PAGE_SHIFT; + int pg, nbuf, pad; + + /* count all the high order buffers */ + for (pg = 0, nbuf = 0; pg < nr_pages;) { + page = virt_to_page(pages[pg]); + if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1)) + return NULL; + pg += 1 << page_private(page); + nbuf++; + } + + /* + * to avoid interrupts in overwrite mode, only allow one physical + */ + if (overwrite && nbuf > 1) + return NULL; + + buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node); + if (!buf) + return NULL; + + buf->nr_pages = nr_pages; + buf->nr_bufs = nbuf; + buf->snapshot = overwrite; + buf->data_pages = pages; + buf->real_size = size - size % BTS_RECORD_SIZE; + + for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) { + unsigned int __nr_pages; + + page = virt_to_page(pages[pg]); + __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1; + buf->buf[nbuf].page = page; + buf->buf[nbuf].offset = offset; + buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0); + buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement; + pad = buf->buf[nbuf].size % BTS_RECORD_SIZE; + buf->buf[nbuf].size -= pad; + + pg += __nr_pages; + offset += __nr_pages << PAGE_SHIFT; + } + + return buf; +} + +static void bts_buffer_free_aux(void *data) +{ + kfree(data); +} + +static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx) +{ + return buf->buf[idx].offset + buf->buf[idx].displacement; +} + +static void +bts_config_buffer(struct bts_buffer *buf) +{ + int cpu = raw_smp_processor_id(); + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + struct bts_phys *phys = &buf->buf[buf->cur_buf]; + unsigned long index, thresh = 0, end = phys->size; + struct page *page = phys->page; + + index = local_read(&buf->head); + + if (!buf->snapshot) { + if (buf->end < phys->offset + buf_size(page)) + end = buf->end - phys->offset - phys->displacement; + + index -= phys->offset + phys->displacement; + + if (end - index > BTS_SAFETY_MARGIN) + thresh = end - BTS_SAFETY_MARGIN; + else if (end - index > BTS_RECORD_SIZE) + thresh = end - BTS_RECORD_SIZE; + else + thresh = end; + } + + ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement; + ds->bts_index = ds->bts_buffer_base + index; + ds->bts_absolute_maximum = ds->bts_buffer_base + end; + ds->bts_interrupt_threshold = !buf->snapshot + ? 
ds->bts_buffer_base + thresh + : ds->bts_absolute_maximum + BTS_RECORD_SIZE; +} + +static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head) +{ + unsigned long index = head - phys->offset; + + memset(page_address(phys->page) + index, 0, phys->size - index); +} + +static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts) +{ + if (buf->snapshot) + return false; + + if (local_read(&buf->data_size) >= bts->handle.size || + bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE) + return true; + + return false; +} + +static void bts_update(struct bts_ctx *bts) +{ + int cpu = raw_smp_processor_id(); + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + struct bts_buffer *buf = perf_get_aux(&bts->handle); + unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head; + + if (!buf) + return; + + head = index + bts_buffer_offset(buf, buf->cur_buf); + old = local_xchg(&buf->head, head); + + if (!buf->snapshot) { + if (old == head) + return; + + if (ds->bts_index >= ds->bts_absolute_maximum) + local_inc(&buf->lost); + + /* + * old and head are always in the same physical buffer, so we + * can subtract them to get the data size. + */ + local_add(head - old, &buf->data_size); + } else { + local_set(&buf->data_size, head); + } +} + +static void __bts_event_start(struct perf_event *event) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_buffer *buf = perf_get_aux(&bts->handle); + u64 config = 0; + + if (!buf || bts_buffer_is_full(buf, bts)) + return; + + event->hw.itrace_started = 1; + event->hw.state = 0; + + if (!buf->snapshot) + config |= ARCH_PERFMON_EVENTSEL_INT; + if (!event->attr.exclude_kernel) + config |= ARCH_PERFMON_EVENTSEL_OS; + if (!event->attr.exclude_user) + config |= ARCH_PERFMON_EVENTSEL_USR; + + bts_config_buffer(buf); + + /* + * local barrier to make sure that ds configuration made it + * before we enable BTS + */ + wmb(); + + intel_pmu_enable_bts(config); +} + +static void bts_event_start(struct perf_event *event, int flags) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + __bts_event_start(event); + + /* PMI handler: this counter is running and likely generating PMIs */ + ACCESS_ONCE(bts->started) = 1; +} + +static void __bts_event_stop(struct perf_event *event) +{ + /* + * No extra synchronization is mandated by the documentation to have + * BTS data stores globally visible. 
+ */ + intel_pmu_disable_bts(); + + if (event->hw.state & PERF_HES_STOPPED) + return; + + ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED; +} + +static void bts_event_stop(struct perf_event *event, int flags) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + /* PMI handler: don't restart this counter */ + ACCESS_ONCE(bts->started) = 0; + + __bts_event_stop(event); + + if (flags & PERF_EF_UPDATE) + bts_update(bts); +} + +void intel_bts_enable_local(void) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + if (bts->handle.event && bts->started) + __bts_event_start(bts->handle.event); +} + +void intel_bts_disable_local(void) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + if (bts->handle.event) + __bts_event_stop(bts->handle.event); +} + +static int +bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) +{ + unsigned long head, space, next_space, pad, gap, skip, wakeup; + unsigned int next_buf; + struct bts_phys *phys, *next_phys; + int ret; + + if (buf->snapshot) + return 0; + + head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1); + if (WARN_ON_ONCE(head != local_read(&buf->head))) + return -EINVAL; + + phys = &buf->buf[buf->cur_buf]; + space = phys->offset + phys->displacement + phys->size - head; + pad = space; + if (space > handle->size) { + space = handle->size; + space -= space % BTS_RECORD_SIZE; + } + if (space <= BTS_SAFETY_MARGIN) { + /* See if next phys buffer has more space */ + next_buf = buf->cur_buf + 1; + if (next_buf >= buf->nr_bufs) + next_buf = 0; + next_phys = &buf->buf[next_buf]; + gap = buf_size(phys->page) - phys->displacement - phys->size + + next_phys->displacement; + skip = pad + gap; + if (handle->size >= skip) { + next_space = next_phys->size; + if (next_space + skip > handle->size) { + next_space = handle->size - skip; + next_space -= next_space % BTS_RECORD_SIZE; + } + if (next_space > space || !space) { + if (pad) + bts_buffer_pad_out(phys, head); + ret = perf_aux_output_skip(handle, skip); + if (ret) + return ret; + /* Advance to next phys buffer */ + phys = next_phys; + space = next_space; + head = phys->offset + phys->displacement; + /* + * After this, cur_buf and head won't match ds + * anymore, so we must not be racing with + * bts_update(). 
+ */ + buf->cur_buf = next_buf; + local_set(&buf->head, head); + } + } + } + + /* Don't go far beyond wakeup watermark */ + wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup - + handle->head; + if (space > wakeup) { + space = wakeup; + space -= space % BTS_RECORD_SIZE; + } + + buf->end = head + space; + + /* + * If we have no space, the lost notification would have been sent when + * we hit absolute_maximum - see bts_update() + */ + if (!space) + return -ENOSPC; + + return 0; +} + +int intel_bts_interrupt(void) +{ + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct perf_event *event = bts->handle.event; + struct bts_buffer *buf; + s64 old_head; + int err; + + if (!event || !bts->started) + return 0; + + buf = perf_get_aux(&bts->handle); + /* + * Skip snapshot counters: they don't use the interrupt, but + * there's no other way of telling, because the pointer will + * keep moving + */ + if (!buf || buf->snapshot) + return 0; + + old_head = local_read(&buf->head); + bts_update(bts); + + /* no new data */ + if (old_head == local_read(&buf->head)) + return 0; + + perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), + !!local_xchg(&buf->lost, 0)); + + buf = perf_aux_output_begin(&bts->handle, event); + if (!buf) + return 1; + + err = bts_buffer_reset(buf, &bts->handle); + if (err) + perf_aux_output_end(&bts->handle, 0, false); + + return 1; +} + +static void bts_event_del(struct perf_event *event, int mode) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct bts_buffer *buf = perf_get_aux(&bts->handle); + + bts_event_stop(event, PERF_EF_UPDATE); + + if (buf) { + if (buf->snapshot) + bts->handle.head = + local_xchg(&buf->data_size, + buf->nr_pages << PAGE_SHIFT); + perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), + !!local_xchg(&buf->lost, 0)); + } + + cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; + cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base; + cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum; + cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold; +} + +static int bts_event_add(struct perf_event *event, int mode) +{ + struct bts_buffer *buf; + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int ret = -EBUSY; + + event->hw.state = PERF_HES_STOPPED; + + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + return -EBUSY; + + if (bts->handle.event) + return -EBUSY; + + buf = perf_aux_output_begin(&bts->handle, event); + if (!buf) + return -EINVAL; + + ret = bts_buffer_reset(buf, &bts->handle); + if (ret) { + perf_aux_output_end(&bts->handle, 0, false); + return ret; + } + + bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; + bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; + bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; + + if (mode & PERF_EF_START) { + bts_event_start(event, 0); + if (hwc->state & PERF_HES_STOPPED) { + bts_event_del(event, 0); + return -EBUSY; + } + } + + return 0; +} + +static void bts_event_destroy(struct perf_event *event) +{ + x86_release_hardware(); + x86_del_exclusive(x86_lbr_exclusive_bts); +} + +static int bts_event_init(struct perf_event *event) +{ + int ret; + + if (event->attr.type != bts_pmu.type) + return -ENOENT; + + if (x86_add_exclusive(x86_lbr_exclusive_bts)) + return -EBUSY; + + /* + * BTS leaks kernel addresses even when CPL0 
tracing is + * disabled, so disallow intel_bts driver for unprivileged + * users on paranoid systems since it provides trace data + * to the user in a zero-copy fashion. + * + * Note that the default paranoia setting permits unprivileged + * users to profile the kernel. + */ + if (event->attr.exclude_kernel && perf_paranoid_kernel() && + !capable(CAP_SYS_ADMIN)) + return -EACCES; + + ret = x86_reserve_hardware(); + if (ret) { + x86_del_exclusive(x86_lbr_exclusive_bts); + return ret; + } + + event->destroy = bts_event_destroy; + + return 0; +} + +static void bts_event_read(struct perf_event *event) +{ +} + +static __init int bts_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) + return -ENODEV; + + bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE; + bts_pmu.task_ctx_nr = perf_sw_context; + bts_pmu.event_init = bts_event_init; + bts_pmu.add = bts_event_add; + bts_pmu.del = bts_event_del; + bts_pmu.start = bts_event_start; + bts_pmu.stop = bts_event_stop; + bts_pmu.read = bts_event_read; + bts_pmu.setup_aux = bts_buffer_setup_aux; + bts_pmu.free_aux = bts_buffer_free_aux; + + return perf_pmu_register(&bts_pmu, "intel_bts", -1); +} +arch_initcall(bts_init); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 7edbeb9..e28f931 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -34,7 +34,7 @@ ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_cstate.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c deleted file mode 100644 index 2cad71d..0000000 --- a/arch/x86/kernel/cpu/perf_event_intel_bts.c +++ /dev/null @@ -1,544 +0,0 @@ -/* - * BTS PMU driver for perf - * Copyright (c) 2013-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. 
- */ - -#undef DEBUG - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "perf_event.h" - -struct bts_ctx { - struct perf_output_handle handle; - struct debug_store ds_back; - int started; -}; - -static DEFINE_PER_CPU(struct bts_ctx, bts_ctx); - -#define BTS_RECORD_SIZE 24 -#define BTS_SAFETY_MARGIN 4080 - -struct bts_phys { - struct page *page; - unsigned long size; - unsigned long offset; - unsigned long displacement; -}; - -struct bts_buffer { - size_t real_size; /* multiple of BTS_RECORD_SIZE */ - unsigned int nr_pages; - unsigned int nr_bufs; - unsigned int cur_buf; - bool snapshot; - local_t data_size; - local_t lost; - local_t head; - unsigned long end; - void **data_pages; - struct bts_phys buf[0]; -}; - -struct pmu bts_pmu; - -static size_t buf_size(struct page *page) -{ - return 1 << (PAGE_SHIFT + page_private(page)); -} - -static void * -bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite) -{ - struct bts_buffer *buf; - struct page *page; - int node = (cpu == -1) ? cpu : cpu_to_node(cpu); - unsigned long offset; - size_t size = nr_pages << PAGE_SHIFT; - int pg, nbuf, pad; - - /* count all the high order buffers */ - for (pg = 0, nbuf = 0; pg < nr_pages;) { - page = virt_to_page(pages[pg]); - if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1)) - return NULL; - pg += 1 << page_private(page); - nbuf++; - } - - /* - * to avoid interrupts in overwrite mode, only allow one physical - */ - if (overwrite && nbuf > 1) - return NULL; - - buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node); - if (!buf) - return NULL; - - buf->nr_pages = nr_pages; - buf->nr_bufs = nbuf; - buf->snapshot = overwrite; - buf->data_pages = pages; - buf->real_size = size - size % BTS_RECORD_SIZE; - - for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) { - unsigned int __nr_pages; - - page = virt_to_page(pages[pg]); - __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1; - buf->buf[nbuf].page = page; - buf->buf[nbuf].offset = offset; - buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0); - buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement; - pad = buf->buf[nbuf].size % BTS_RECORD_SIZE; - buf->buf[nbuf].size -= pad; - - pg += __nr_pages; - offset += __nr_pages << PAGE_SHIFT; - } - - return buf; -} - -static void bts_buffer_free_aux(void *data) -{ - kfree(data); -} - -static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx) -{ - return buf->buf[idx].offset + buf->buf[idx].displacement; -} - -static void -bts_config_buffer(struct bts_buffer *buf) -{ - int cpu = raw_smp_processor_id(); - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - struct bts_phys *phys = &buf->buf[buf->cur_buf]; - unsigned long index, thresh = 0, end = phys->size; - struct page *page = phys->page; - - index = local_read(&buf->head); - - if (!buf->snapshot) { - if (buf->end < phys->offset + buf_size(page)) - end = buf->end - phys->offset - phys->displacement; - - index -= phys->offset + phys->displacement; - - if (end - index > BTS_SAFETY_MARGIN) - thresh = end - BTS_SAFETY_MARGIN; - else if (end - index > BTS_RECORD_SIZE) - thresh = end - BTS_RECORD_SIZE; - else - thresh = end; - } - - ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement; - ds->bts_index = ds->bts_buffer_base + index; - ds->bts_absolute_maximum = ds->bts_buffer_base + end; - ds->bts_interrupt_threshold = !buf->snapshot - ? 
ds->bts_buffer_base + thresh - : ds->bts_absolute_maximum + BTS_RECORD_SIZE; -} - -static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head) -{ - unsigned long index = head - phys->offset; - - memset(page_address(phys->page) + index, 0, phys->size - index); -} - -static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts) -{ - if (buf->snapshot) - return false; - - if (local_read(&buf->data_size) >= bts->handle.size || - bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE) - return true; - - return false; -} - -static void bts_update(struct bts_ctx *bts) -{ - int cpu = raw_smp_processor_id(); - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - struct bts_buffer *buf = perf_get_aux(&bts->handle); - unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head; - - if (!buf) - return; - - head = index + bts_buffer_offset(buf, buf->cur_buf); - old = local_xchg(&buf->head, head); - - if (!buf->snapshot) { - if (old == head) - return; - - if (ds->bts_index >= ds->bts_absolute_maximum) - local_inc(&buf->lost); - - /* - * old and head are always in the same physical buffer, so we - * can subtract them to get the data size. - */ - local_add(head - old, &buf->data_size); - } else { - local_set(&buf->data_size, head); - } -} - -static void __bts_event_start(struct perf_event *event) -{ - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct bts_buffer *buf = perf_get_aux(&bts->handle); - u64 config = 0; - - if (!buf || bts_buffer_is_full(buf, bts)) - return; - - event->hw.itrace_started = 1; - event->hw.state = 0; - - if (!buf->snapshot) - config |= ARCH_PERFMON_EVENTSEL_INT; - if (!event->attr.exclude_kernel) - config |= ARCH_PERFMON_EVENTSEL_OS; - if (!event->attr.exclude_user) - config |= ARCH_PERFMON_EVENTSEL_USR; - - bts_config_buffer(buf); - - /* - * local barrier to make sure that ds configuration made it - * before we enable BTS - */ - wmb(); - - intel_pmu_enable_bts(config); -} - -static void bts_event_start(struct perf_event *event, int flags) -{ - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - - __bts_event_start(event); - - /* PMI handler: this counter is running and likely generating PMIs */ - ACCESS_ONCE(bts->started) = 1; -} - -static void __bts_event_stop(struct perf_event *event) -{ - /* - * No extra synchronization is mandated by the documentation to have - * BTS data stores globally visible. 
- */ - intel_pmu_disable_bts(); - - if (event->hw.state & PERF_HES_STOPPED) - return; - - ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED; -} - -static void bts_event_stop(struct perf_event *event, int flags) -{ - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - - /* PMI handler: don't restart this counter */ - ACCESS_ONCE(bts->started) = 0; - - __bts_event_stop(event); - - if (flags & PERF_EF_UPDATE) - bts_update(bts); -} - -void intel_bts_enable_local(void) -{ - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - - if (bts->handle.event && bts->started) - __bts_event_start(bts->handle.event); -} - -void intel_bts_disable_local(void) -{ - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - - if (bts->handle.event) - __bts_event_stop(bts->handle.event); -} - -static int -bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) -{ - unsigned long head, space, next_space, pad, gap, skip, wakeup; - unsigned int next_buf; - struct bts_phys *phys, *next_phys; - int ret; - - if (buf->snapshot) - return 0; - - head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1); - if (WARN_ON_ONCE(head != local_read(&buf->head))) - return -EINVAL; - - phys = &buf->buf[buf->cur_buf]; - space = phys->offset + phys->displacement + phys->size - head; - pad = space; - if (space > handle->size) { - space = handle->size; - space -= space % BTS_RECORD_SIZE; - } - if (space <= BTS_SAFETY_MARGIN) { - /* See if next phys buffer has more space */ - next_buf = buf->cur_buf + 1; - if (next_buf >= buf->nr_bufs) - next_buf = 0; - next_phys = &buf->buf[next_buf]; - gap = buf_size(phys->page) - phys->displacement - phys->size + - next_phys->displacement; - skip = pad + gap; - if (handle->size >= skip) { - next_space = next_phys->size; - if (next_space + skip > handle->size) { - next_space = handle->size - skip; - next_space -= next_space % BTS_RECORD_SIZE; - } - if (next_space > space || !space) { - if (pad) - bts_buffer_pad_out(phys, head); - ret = perf_aux_output_skip(handle, skip); - if (ret) - return ret; - /* Advance to next phys buffer */ - phys = next_phys; - space = next_space; - head = phys->offset + phys->displacement; - /* - * After this, cur_buf and head won't match ds - * anymore, so we must not be racing with - * bts_update(). 
- */ - buf->cur_buf = next_buf; - local_set(&buf->head, head); - } - } - } - - /* Don't go far beyond wakeup watermark */ - wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup - - handle->head; - if (space > wakeup) { - space = wakeup; - space -= space % BTS_RECORD_SIZE; - } - - buf->end = head + space; - - /* - * If we have no space, the lost notification would have been sent when - * we hit absolute_maximum - see bts_update() - */ - if (!space) - return -ENOSPC; - - return 0; -} - -int intel_bts_interrupt(void) -{ - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct perf_event *event = bts->handle.event; - struct bts_buffer *buf; - s64 old_head; - int err; - - if (!event || !bts->started) - return 0; - - buf = perf_get_aux(&bts->handle); - /* - * Skip snapshot counters: they don't use the interrupt, but - * there's no other way of telling, because the pointer will - * keep moving - */ - if (!buf || buf->snapshot) - return 0; - - old_head = local_read(&buf->head); - bts_update(bts); - - /* no new data */ - if (old_head == local_read(&buf->head)) - return 0; - - perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), - !!local_xchg(&buf->lost, 0)); - - buf = perf_aux_output_begin(&bts->handle, event); - if (!buf) - return 1; - - err = bts_buffer_reset(buf, &bts->handle); - if (err) - perf_aux_output_end(&bts->handle, 0, false); - - return 1; -} - -static void bts_event_del(struct perf_event *event, int mode) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct bts_buffer *buf = perf_get_aux(&bts->handle); - - bts_event_stop(event, PERF_EF_UPDATE); - - if (buf) { - if (buf->snapshot) - bts->handle.head = - local_xchg(&buf->data_size, - buf->nr_pages << PAGE_SHIFT); - perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), - !!local_xchg(&buf->lost, 0)); - } - - cpuc->ds->bts_index = bts->ds_back.bts_buffer_base; - cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base; - cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum; - cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold; -} - -static int bts_event_add(struct perf_event *event, int mode) -{ - struct bts_buffer *buf; - struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - int ret = -EBUSY; - - event->hw.state = PERF_HES_STOPPED; - - if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) - return -EBUSY; - - if (bts->handle.event) - return -EBUSY; - - buf = perf_aux_output_begin(&bts->handle, event); - if (!buf) - return -EINVAL; - - ret = bts_buffer_reset(buf, &bts->handle); - if (ret) { - perf_aux_output_end(&bts->handle, 0, false); - return ret; - } - - bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base; - bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum; - bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold; - - if (mode & PERF_EF_START) { - bts_event_start(event, 0); - if (hwc->state & PERF_HES_STOPPED) { - bts_event_del(event, 0); - return -EBUSY; - } - } - - return 0; -} - -static void bts_event_destroy(struct perf_event *event) -{ - x86_release_hardware(); - x86_del_exclusive(x86_lbr_exclusive_bts); -} - -static int bts_event_init(struct perf_event *event) -{ - int ret; - - if (event->attr.type != bts_pmu.type) - return -ENOENT; - - if (x86_add_exclusive(x86_lbr_exclusive_bts)) - return -EBUSY; - - /* - * BTS leaks kernel addresses even when CPL0 
tracing is - * disabled, so disallow intel_bts driver for unprivileged - * users on paranoid systems since it provides trace data - * to the user in a zero-copy fashion. - * - * Note that the default paranoia setting permits unprivileged - * users to profile the kernel. - */ - if (event->attr.exclude_kernel && perf_paranoid_kernel() && - !capable(CAP_SYS_ADMIN)) - return -EACCES; - - ret = x86_reserve_hardware(); - if (ret) { - x86_del_exclusive(x86_lbr_exclusive_bts); - return ret; - } - - event->destroy = bts_event_destroy; - - return 0; -} - -static void bts_event_read(struct perf_event *event) -{ -} - -static __init int bts_init(void) -{ - if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) - return -ENODEV; - - bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE; - bts_pmu.task_ctx_nr = perf_sw_context; - bts_pmu.event_init = bts_event_init; - bts_pmu.add = bts_event_add; - bts_pmu.del = bts_event_del; - bts_pmu.start = bts_event_start; - bts_pmu.stop = bts_event_stop; - bts_pmu.read = bts_event_read; - bts_pmu.setup_aux = bts_buffer_setup_aux; - bts_pmu.free_aux = bts_buffer_free_aux; - - return perf_pmu_register(&bts_pmu, "intel_bts", -1); -} -arch_initcall(bts_init); -- cgit v0.10.2 From e1069839dd6893d2135b2fc4d96e5d03d73c2c3d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:08 +0100 Subject: perf/x86: Move perf_event_intel.c ............ => x86/events/intel/core.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index bcd1e21..834e9ae 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -5,4 +5,4 @@ obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif -obj-$(CONFIG_CPU_SUP_INTEL) += intel/bts.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c new file mode 100644 index 0000000..1edf301 --- /dev/null +++ b/arch/x86/events/intel/core.c @@ -0,0 +1,3773 @@ +/* + * Per core/cpu state + * + * Used to coordinate shared registers between HT threads or + * among events on a single PMU. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "../../kernel/cpu/perf_event.h" + +/* + * Intel PerfMon, used on Core and later. 
+ */ +static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, + [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ +}; + +static struct event_constraint intel_core_event_constraints[] __read_mostly = +{ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_core2_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ + INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ + INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ + INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ + INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */ + INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ + INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = +{ + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), + EVENT_EXTRA_END +}; + +static struct event_constraint intel_westmere_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ + INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ + INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_snb_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + 
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ + INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ + INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ + INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ + INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ + INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_ivb_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */ + INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */ + INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */ + INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ + INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ + + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + + EVENT_CONSTRAINT_END +}; + +static struct extra_reg intel_westmere_extra_regs[] __read_mostly = +{ + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), + EVENT_EXTRA_END +}; + +static struct event_constraint intel_v1_event_constraints[] __read_mostly = +{ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_gen_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_slm_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_skl_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), 
/* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ + EVENT_CONSTRAINT_END +}; + +static struct extra_reg intel_knl_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x01b7, + MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, + MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1), + EVENT_EXTRA_END +}; + +static struct extra_reg intel_snb_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + EVENT_EXTRA_END +}; + +static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + EVENT_EXTRA_END +}; + +static struct extra_reg intel_skl_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + /* + * Note the low 8 bits eventsel code is not a continuous field, containing + * some #GPing bits. These are masked out. + */ + INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE), + EVENT_EXTRA_END +}; + +EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); +EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); +EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); + +struct attribute *nhm_events_attrs[] = { + EVENT_PTR(mem_ld_nhm), + NULL, +}; + +struct attribute *snb_events_attrs[] = { + EVENT_PTR(mem_ld_snb), + EVENT_PTR(mem_st_snb), + NULL, +}; + +static struct event_constraint intel_hsw_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ + INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ + /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), + /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), + /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ + INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), + + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_bdw_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ + INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */ + EVENT_CONSTRAINT_END +}; + +static u64 
intel_pmu_event_map(int hw_event) +{ + return intel_perfmon_event_map[hw_event]; +} + +/* + * Notes on the events: + * - data reads do not include code reads (comparable to earlier tables) + * - data counts include speculative execution (except L1 write, dtlb, bpu) + * - remote node access includes remote memory, remote cache, remote mmio. + * - prefetches are not included in the counts. + * - icache miss does not include decoded icache + */ + +#define SKL_DEMAND_DATA_RD BIT_ULL(0) +#define SKL_DEMAND_RFO BIT_ULL(1) +#define SKL_ANY_RESPONSE BIT_ULL(16) +#define SKL_SUPPLIER_NONE BIT_ULL(17) +#define SKL_L3_MISS_LOCAL_DRAM BIT_ULL(26) +#define SKL_L3_MISS_REMOTE_HOP0_DRAM BIT_ULL(27) +#define SKL_L3_MISS_REMOTE_HOP1_DRAM BIT_ULL(28) +#define SKL_L3_MISS_REMOTE_HOP2P_DRAM BIT_ULL(29) +#define SKL_L3_MISS (SKL_L3_MISS_LOCAL_DRAM| \ + SKL_L3_MISS_REMOTE_HOP0_DRAM| \ + SKL_L3_MISS_REMOTE_HOP1_DRAM| \ + SKL_L3_MISS_REMOTE_HOP2P_DRAM) +#define SKL_SPL_HIT BIT_ULL(30) +#define SKL_SNOOP_NONE BIT_ULL(31) +#define SKL_SNOOP_NOT_NEEDED BIT_ULL(32) +#define SKL_SNOOP_MISS BIT_ULL(33) +#define SKL_SNOOP_HIT_NO_FWD BIT_ULL(34) +#define SKL_SNOOP_HIT_WITH_FWD BIT_ULL(35) +#define SKL_SNOOP_HITM BIT_ULL(36) +#define SKL_SNOOP_NON_DRAM BIT_ULL(37) +#define SKL_ANY_SNOOP (SKL_SPL_HIT|SKL_SNOOP_NONE| \ + SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ + SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ + SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM) +#define SKL_DEMAND_READ SKL_DEMAND_DATA_RD +#define SKL_SNOOP_DRAM (SKL_SNOOP_NONE| \ + SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ + SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ + SKL_SNOOP_HITM|SKL_SPL_HIT) +#define SKL_DEMAND_WRITE SKL_DEMAND_RFO +#define SKL_LLC_ACCESS SKL_ANY_RESPONSE +#define SKL_L3_MISS_REMOTE (SKL_L3_MISS_REMOTE_HOP0_DRAM| \ + SKL_L3_MISS_REMOTE_HOP1_DRAM| \ + SKL_L3_MISS_REMOTE_HOP2P_DRAM) + +static __initconst const u64 skl_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x283, /* ICACHE_64B.MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ 
C(RESULT_ACCESS) ] = 0x2085, /* ITLB_MISSES.STLB_HIT */ + [ C(RESULT_MISS) ] = 0xe85, /* ITLB_MISSES.WALK_COMPLETED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + +static __initconst const u64 skl_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| + SKL_LLC_ACCESS|SKL_ANY_SNOOP, + [ C(RESULT_MISS) ] = SKL_DEMAND_READ| + SKL_L3_MISS|SKL_ANY_SNOOP| + SKL_SUPPLIER_NONE, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| + SKL_LLC_ACCESS|SKL_ANY_SNOOP, + [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE| + SKL_L3_MISS|SKL_ANY_SNOOP| + SKL_SUPPLIER_NONE, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| + SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, + [ C(RESULT_MISS) ] = SKL_DEMAND_READ| + SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| + SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, + [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE| + SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + +#define SNB_DMND_DATA_RD (1ULL << 0) +#define SNB_DMND_RFO (1ULL << 1) +#define SNB_DMND_IFETCH (1ULL << 2) +#define SNB_DMND_WB (1ULL << 3) +#define SNB_PF_DATA_RD (1ULL << 4) +#define SNB_PF_RFO (1ULL << 5) +#define SNB_PF_IFETCH (1ULL << 6) +#define SNB_LLC_DATA_RD (1ULL << 7) +#define SNB_LLC_RFO (1ULL << 8) +#define SNB_LLC_IFETCH (1ULL << 9) +#define SNB_BUS_LOCKS (1ULL << 10) +#define SNB_STRM_ST (1ULL << 11) +#define SNB_OTHER (1ULL << 15) +#define SNB_RESP_ANY (1ULL << 16) +#define SNB_NO_SUPP (1ULL << 17) +#define SNB_LLC_HITM (1ULL << 18) +#define SNB_LLC_HITE (1ULL << 19) +#define SNB_LLC_HITS (1ULL << 20) +#define SNB_LLC_HITF (1ULL << 21) +#define SNB_LOCAL (1ULL << 22) +#define SNB_REMOTE (0xffULL << 23) +#define SNB_SNP_NONE (1ULL << 31) +#define SNB_SNP_NOT_NEEDED (1ULL << 32) +#define SNB_SNP_MISS (1ULL << 33) +#define SNB_NO_FWD (1ULL << 34) +#define SNB_SNP_FWD (1ULL << 35) +#define SNB_HITM (1ULL << 36) +#define SNB_NON_DRAM (1ULL << 37) + +#define SNB_DMND_READ (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD) +#define SNB_DMND_WRITE (SNB_DMND_RFO|SNB_LLC_RFO) +#define SNB_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) + +#define SNB_SNP_ANY (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \ + SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \ + SNB_HITM) + +#define SNB_DRAM_ANY (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY) +#define SNB_DRAM_REMOTE (SNB_REMOTE|SNB_SNP_ANY) + +#define 
SNB_L3_ACCESS SNB_RESP_ANY +#define SNB_L3_MISS (SNB_DRAM_ANY|SNB_NON_DRAM) + +static __initconst const u64 snb_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_L3_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_L3_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_L3_MISS, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_DRAM_REMOTE, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE, + }, + }, +}; + +static __initconst const u64 snb_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */ + [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) 
] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + +}; + +/* + * Notes on the events: + * - data reads do not include code reads (comparable to earlier tables) + * - data counts include speculative execution (except L1 write, dtlb, bpu) + * - remote node access includes remote memory, remote cache, remote mmio. + * - prefetches are not included in the counts because they are not + * reliably counted. + */ + +#define HSW_DEMAND_DATA_RD BIT_ULL(0) +#define HSW_DEMAND_RFO BIT_ULL(1) +#define HSW_ANY_RESPONSE BIT_ULL(16) +#define HSW_SUPPLIER_NONE BIT_ULL(17) +#define HSW_L3_MISS_LOCAL_DRAM BIT_ULL(22) +#define HSW_L3_MISS_REMOTE_HOP0 BIT_ULL(27) +#define HSW_L3_MISS_REMOTE_HOP1 BIT_ULL(28) +#define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29) +#define HSW_L3_MISS (HSW_L3_MISS_LOCAL_DRAM| \ + HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \ + HSW_L3_MISS_REMOTE_HOP2P) +#define HSW_SNOOP_NONE BIT_ULL(31) +#define HSW_SNOOP_NOT_NEEDED BIT_ULL(32) +#define HSW_SNOOP_MISS BIT_ULL(33) +#define HSW_SNOOP_HIT_NO_FWD BIT_ULL(34) +#define HSW_SNOOP_HIT_WITH_FWD BIT_ULL(35) +#define HSW_SNOOP_HITM BIT_ULL(36) +#define HSW_SNOOP_NON_DRAM BIT_ULL(37) +#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \ + HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \ + HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \ + HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM) +#define HSW_SNOOP_DRAM (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM) +#define HSW_DEMAND_READ HSW_DEMAND_DATA_RD +#define HSW_DEMAND_WRITE HSW_DEMAND_RFO +#define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\ + HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P) +#define HSW_LLC_ACCESS HSW_ANY_RESPONSE + +#define BDW_L3_MISS_LOCAL BIT(26) +#define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \ + HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \ + HSW_L3_MISS_REMOTE_HOP2P) + + +static __initconst const u64 hsw_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */ + }, + [ 
C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */ + [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + +static __initconst const u64 hsw_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ| + HSW_LLC_ACCESS, + [ C(RESULT_MISS) ] = HSW_DEMAND_READ| + HSW_L3_MISS|HSW_ANY_SNOOP, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE| + HSW_LLC_ACCESS, + [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE| + HSW_L3_MISS|HSW_ANY_SNOOP, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ| + HSW_L3_MISS_LOCAL_DRAM| + HSW_SNOOP_DRAM, + [ C(RESULT_MISS) ] = HSW_DEMAND_READ| + HSW_L3_MISS_REMOTE| + HSW_SNOOP_DRAM, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE| + HSW_L3_MISS_LOCAL_DRAM| + HSW_SNOOP_DRAM, + [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE| + HSW_L3_MISS_REMOTE| + HSW_SNOOP_DRAM, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, +}; + +static __initconst const u64 westmere_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + /* + * Use RFO, not 
WRITEBACK, because a write miss would typically occur + * on RFO. + */ + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, +}; + +/* + * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits; + * See IA32 SDM Vol 3B 30.6.1.3 + */ + +#define NHM_DMND_DATA_RD (1 << 0) +#define NHM_DMND_RFO (1 << 1) +#define NHM_DMND_IFETCH (1 << 2) +#define NHM_DMND_WB (1 << 3) +#define NHM_PF_DATA_RD (1 << 4) +#define NHM_PF_DATA_RFO (1 << 5) +#define NHM_PF_IFETCH (1 << 6) +#define NHM_OFFCORE_OTHER (1 << 7) +#define NHM_UNCORE_HIT (1 << 8) +#define NHM_OTHER_CORE_HIT_SNP (1 << 9) +#define NHM_OTHER_CORE_HITM (1 << 10) + /* reserved */ +#define NHM_REMOTE_CACHE_FWD (1 << 12) +#define NHM_REMOTE_DRAM (1 << 13) +#define NHM_LOCAL_DRAM (1 << 14) +#define NHM_NON_DRAM (1 << 15) + +#define NHM_LOCAL (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD) +#define NHM_REMOTE (NHM_REMOTE_DRAM) + +#define NHM_DMND_READ (NHM_DMND_DATA_RD) +#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB) +#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO) + +#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM) +#define NHM_L3_MISS (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD) +#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS) + +static __initconst const u64 nehalem_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ 
C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE, + [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE, + [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE, + [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE, + }, + }, +}; + +static __initconst const u64 nehalem_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + /* + * Use RFO, not WRITEBACK, because a write miss would typically occur + * on RFO. + */ + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, +}; + +static __initconst const u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ 
C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst const u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ + [ 
C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static struct extra_reg intel_slm_extra_regs[] __read_mostly = +{ + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1), + EVENT_EXTRA_END +}; + +#define SLM_DMND_READ SNB_DMND_DATA_RD +#define SLM_DMND_WRITE SNB_DMND_RFO +#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) + +#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM) +#define SLM_LLC_ACCESS SNB_RESP_ANY +#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM) + +static __initconst const u64 slm_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS, + }, + }, +}; + +static __initconst const u64 slm_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */ + [ C(RESULT_MISS) ] = 0x0280, /* ICACGE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, 
+ }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x40205, /* PAGE_WALKS.I_SIDE_WALKS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ +#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ +#define KNL_MCDRAM_LOCAL BIT_ULL(21) +#define KNL_MCDRAM_FAR BIT_ULL(22) +#define KNL_DDR_LOCAL BIT_ULL(23) +#define KNL_DDR_FAR BIT_ULL(24) +#define KNL_DRAM_ANY (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \ + KNL_DDR_LOCAL | KNL_DDR_FAR) +#define KNL_L2_READ SLM_DMND_READ +#define KNL_L2_WRITE SLM_DMND_WRITE +#define KNL_L2_PREFETCH SLM_DMND_PREFETCH +#define KNL_L2_ACCESS SLM_LLC_ACCESS +#define KNL_L2_MISS (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \ + KNL_DRAM_ANY | SNB_SNP_ANY | \ + SNB_NON_DRAM) + +static __initconst const u64 knl_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS, + [C(RESULT_MISS)] = 0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS, + [C(RESULT_MISS)] = KNL_L2_WRITE | KNL_L2_MISS, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS, + [C(RESULT_MISS)] = KNL_L2_PREFETCH | KNL_L2_MISS, + }, + }, +}; + +/* + * Use from PMIs where the LBRs are already disabled. + */ +static void __intel_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + intel_pmu_disable_bts(); + else + intel_bts_disable_local(); + + intel_pmu_pebs_disable_all(); +} + +static void intel_pmu_disable_all(void) +{ + __intel_pmu_disable_all(); + intel_pmu_lbr_disable_all(); +} + +static void __intel_pmu_enable_all(int added, bool pmi) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + intel_pmu_pebs_enable_all(); + intel_pmu_lbr_enable_all(pmi); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, + x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); + + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + struct perf_event *event = + cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; + + if (WARN_ON_ONCE(!event)) + return; + + intel_pmu_enable_bts(event->hw.config); + } else + intel_bts_enable_local(); +} + +static void intel_pmu_enable_all(int added) +{ + __intel_pmu_enable_all(added, false); +} + +/* + * Workaround for: + * Intel Errata AAK100 (model 26) + * Intel Errata AAP53 (model 30) + * Intel Errata BD53 (model 44) + * + * The official story: + * These chips need to be 'reset' when adding counters by programming the + * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either + * in sequence on the same PMC or on different PMCs. + * + * In practise it appears some of these events do in fact count, and + * we need to programm all 4 events. 
+ */ +static void intel_pmu_nhm_workaround(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + static const unsigned long nhm_magic[4] = { + 0x4300B5, + 0x4300D2, + 0x4300B1, + 0x4300B1 + }; + struct perf_event *event; + int i; + + /* + * The Errata requires below steps: + * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL; + * 2) Configure 4 PERFEVTSELx with the magic events and clear + * the corresponding PMCx; + * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL; + * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL; + * 5) Clear 4 pairs of ERFEVTSELx and PMCx; + */ + + /* + * The real steps we choose are a little different from above. + * A) To reduce MSR operations, we don't run step 1) as they + * are already cleared before this function is called; + * B) Call x86_perf_event_update to save PMCx before configuring + * PERFEVTSELx with magic number; + * C) With step 5), we do clear only when the PERFEVTSELx is + * not used currently. + * D) Call x86_perf_event_set_period to restore PMCx; + */ + + /* We always operate 4 pairs of PERF Counters */ + for (i = 0; i < 4; i++) { + event = cpuc->events[i]; + if (event) + x86_perf_event_update(event); + } + + for (i = 0; i < 4; i++) { + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]); + wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0); + } + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); + + for (i = 0; i < 4; i++) { + event = cpuc->events[i]; + + if (event) { + x86_perf_event_set_period(event); + __x86_pmu_enable_event(&event->hw, + ARCH_PERFMON_EVENTSEL_ENABLE); + } else + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0); + } +} + +static void intel_pmu_nhm_enable_all(int added) +{ + if (added) + intel_pmu_nhm_workaround(); + intel_pmu_enable_all(added); +} + +static inline u64 intel_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void intel_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; + u64 ctrl_val, mask; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + wrmsrl(hwc->config_base, ctrl_val); +} + +static inline bool event_is_checkpointed(struct perf_event *event) +{ + return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0; +} + +static void intel_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { + intel_pmu_disable_bts(); + intel_pmu_drain_bts_buffer(); + return; + } + + cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); + cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); + cpuc->intel_cp_status &= ~(1ull << hwc->idx); + + /* + * must disable before any actual event + * because any event may be combined with LBR + */ + if (needs_branch_stack(event)) + intel_pmu_lbr_disable(event); + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_disable_fixed(hwc); + return; + } + + x86_pmu_disable_event(event); + + if (unlikely(event->attr.precise_ip)) + intel_pmu_pebs_disable(event); +} + +static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + + /* + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: + */ + bits = 
0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + + /* + * ANY bit is supported in v3 and up + */ + if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) + bits |= 0x4; + + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + wrmsrl(hwc->config_base, ctrl_val); +} + +static void intel_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { + if (!__this_cpu_read(cpu_hw_events.enabled)) + return; + + intel_pmu_enable_bts(hwc->config); + return; + } + /* + * must enabled before any actual event + * because any event may be combined with LBR + */ + if (needs_branch_stack(event)) + intel_pmu_lbr_enable(event); + + if (event->attr.exclude_host) + cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); + if (event->attr.exclude_guest) + cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx); + + if (unlikely(event_is_checkpointed(event))) + cpuc->intel_cp_status |= (1ull << hwc->idx); + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_enable_fixed(hwc); + return; + } + + if (unlikely(event->attr.precise_ip)) + intel_pmu_pebs_enable(event); + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); +} + +/* + * Save and restart an expired event. Called by NMI contexts, + * so it has to be careful about preempting normal event ops: + */ +int intel_pmu_save_and_restart(struct perf_event *event) +{ + x86_perf_event_update(event); + /* + * For a checkpointed counter always reset back to 0. This + * avoids a situation where the counter overflows, aborts the + * transaction and is then set back to shortly before the + * overflow, and overflows and aborts again. + */ + if (unlikely(event_is_checkpointed(event))) { + /* No race with NMIs because the counter should not be armed */ + wrmsrl(event->hw.event_base, 0); + local64_set(&event->hw.prev_count, 0); + } + return x86_perf_event_set_period(event); +} + +static void intel_pmu_reset(void) +{ + struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); + unsigned long flags; + int idx; + + if (!x86_pmu.num_counters) + return; + + local_irq_save(flags); + + pr_info("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); + wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); + } + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) + wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + + if (ds) + ds->bts_index = ds->bts_buffer_base; + + /* Ack all overflows and disable fixed counters */ + if (x86_pmu.version >= 2) { + intel_pmu_ack_status(intel_pmu_get_status()); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + } + + /* Reset LBRs and LBR freezing */ + if (x86_pmu.lbr_nr) { + update_debugctlmsr(get_debugctlmsr() & + ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR)); + } + + local_irq_restore(flags); +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int intel_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int bit, loops; + u64 status; + int handled; + + cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * No known reason to not always do late ACK, + * but just in case do it opt-in. 
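For illustration only, a minimal userspace C sketch of the MSR_ARCH_PERFMON_FIXED_CTR_CTRL arithmetic used by intel_pmu_enable_fixed()/intel_pmu_disable_fixed() above: each fixed counter owns a 4-bit field at idx*4, with 0x1 = count ring 0, 0x2 = count ring 3, 0x4 = AnyThread (v3+), 0x8 = enable PMI, exactly as the comment above spells out. The chosen counter index and flags here are arbitrary example values, not read from hardware.

	/* Sketch: compose the 4-bit control field for fixed counter 'idx'. */
	#include <stdio.h>
	#include <stdint.h>
	#include <stdbool.h>

	static uint64_t fixed_ctrl_bits(int idx, bool usr, bool os, bool any)
	{
		uint64_t bits = 0x8;		/* enable PMI on overflow */

		if (usr)
			bits |= 0x2;		/* count in ring 3 */
		if (os)
			bits |= 0x1;		/* count in ring 0 */
		if (any)
			bits |= 0x4;		/* AnyThread, PMU v3 and up only */

		return bits << (idx * 4);	/* each fixed counter owns 4 bits */
	}

	int main(void)
	{
		uint64_t ctrl = 0;

		/* example: enable fixed counter 1 for user plus kernel counting */
		ctrl &= ~(0xfULL << (1 * 4));	/* clear the old field first */
		ctrl |= fixed_ctrl_bits(1, true, true, false);

		printf("FIXED_CTR_CTRL = %#llx\n", (unsigned long long)ctrl);
		return 0;
	}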
+ */ + if (!x86_pmu.late_ack) + apic_write(APIC_LVTPC, APIC_DM_NMI); + __intel_pmu_disable_all(); + handled = intel_pmu_drain_bts_buffer(); + handled += intel_bts_interrupt(); + status = intel_pmu_get_status(); + if (!status) + goto done; + + loops = 0; +again: + intel_pmu_lbr_read(); + intel_pmu_ack_status(status); + if (++loops > 100) { + static bool warned = false; + if (!warned) { + WARN(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); + warned = true; + } + intel_pmu_reset(); + goto done; + } + + inc_irq_stat(apic_perf_irqs); + + + /* + * Ignore a range of extra bits in status that do not indicate + * overflow by themselves. + */ + status &= ~(GLOBAL_STATUS_COND_CHG | + GLOBAL_STATUS_ASIF | + GLOBAL_STATUS_LBRS_FROZEN); + if (!status) + goto done; + + /* + * PEBS overflow sets bit 62 in the global status register + */ + if (__test_and_clear_bit(62, (unsigned long *)&status)) { + handled++; + x86_pmu.drain_pebs(regs); + } + + /* + * Intel PT + */ + if (__test_and_clear_bit(55, (unsigned long *)&status)) { + handled++; + intel_pt_interrupt(); + } + + /* + * Checkpointed counters can lead to 'spurious' PMIs because the + * rollback caused by the PMI will have cleared the overflow status + * bit. Therefore always force probe these counters. + */ + status |= cpuc->intel_cp_status; + + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + handled++; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + /* + * Repeat if there is more work to be done: + */ + status = intel_pmu_get_status(); + if (status) + goto again; + +done: + __intel_pmu_enable_all(0, true); + /* + * Only unmask the NMI after the overflow counters + * have been reset. This avoids spurious NMIs on + * Haswell CPUs. 
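To make the status-bit bookkeeping in the PMI loop above easier to follow, here is a stand-alone sketch that walks a GLOBAL_STATUS-style mask the same way: handle bit 62 (PEBS) and bit 55 (Intel PT) specially, then treat every remaining set bit as a counter overflow. The sample status value is made up.

	/* Sketch: classify overflow bits the way the PMI loop above does. */
	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* made-up status: counters 0 and 2 overflowed, plus a PEBS overflow */
		uint64_t status = (1ULL << 0) | (1ULL << 2) | (1ULL << 62);

		if (status & (1ULL << 62)) {		/* PEBS buffer overflow */
			status &= ~(1ULL << 62);
			printf("drain PEBS buffer\n");
		}
		if (status & (1ULL << 55)) {		/* Intel PT interrupt */
			status &= ~(1ULL << 55);
			printf("handle PT interrupt\n");
		}

		while (status) {			/* remaining bits: counters */
			int bit = __builtin_ctzll(status);

			printf("counter %d overflowed\n", bit);
			status &= status - 1;		/* clear lowest set bit */
		}
		return 0;
	}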
+ */ + if (x86_pmu.late_ack) + apic_write(APIC_LVTPC, APIC_DM_NMI); + return handled; +} + +static struct event_constraint * +intel_bts_constraints(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned int hw_event, bts_event; + + if (event->attr.freq) + return NULL; + + hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; + bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); + + if (unlikely(hw_event == bts_event && hwc->sample_period == 1)) + return &bts_constraint; + + return NULL; +} + +static int intel_alt_er(int idx, u64 config) +{ + int alt_idx = idx; + + if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) + return idx; + + if (idx == EXTRA_REG_RSP_0) + alt_idx = EXTRA_REG_RSP_1; + + if (idx == EXTRA_REG_RSP_1) + alt_idx = EXTRA_REG_RSP_0; + + if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask) + return idx; + + return alt_idx; +} + +static void intel_fixup_er(struct perf_event *event, int idx) +{ + event->hw.extra_reg.idx = idx; + + if (idx == EXTRA_REG_RSP_0) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; + } else if (idx == EXTRA_REG_RSP_1) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; + } +} + +/* + * manage allocation of shared extra msr for certain events + * + * sharing can be: + * per-cpu: to be shared between the various events on a single PMU + * per-core: per-cpu + shared by HT threads + */ +static struct event_constraint * +__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event, + struct hw_perf_event_extra *reg) +{ + struct event_constraint *c = &emptyconstraint; + struct er_account *era; + unsigned long flags; + int idx = reg->idx; + + /* + * reg->alloc can be set due to existing state, so for fake cpuc we + * need to ignore this, otherwise we might fail to allocate proper fake + * state for this extra reg constraint. Also see the comment below. + */ + if (reg->alloc && !cpuc->is_fake) + return NULL; /* call x86_get_event_constraint() */ + +again: + era = &cpuc->shared_regs->regs[idx]; + /* + * we use spin_lock_irqsave() to avoid lockdep issues when + * passing a fake cpuc + */ + raw_spin_lock_irqsave(&era->lock, flags); + + if (!atomic_read(&era->ref) || era->config == reg->config) { + + /* + * If its a fake cpuc -- as per validate_{group,event}() we + * shouldn't touch event state and we can avoid doing so + * since both will only call get_event_constraints() once + * on each event, this avoids the need for reg->alloc. + * + * Not doing the ER fixup will only result in era->reg being + * wrong, but since we won't actually try and program hardware + * this isn't a problem either. + */ + if (!cpuc->is_fake) { + if (idx != reg->idx) + intel_fixup_er(event, idx); + + /* + * x86_schedule_events() can call get_event_constraints() + * multiple times on events in the case of incremental + * scheduling(). reg->alloc ensures we only do the ER + * allocation once. 
+ */ + reg->alloc = 1; + } + + /* lock in msr value */ + era->config = reg->config; + era->reg = reg->reg; + + /* one more user */ + atomic_inc(&era->ref); + + /* + * need to call x86_get_event_constraint() + * to check if associated event has constraints + */ + c = NULL; + } else { + idx = intel_alt_er(idx, reg->config); + if (idx != reg->idx) { + raw_spin_unlock_irqrestore(&era->lock, flags); + goto again; + } + } + raw_spin_unlock_irqrestore(&era->lock, flags); + + return c; +} + +static void +__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc, + struct hw_perf_event_extra *reg) +{ + struct er_account *era; + + /* + * Only put constraint if extra reg was actually allocated. Also takes + * care of event which do not use an extra shared reg. + * + * Also, if this is a fake cpuc we shouldn't touch any event state + * (reg->alloc) and we don't care about leaving inconsistent cpuc state + * either since it'll be thrown out. + */ + if (!reg->alloc || cpuc->is_fake) + return; + + era = &cpuc->shared_regs->regs[reg->idx]; + + /* one fewer user */ + atomic_dec(&era->ref); + + /* allocate again next time */ + reg->alloc = 0; +} + +static struct event_constraint * +intel_shared_regs_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct event_constraint *c = NULL, *d; + struct hw_perf_event_extra *xreg, *breg; + + xreg = &event->hw.extra_reg; + if (xreg->idx != EXTRA_REG_NONE) { + c = __intel_shared_reg_get_constraints(cpuc, event, xreg); + if (c == &emptyconstraint) + return c; + } + breg = &event->hw.branch_reg; + if (breg->idx != EXTRA_REG_NONE) { + d = __intel_shared_reg_get_constraints(cpuc, event, breg); + if (d == &emptyconstraint) { + __intel_shared_reg_put_constraints(cpuc, xreg); + c = d; + } + } + return c; +} + +struct event_constraint * +x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if ((event->hw.config & c->cmask) == c->code) { + event->hw.flags |= c->flags; + return c; + } + } + } + + return &unconstrained; +} + +static struct event_constraint * +__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + c = intel_bts_constraints(event); + if (c) + return c; + + c = intel_shared_regs_constraints(cpuc, event); + if (c) + return c; + + c = intel_pebs_constraints(event); + if (c) + return c; + + return x86_get_event_constraints(cpuc, idx, event); +} + +static void +intel_start_scheduling(struct cpu_hw_events *cpuc) +{ + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct intel_excl_states *xl; + int tid = cpuc->excl_thread_id; + + /* + * nothing needed if in group validation mode + */ + if (cpuc->is_fake || !is_ht_workaround_enabled()) + return; + + /* + * no exclusion needed + */ + if (WARN_ON_ONCE(!excl_cntrs)) + return; + + xl = &excl_cntrs->states[tid]; + + xl->sched_started = true; + /* + * lock shared state until we are done scheduling + * in stop_event_scheduling() + * makes scheduling appear as a transaction + */ + raw_spin_lock(&excl_cntrs->lock); +} + +static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) +{ + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct event_constraint *c = cpuc->event_constraint[idx]; + struct intel_excl_states *xl; + int tid = cpuc->excl_thread_id; + + if (cpuc->is_fake || !is_ht_workaround_enabled()) + return; + 
+ if (WARN_ON_ONCE(!excl_cntrs)) + return; + + if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) + return; + + xl = &excl_cntrs->states[tid]; + + lockdep_assert_held(&excl_cntrs->lock); + + if (c->flags & PERF_X86_EVENT_EXCL) + xl->state[cntr] = INTEL_EXCL_EXCLUSIVE; + else + xl->state[cntr] = INTEL_EXCL_SHARED; +} + +static void +intel_stop_scheduling(struct cpu_hw_events *cpuc) +{ + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct intel_excl_states *xl; + int tid = cpuc->excl_thread_id; + + /* + * nothing needed if in group validation mode + */ + if (cpuc->is_fake || !is_ht_workaround_enabled()) + return; + /* + * no exclusion needed + */ + if (WARN_ON_ONCE(!excl_cntrs)) + return; + + xl = &excl_cntrs->states[tid]; + + xl->sched_started = false; + /* + * release shared state lock (acquired in intel_start_scheduling()) + */ + raw_spin_unlock(&excl_cntrs->lock); +} + +static struct event_constraint * +intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, + int idx, struct event_constraint *c) +{ + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct intel_excl_states *xlo; + int tid = cpuc->excl_thread_id; + int is_excl, i; + + /* + * validating a group does not require + * enforcing cross-thread exclusion + */ + if (cpuc->is_fake || !is_ht_workaround_enabled()) + return c; + + /* + * no exclusion needed + */ + if (WARN_ON_ONCE(!excl_cntrs)) + return c; + + /* + * because we modify the constraint, we need + * to make a copy. Static constraints come + * from static const tables. + * + * only needed when constraint has not yet + * been cloned (marked dynamic) + */ + if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) { + struct event_constraint *cx; + + /* + * grab pre-allocated constraint entry + */ + cx = &cpuc->constraint_list[idx]; + + /* + * initialize dynamic constraint + * with static constraint + */ + *cx = *c; + + /* + * mark constraint as dynamic, so we + * can free it later on + */ + cx->flags |= PERF_X86_EVENT_DYNAMIC; + c = cx; + } + + /* + * From here on, the constraint is dynamic. 
+ * Either it was just allocated above, or it + * was allocated during a earlier invocation + * of this function + */ + + /* + * state of sibling HT + */ + xlo = &excl_cntrs->states[tid ^ 1]; + + /* + * event requires exclusive counter access + * across HT threads + */ + is_excl = c->flags & PERF_X86_EVENT_EXCL; + if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) { + event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT; + if (!cpuc->n_excl++) + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1); + } + + /* + * Modify static constraint with current dynamic + * state of thread + * + * EXCLUSIVE: sibling counter measuring exclusive event + * SHARED : sibling counter measuring non-exclusive event + * UNUSED : sibling counter unused + */ + for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) { + /* + * exclusive event in sibling counter + * our corresponding counter cannot be used + * regardless of our event + */ + if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) + __clear_bit(i, c->idxmsk); + /* + * if measuring an exclusive event, sibling + * measuring non-exclusive, then counter cannot + * be used + */ + if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) + __clear_bit(i, c->idxmsk); + } + + /* + * recompute actual bit weight for scheduling algorithm + */ + c->weight = hweight64(c->idxmsk64); + + /* + * if we return an empty mask, then switch + * back to static empty constraint to avoid + * the cost of freeing later on + */ + if (c->weight == 0) + c = &emptyconstraint; + + return c; +} + +static struct event_constraint * +intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c1 = NULL; + struct event_constraint *c2; + + if (idx >= 0) /* fake does < 0 */ + c1 = cpuc->event_constraint[idx]; + + /* + * first time only + * - static constraint: no change across incremental scheduling calls + * - dynamic constraint: handled by intel_get_excl_constraints() + */ + c2 = __intel_get_event_constraints(cpuc, idx, event); + if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) { + bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX); + c1->weight = c2->weight; + c2 = c1; + } + + if (cpuc->excl_cntrs) + return intel_get_excl_constraints(cpuc, event, idx, c2); + + return c2; +} + +static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + int tid = cpuc->excl_thread_id; + struct intel_excl_states *xl; + + /* + * nothing needed if in group validation mode + */ + if (cpuc->is_fake) + return; + + if (WARN_ON_ONCE(!excl_cntrs)) + return; + + if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) { + hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT; + if (!--cpuc->n_excl) + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0); + } + + /* + * If event was actually assigned, then mark the counter state as + * unused now. + */ + if (hwc->idx >= 0) { + xl = &excl_cntrs->states[tid]; + + /* + * put_constraint may be called from x86_schedule_events() + * which already has the lock held so here make locking + * conditional. 
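The counter-mask filtering in intel_get_excl_constraints() above boils down to a per-bit check against the sibling thread's state; a minimal sketch of only that filtering step, with a made-up sibling state array and counter count, looks like this:

	/* Sketch: drop counters the HT sibling makes unusable, then recount. */
	#include <stdio.h>
	#include <stdint.h>
	#include <stdbool.h>

	enum excl_state { UNUSED, SHARED, EXCLUSIVE };

	int main(void)
	{
		/* made-up example: constraint initially allows counters 0-3 */
		uint64_t idxmsk = 0xf;
		bool is_excl = true;		/* our event needs exclusive access */
		enum excl_state sibling[4] = { UNUSED, EXCLUSIVE, SHARED, UNUSED };

		for (int i = 0; i < 4; i++) {
			if (sibling[i] == EXCLUSIVE)		/* sibling owns it */
				idxmsk &= ~(1ULL << i);
			if (is_excl && sibling[i] == SHARED)	/* we need it alone */
				idxmsk &= ~(1ULL << i);
		}

		/* recompute the scheduling weight, as the patch does with hweight64() */
		printf("usable counters: %#llx, weight %d\n",
		       (unsigned long long)idxmsk, __builtin_popcountll(idxmsk));
		return 0;
	}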
+ */ + if (!xl->sched_started) + raw_spin_lock(&excl_cntrs->lock); + + xl->state[hwc->idx] = INTEL_EXCL_UNUSED; + + if (!xl->sched_started) + raw_spin_unlock(&excl_cntrs->lock); + } +} + +static void +intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event_extra *reg; + + reg = &event->hw.extra_reg; + if (reg->idx != EXTRA_REG_NONE) + __intel_shared_reg_put_constraints(cpuc, reg); + + reg = &event->hw.branch_reg; + if (reg->idx != EXTRA_REG_NONE) + __intel_shared_reg_put_constraints(cpuc, reg); +} + +static void intel_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + intel_put_shared_regs_event_constraints(cpuc, event); + + /* + * is PMU has exclusive counter restrictions, then + * all events are subject to and must call the + * put_excl_constraints() routine + */ + if (cpuc->excl_cntrs) + intel_put_excl_constraints(cpuc, event); +} + +static void intel_pebs_aliases_core2(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use INST_RETIRED.ANY_P + * (0x00c0), which is a PEBS capable event, to get the same + * count. + * + * INST_RETIRED.ANY_P counts the number of cycles that retires + * CNTMASK instructions. By setting CNTMASK to a value (16) + * larger than the maximum number of instructions that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less instructions, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. + */ + u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); + + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static void intel_pebs_aliases_snb(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use UOPS_RETIRED.ALL + * (0x01c2), which is a PEBS capable event, to get the same + * count. + * + * UOPS_RETIRED.ALL counts the number of cycles that retires + * CNTMASK micro-ops. By setting CNTMASK to a value (16) + * larger than the maximum number of micro-ops that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less micro-ops, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. + */ + u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16); + + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static void intel_pebs_aliases_precdist(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use INST_RETIRED.PREC_DIST + * (0x01c0), which is a PEBS capable event, to get the same + * count. + * + * The PREC_DIST event has special support to minimize sample + * shadowing effects. 
One drawback is that it can be + * only programmed on counter 1, but that seems like an + * acceptable trade off. + */ + u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16); + + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static void intel_pebs_aliases_ivb(struct perf_event *event) +{ + if (event->attr.precise_ip < 3) + return intel_pebs_aliases_snb(event); + return intel_pebs_aliases_precdist(event); +} + +static void intel_pebs_aliases_skl(struct perf_event *event) +{ + if (event->attr.precise_ip < 3) + return intel_pebs_aliases_core2(event); + return intel_pebs_aliases_precdist(event); +} + +static unsigned long intel_pmu_free_running_flags(struct perf_event *event) +{ + unsigned long flags = x86_pmu.free_running_flags; + + if (event->attr.use_clockid) + flags &= ~PERF_SAMPLE_TIME; + return flags; +} + +static int intel_pmu_hw_config(struct perf_event *event) +{ + int ret = x86_pmu_hw_config(event); + + if (ret) + return ret; + + if (event->attr.precise_ip) { + if (!event->attr.freq) { + event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; + if (!(event->attr.sample_type & + ~intel_pmu_free_running_flags(event))) + event->hw.flags |= PERF_X86_EVENT_FREERUNNING; + } + if (x86_pmu.pebs_aliases) + x86_pmu.pebs_aliases(event); + } + + if (needs_branch_stack(event)) { + ret = intel_pmu_setup_lbr_filter(event); + if (ret) + return ret; + + /* + * BTS is set up earlier in this path, so don't account twice + */ + if (!intel_pmu_has_bts(event)) { + /* disallow lbr if conflicting events are present */ + if (x86_add_exclusive(x86_lbr_exclusive_lbr)) + return -EBUSY; + + event->destroy = hw_perf_lbr_event_destroy; + } + } + + if (event->attr.type != PERF_TYPE_RAW) + return 0; + + if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY)) + return 0; + + if (x86_pmu.version < 3) + return -EINVAL; + + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + + event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY; + + return 0; +} + +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +{ + if (x86_pmu.guest_get_msrs) + return x86_pmu.guest_get_msrs(nr); + *nr = 0; + return NULL; +} +EXPORT_SYMBOL_GPL(perf_guest_get_msrs); + +static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + + arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; + arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask; + arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask; + /* + * If PMU counter has PEBS enabled it is not enough to disable counter + * on a guest entry since PEBS memory write can overshoot guest entry + * and corrupt guest memory. Disabling PEBS solves the problem. 
+ */ + arr[1].msr = MSR_IA32_PEBS_ENABLE; + arr[1].host = cpuc->pebs_enabled; + arr[1].guest = 0; + + *nr = 2; + return arr; +} + +static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_event *event = cpuc->events[idx]; + + arr[idx].msr = x86_pmu_config_addr(idx); + arr[idx].host = arr[idx].guest = 0; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + + arr[idx].host = arr[idx].guest = + event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE; + + if (event->attr.exclude_host) + arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + else if (event->attr.exclude_guest) + arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + } + + *nr = x86_pmu.num_counters; + return arr; +} + +static void core_pmu_enable_event(struct perf_event *event) +{ + if (!event->attr.exclude_host) + x86_pmu_enable_event(event); +} + +static void core_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct hw_perf_event *hwc = &cpuc->events[idx]->hw; + + if (!test_bit(idx, cpuc->active_mask) || + cpuc->events[idx]->attr.exclude_host) + continue; + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); + } +} + +static int hsw_hw_config(struct perf_event *event) +{ + int ret = intel_pmu_hw_config(event); + + if (ret) + return ret; + if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE)) + return 0; + event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); + + /* + * IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with + * PEBS or in ANY thread mode. Since the results are non-sensical forbid + * this combination. + */ + if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) && + ((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) || + event->attr.precise_ip > 0)) + return -EOPNOTSUPP; + + if (event_is_checkpointed(event)) { + /* + * Sampling of checkpointed events can cause situations where + * the CPU constantly aborts because of a overflow, which is + * then checkpointed back and ignored. Forbid checkpointing + * for sampling. + * + * But still allow a long sampling period, so that perf stat + * from KVM works. + */ + if (event->attr.sample_period > 0 && + event->attr.sample_period < 0x7fffffff) + return -EOPNOTSUPP; + } + return 0; +} + +static struct event_constraint counter2_constraint = + EVENT_CONSTRAINT(0, 0x4, 0); + +static struct event_constraint * +hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + c = intel_get_event_constraints(cpuc, idx, event); + + /* Handle special quirk on in_tx_checkpointed only in counter 2 */ + if (event->hw.config & HSW_IN_TX_CHECKPOINTED) { + if (c->idxmsk64 & (1U << 2)) + return &counter2_constraint; + return &emptyconstraint; + } + + return c; +} + +/* + * Broadwell: + * + * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared + * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine + * the two to enforce a minimum period of 128 (the smallest value that has bits + * 0-5 cleared and >= 100). 
+ * + * Because of how the code in x86_perf_event_set_period() works, the truncation + * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period + * to make up for the 'lost' events due to carrying the 'error' in period_left. + * + * Therefore the effective (average) period matches the requested period, + * despite coarser hardware granularity. + */ +static unsigned bdw_limit_period(struct perf_event *event, unsigned left) +{ + if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == + X86_CONFIG(.event=0xc0, .umask=0x01)) { + if (left < 128) + left = 128; + left &= ~0x3fu; + } + return left; +} + +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(pc, "config:19" ); +PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */ +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); +PMU_FORMAT_ATTR(in_tx, "config:32"); +PMU_FORMAT_ATTR(in_tx_cp, "config:33"); + +static struct attribute *intel_arch_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +ssize_t intel_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); + + return x86_event_sysfs_show(page, config, event); +} + +struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + struct intel_shared_regs *regs; + int i; + + regs = kzalloc_node(sizeof(struct intel_shared_regs), + GFP_KERNEL, cpu_to_node(cpu)); + if (regs) { + /* + * initialize the locks to keep lockdep happy + */ + for (i = 0; i < EXTRA_REG_MAX; i++) + raw_spin_lock_init(®s->regs[i].lock); + + regs->core_id = -1; + } + return regs; +} + +static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu) +{ + struct intel_excl_cntrs *c; + + c = kzalloc_node(sizeof(struct intel_excl_cntrs), + GFP_KERNEL, cpu_to_node(cpu)); + if (c) { + raw_spin_lock_init(&c->lock); + c->core_id = -1; + } + return c; +} + +static int intel_pmu_cpu_prepare(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { + cpuc->shared_regs = allocate_shared_regs(cpu); + if (!cpuc->shared_regs) + goto err; + } + + if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { + size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint); + + cpuc->constraint_list = kzalloc(sz, GFP_KERNEL); + if (!cpuc->constraint_list) + goto err_shared_regs; + + cpuc->excl_cntrs = allocate_excl_cntrs(cpu); + if (!cpuc->excl_cntrs) + goto err_constraint_list; + + cpuc->excl_thread_id = 0; + } + + return NOTIFY_OK; + +err_constraint_list: + kfree(cpuc->constraint_list); + cpuc->constraint_list = NULL; + +err_shared_regs: + kfree(cpuc->shared_regs); + cpuc->shared_regs = NULL; + +err: + return NOTIFY_BAD; +} + +static void intel_pmu_cpu_starting(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + int core_id = topology_core_id(cpu); + int i; + + init_debug_store_on_cpu(cpu); + /* + * Deal with CPUs that don't clear their LBRs on power-up. 
+ */ + intel_pmu_lbr_reset(); + + cpuc->lbr_sel = NULL; + + if (!cpuc->shared_regs) + return; + + if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) { + for_each_cpu(i, topology_sibling_cpumask(cpu)) { + struct intel_shared_regs *pc; + + pc = per_cpu(cpu_hw_events, i).shared_regs; + if (pc && pc->core_id == core_id) { + cpuc->kfree_on_online[0] = cpuc->shared_regs; + cpuc->shared_regs = pc; + break; + } + } + cpuc->shared_regs->core_id = core_id; + cpuc->shared_regs->refcnt++; + } + + if (x86_pmu.lbr_sel_map) + cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; + + if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { + for_each_cpu(i, topology_sibling_cpumask(cpu)) { + struct intel_excl_cntrs *c; + + c = per_cpu(cpu_hw_events, i).excl_cntrs; + if (c && c->core_id == core_id) { + cpuc->kfree_on_online[1] = cpuc->excl_cntrs; + cpuc->excl_cntrs = c; + cpuc->excl_thread_id = 1; + break; + } + } + cpuc->excl_cntrs->core_id = core_id; + cpuc->excl_cntrs->refcnt++; + } +} + +static void free_excl_cntrs(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct intel_excl_cntrs *c; + + c = cpuc->excl_cntrs; + if (c) { + if (c->core_id == -1 || --c->refcnt == 0) + kfree(c); + cpuc->excl_cntrs = NULL; + kfree(cpuc->constraint_list); + cpuc->constraint_list = NULL; + } +} + +static void intel_pmu_cpu_dying(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct intel_shared_regs *pc; + + pc = cpuc->shared_regs; + if (pc) { + if (pc->core_id == -1 || --pc->refcnt == 0) + kfree(pc); + cpuc->shared_regs = NULL; + } + + free_excl_cntrs(cpu); + + fini_debug_store_on_cpu(cpu); +} + +static void intel_pmu_sched_task(struct perf_event_context *ctx, + bool sched_in) +{ + if (x86_pmu.pebs_active) + intel_pmu_pebs_sched_task(ctx, sched_in); + if (x86_pmu.lbr_nr) + intel_pmu_lbr_sched_task(ctx, sched_in); +} + +PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); + +PMU_FORMAT_ATTR(ldlat, "config1:0-15"); + +PMU_FORMAT_ATTR(frontend, "config1:0-23"); + +static struct attribute *intel_arch3_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_any.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + &format_attr_in_tx.attr, + &format_attr_in_tx_cp.attr, + + &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */ + &format_attr_ldlat.attr, /* PEBS load latency */ + NULL, +}; + +static struct attribute *skl_format_attr[] = { + &format_attr_frontend.attr, + NULL, +}; + +static __initconst const struct x86_pmu core_pmu = { + .name = "core", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = core_pmu_enable_all, + .enable = core_pmu_enable_event, + .disable = x86_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + .free_running_flags = PEBS_FREERUNNING_FLAGS, + + /* + * Intel PMCs cannot be accessed sanely above 32-bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL<<31) - 1, + .get_event_constraints = intel_get_event_constraints, + .put_event_constraints = intel_put_event_constraints, + .event_constraints = intel_core_event_constraints, + .guest_get_msrs = core_guest_get_msrs, + .format_attrs = intel_arch_formats_attr, + .events_sysfs_show = 
intel_event_sysfs_show, + + /* + * Virtual (or funny metal) CPU can define x86_pmu.extra_regs + * together with PMU version 1 and thus be using core_pmu with + * shared_regs. We need following callbacks here to allocate + * it properly. + */ + .cpu_prepare = intel_pmu_cpu_prepare, + .cpu_starting = intel_pmu_cpu_starting, + .cpu_dying = intel_pmu_cpu_dying, +}; + +static __initconst const struct x86_pmu intel_pmu = { + .name = "Intel", + .handle_irq = intel_pmu_handle_irq, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, + .enable = intel_pmu_enable_event, + .disable = intel_pmu_disable_event, + .hw_config = intel_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + .free_running_flags = PEBS_FREERUNNING_FLAGS, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .get_event_constraints = intel_get_event_constraints, + .put_event_constraints = intel_put_event_constraints, + .pebs_aliases = intel_pebs_aliases_core2, + + .format_attrs = intel_arch3_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, + + .cpu_prepare = intel_pmu_cpu_prepare, + .cpu_starting = intel_pmu_cpu_starting, + .cpu_dying = intel_pmu_cpu_dying, + .guest_get_msrs = intel_guest_get_msrs, + .sched_task = intel_pmu_sched_task, +}; + +static __init void intel_clovertown_quirk(void) +{ + /* + * PEBS is unreliable due to: + * + * AJ67 - PEBS may experience CPL leaks + * AJ68 - PEBS PMI may be delayed by one event + * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12] + * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS + * + * AJ67 could be worked around by restricting the OS/USR flags. + * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI. + * + * AJ106 could possibly be worked around by not allowing LBR + * usage from PEBS, including the fixup. + * AJ68 could possibly be worked around by always programming + * a pebs_event_reset[0] value and coping with the lost events. + * + * But taken together it might just make sense to not enable PEBS on + * these chips. + */ + pr_warn("PEBS disabled due to CPU errata\n"); + x86_pmu.pebs = 0; + x86_pmu.pebs_constraints = NULL; +} + +static int intel_snb_pebs_broken(int cpu) +{ + u32 rev = UINT_MAX; /* default to broken for unknown models */ + + switch (cpu_data(cpu).x86_model) { + case 42: /* SNB */ + rev = 0x28; + break; + + case 45: /* SNB-EP */ + switch (cpu_data(cpu).x86_mask) { + case 6: rev = 0x618; break; + case 7: rev = 0x70c; break; + } + } + + return (cpu_data(cpu).microcode < rev); +} + +static void intel_snb_check_microcode(void) +{ + int pebs_broken = 0; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + if ((pebs_broken = intel_snb_pebs_broken(cpu))) + break; + } + put_online_cpus(); + + if (pebs_broken == x86_pmu.pebs_broken) + return; + + /* + * Serialized by the microcode lock.. + */ + if (x86_pmu.pebs_broken) { + pr_info("PEBS enabled due to microcode update\n"); + x86_pmu.pebs_broken = 0; + } else { + pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n"); + x86_pmu.pebs_broken = 1; + } +} + +/* + * Under certain circumstances, access certain MSR may cause #GP. + * The function tests if the input MSR can be safely accessed. 
+ */ +static bool check_msr(unsigned long msr, u64 mask) +{ + u64 val_old, val_new, val_tmp; + + /* + * Read the current value, change it and read it back to see if it + * matches, this is needed to detect certain hardware emulators + * (qemu/kvm) that don't trap on the MSR access and always return 0s. + */ + if (rdmsrl_safe(msr, &val_old)) + return false; + + /* + * Only change the bits which can be updated by wrmsrl. + */ + val_tmp = val_old ^ mask; + if (wrmsrl_safe(msr, val_tmp) || + rdmsrl_safe(msr, &val_new)) + return false; + + if (val_new != val_tmp) + return false; + + /* Here it's sure that the MSR can be safely accessed. + * Restore the old value and return. + */ + wrmsrl(msr, val_old); + + return true; +} + +static __init void intel_sandybridge_quirk(void) +{ + x86_pmu.check_microcode = intel_snb_check_microcode; + intel_snb_check_microcode(); +} + +static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { + { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" }, + { PERF_COUNT_HW_INSTRUCTIONS, "instructions" }, + { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" }, + { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" }, + { PERF_COUNT_HW_CACHE_MISSES, "cache misses" }, + { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" }, + { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" }, +}; + +static __init void intel_arch_events_quirk(void) +{ + int bit; + + /* disable event that reported as not presend by cpuid */ + for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { + intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; + pr_warn("CPUID marked event: \'%s\' unavailable\n", + intel_arch_events_map[bit].name); + } +} + +static __init void intel_nehalem_quirk(void) +{ + union cpuid10_ebx ebx; + + ebx.full = x86_pmu.events_maskl; + if (ebx.split.no_branch_misses_retired) { + /* + * Erratum AAJ80 detected, we work it around by using + * the BR_MISP_EXEC.ANY event. This will over-count + * branch-misses, but it's still much better than the + * architectural event which is often completely bogus: + */ + intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; + ebx.split.no_branch_misses_retired = 0; + x86_pmu.events_maskl = ebx.full; + pr_info("CPU erratum AAJ80 worked around\n"); + } +} + +/* + * enable software workaround for errata: + * SNB: BJ122 + * IVB: BV98 + * HSW: HSD29 + * + * Only needed when HT is enabled. However detecting + * if HT is enabled is difficult (model specific). 
So instead, + * we enable the workaround in the early boot, and verify if + * it is needed in a later initcall phase once we have valid + * topology information to check if HT is actually enabled + */ +static __init void intel_ht_bug(void) +{ + x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED; + + x86_pmu.start_scheduling = intel_start_scheduling; + x86_pmu.commit_scheduling = intel_commit_scheduling; + x86_pmu.stop_scheduling = intel_stop_scheduling; +} + +EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3"); +EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82") + +/* Haswell special events */ +EVENT_ATTR_STR(tx-start, tx_start, "event=0xc9,umask=0x1"); +EVENT_ATTR_STR(tx-commit, tx_commit, "event=0xc9,umask=0x2"); +EVENT_ATTR_STR(tx-abort, tx_abort, "event=0xc9,umask=0x4"); +EVENT_ATTR_STR(tx-capacity, tx_capacity, "event=0x54,umask=0x2"); +EVENT_ATTR_STR(tx-conflict, tx_conflict, "event=0x54,umask=0x1"); +EVENT_ATTR_STR(el-start, el_start, "event=0xc8,umask=0x1"); +EVENT_ATTR_STR(el-commit, el_commit, "event=0xc8,umask=0x2"); +EVENT_ATTR_STR(el-abort, el_abort, "event=0xc8,umask=0x4"); +EVENT_ATTR_STR(el-capacity, el_capacity, "event=0x54,umask=0x2"); +EVENT_ATTR_STR(el-conflict, el_conflict, "event=0x54,umask=0x1"); +EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1"); +EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1"); + +static struct attribute *hsw_events_attrs[] = { + EVENT_PTR(tx_start), + EVENT_PTR(tx_commit), + EVENT_PTR(tx_abort), + EVENT_PTR(tx_capacity), + EVENT_PTR(tx_conflict), + EVENT_PTR(el_start), + EVENT_PTR(el_commit), + EVENT_PTR(el_abort), + EVENT_PTR(el_capacity), + EVENT_PTR(el_conflict), + EVENT_PTR(cycles_t), + EVENT_PTR(cycles_ct), + EVENT_PTR(mem_ld_hsw), + EVENT_PTR(mem_st_hsw), + NULL +}; + +__init int intel_pmu_init(void) +{ + union cpuid10_edx edx; + union cpuid10_eax eax; + union cpuid10_ebx ebx; + struct event_constraint *c; + unsigned int unused; + struct extra_reg *er; + int version, i; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + switch (boot_cpu_data.x86) { + case 0x6: + return p6_pmu_init(); + case 0xb: + return knc_pmu_init(); + case 0xf: + return p4_pmu_init(); + } + return -ENODEV; + } + + /* + * Check whether the Architectural PerfMon supports + * Branch Misses Retired hw_event or not. 
+ */ + cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) + return -ENODEV; + + version = eax.split.version_id; + if (version < 2) + x86_pmu = core_pmu; + else + x86_pmu = intel_pmu; + + x86_pmu.version = version; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.cntval_bits = eax.split.bit_width; + x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; + + x86_pmu.events_maskl = ebx.full; + x86_pmu.events_mask_len = eax.split.mask_length; + + x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + + /* + * Quirk: v2 perfmon does not report fixed-purpose events, so + * assume at least 3 events: + */ + if (version > 1) + x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); + + if (boot_cpu_has(X86_FEATURE_PDCM)) { + u64 capabilities; + + rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); + x86_pmu.intel_cap.capabilities = capabilities; + } + + intel_ds_init(); + + x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ + + /* + * Install the hw-cache-events table: + */ + switch (boot_cpu_data.x86_model) { + case 14: /* 65nm Core "Yonah" */ + pr_cont("Core events, "); + break; + + case 15: /* 65nm Core2 "Merom" */ + x86_add_quirk(intel_clovertown_quirk); + case 22: /* 65nm Core2 "Merom-L" */ + case 23: /* 45nm Core2 "Penryn" */ + case 29: /* 45nm Core2 "Dunnington (MP) */ + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + intel_pmu_lbr_init_core(); + + x86_pmu.event_constraints = intel_core2_event_constraints; + x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints; + pr_cont("Core2 events, "); + break; + + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_nhm(); + + x86_pmu.event_constraints = intel_nehalem_event_constraints; + x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; + x86_pmu.enable_all = intel_pmu_nhm_enable_all; + x86_pmu.extra_regs = intel_nehalem_extra_regs; + + x86_pmu.cpu_events = nhm_events_attrs; + + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); + + x86_add_quirk(intel_nehalem_quirk); + + pr_cont("Nehalem events, "); + break; + + case 28: /* 45nm Atom "Pineview" */ + case 38: /* 45nm Atom "Lincroft" */ + case 39: /* 32nm Atom "Penwell" */ + case 53: /* 32nm Atom "Cloverview" */ + case 54: /* 32nm Atom "Cedarview" */ + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + intel_pmu_lbr_init_atom(); + + x86_pmu.event_constraints = intel_gen_event_constraints; + x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_core2; + pr_cont("Atom events, "); + break; + + case 55: /* 22nm Atom "Silvermont" */ + case 76: /* 14nm Atom "Airmont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + 
intel_pmu_lbr_init_atom(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; + x86_pmu.extra_regs = intel_slm_extra_regs; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + pr_cont("Silvermont events, "); + break; + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_nhm(); + + x86_pmu.event_constraints = intel_westmere_event_constraints; + x86_pmu.enable_all = intel_pmu_nhm_enable_all; + x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; + x86_pmu.extra_regs = intel_westmere_extra_regs; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + + x86_pmu.cpu_events = nhm_events_attrs; + + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); + + pr_cont("Westmere events, "); + break; + + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + x86_add_quirk(intel_sandybridge_quirk); + x86_add_quirk(intel_ht_bug); + memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_snb(); + + x86_pmu.event_constraints = intel_snb_event_constraints; + x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + if (boot_cpu_data.x86_model == 45) + x86_pmu.extra_regs = intel_snbep_extra_regs; + else + x86_pmu.extra_regs = intel_snb_extra_regs; + + + /* all extra regs are per-cpu when HT is on */ + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + x86_pmu.cpu_events = snb_events_attrs; + + /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1); + + pr_cont("SandyBridge events, "); + break; + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + x86_add_quirk(intel_ht_bug); + memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + /* dTLB-load-misses on IVB is different than SNB */ + hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */ + + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_snb(); + + x86_pmu.event_constraints = intel_ivb_event_constraints; + x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; + x86_pmu.pebs_prec_dist = true; + if (boot_cpu_data.x86_model == 62) + x86_pmu.extra_regs = intel_snbep_extra_regs; + else + x86_pmu.extra_regs = intel_snb_extra_regs; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + x86_pmu.cpu_events = snb_events_attrs; + + /* UOPS_ISSUED.ANY,c=1,i=1 to 
count stall cycles */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + + pr_cont("IvyBridge events, "); + break; + + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 69: /* 22nm Haswell ULT */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + x86_add_quirk(intel_ht_bug); + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_hsw(); + + x86_pmu.event_constraints = intel_hsw_event_constraints; + x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; + x86_pmu.extra_regs = intel_snbep_extra_regs; + x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; + x86_pmu.pebs_prec_dist = true; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.cpu_events = hsw_events_attrs; + x86_pmu.lbr_double_abort = true; + pr_cont("Haswell events, "); + break; + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server */ + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + + /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */ + hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ | + BDW_L3_MISS|HSW_SNOOP_DRAM; + hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS| + HSW_SNOOP_DRAM; + hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ| + BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM; + hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE| + BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM; + + intel_pmu_lbr_init_hsw(); + + x86_pmu.event_constraints = intel_bdw_event_constraints; + x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; + x86_pmu.extra_regs = intel_snbep_extra_regs; + x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; + x86_pmu.pebs_prec_dist = true; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.cpu_events = hsw_events_attrs; + x86_pmu.limit_period = bdw_limit_period; + pr_cont("Broadwell events, "); + break; + + case 87: /* Knights Landing Xeon Phi */ + memcpy(hw_cache_event_ids, + slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, + knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + intel_pmu_lbr_init_knl(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; + x86_pmu.extra_regs = intel_knl_extra_regs; + + /* all extra regs are per-cpu when HT is on */ + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + pr_cont("Knights Landing events, "); + break; + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, 
skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + intel_pmu_lbr_init_skl(); + + x86_pmu.event_constraints = intel_skl_event_constraints; + x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints; + x86_pmu.extra_regs = intel_skl_extra_regs; + x86_pmu.pebs_aliases = intel_pebs_aliases_skl; + x86_pmu.pebs_prec_dist = true; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, + skl_format_attr); + WARN_ON(!x86_pmu.format_attrs); + x86_pmu.cpu_events = hsw_events_attrs; + pr_cont("Skylake events, "); + break; + + default: + switch (x86_pmu.version) { + case 1: + x86_pmu.event_constraints = intel_v1_event_constraints; + pr_cont("generic architected perfmon v1, "); + break; + default: + /* + * default constraints for v2 and up + */ + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("generic architected perfmon, "); + break; + } + } + + if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", + x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); + x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; + } + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + + if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", + x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; + } + + x86_pmu.intel_ctrl |= + ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + + if (x86_pmu.event_constraints) { + /* + * event on fixed counter2 (REF_CYCLES) only works on this + * counter, so do not extend mask to generic counters + */ + for_each_event_constraint(c, x86_pmu.event_constraints) { + if (c->cmask == FIXED_EVENT_FLAGS + && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) { + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + } + c->idxmsk64 &= + ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); + c->weight = hweight64(c->idxmsk64); + } + } + + /* + * Access LBR MSR may cause #GP under certain circumstances. + * E.g. KVM doesn't support LBR MSR + * Check all LBT MSR here. + * Disable LBR access if any LBR MSRs can not be accessed. + */ + if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL)) + x86_pmu.lbr_nr = 0; + for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) && + check_msr(x86_pmu.lbr_to + i, 0xffffUL))) + x86_pmu.lbr_nr = 0; + } + + /* + * Access extra MSR may cause #GP under certain circumstances. + * E.g. KVM doesn't support offcore event + * Check all extra_regs here. 
+ */ + if (x86_pmu.extra_regs) { + for (er = x86_pmu.extra_regs; er->msr; er++) { + er->extra_msr_access = check_msr(er->msr, 0x11UL); + /* Disable LBR select mapping */ + if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) + x86_pmu.lbr_sel_map = NULL; + } + } + + /* Support full width counters using alternative MSR range */ + if (x86_pmu.intel_cap.full_width_write) { + x86_pmu.max_period = x86_pmu.cntval_mask; + x86_pmu.perfctr = MSR_IA32_PMC0; + pr_cont("full-width counters, "); + } + + return 0; +} + +/* + * HT bug: phase 2 init + * Called once we have valid topology information to check + * whether or not HT is enabled + * If HT is off, then we disable the workaround + */ +static __init int fixup_ht_bug(void) +{ + int cpu = smp_processor_id(); + int w, c; + /* + * problem not present on this CPU model, nothing to do + */ + if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED)) + return 0; + + w = cpumask_weight(topology_sibling_cpumask(cpu)); + if (w > 1) { + pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n"); + return 0; + } + + if (lockup_detector_suspend() != 0) { + pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n"); + return 0; + } + + x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); + + x86_pmu.start_scheduling = NULL; + x86_pmu.commit_scheduling = NULL; + x86_pmu.stop_scheduling = NULL; + + lockup_detector_resume(); + + get_online_cpus(); + + for_each_online_cpu(c) { + free_excl_cntrs(c); + } + + put_online_cpus(); + pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n"); + return 0; +} +subsys_initcall(fixup_ht_bug) diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index e28f931..0a27243 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -32,7 +32,7 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_cstate.o diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c deleted file mode 100644 index fed2ab1..0000000 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ /dev/null @@ -1,3773 +0,0 @@ -/* - * Per core/cpu state - * - * Used to coordinate shared registers between HT threads or - * among events on a single PMU. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "perf_event.h" - -/* - * Intel PerfMon, used on Core and later. 
- */ -static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, - [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ -}; - -static struct event_constraint intel_core_event_constraints[] __read_mostly = -{ - INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ - INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ - INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_core2_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ - INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ - INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ - INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ - INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */ - INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ - INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ - INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ - INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ - INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */ - INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ - INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ - INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ - EVENT_CONSTRAINT_END -}; - -static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = -{ - /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), - INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), - EVENT_EXTRA_END -}; - -static struct event_constraint intel_westmere_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ - INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ - INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ - INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_snb_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - 
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ - INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ - INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ - INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ - INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ - INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ - - INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ - - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_ivb_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */ - INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */ - INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ - INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ - - INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ - - EVENT_CONSTRAINT_END -}; - -static struct extra_reg intel_westmere_extra_regs[] __read_mostly = -{ - /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), - INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), - EVENT_EXTRA_END -}; - -static struct event_constraint intel_v1_event_constraints[] __read_mostly = -{ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_gen_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - EVENT_CONSTRAINT_END -}; - -static struct event_constraint intel_slm_event_constraints[] __read_mostly = -{ - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_skl_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0x00c0, 0), 
/* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ - EVENT_CONSTRAINT_END -}; - -static struct extra_reg intel_knl_extra_regs[] __read_mostly = { - INTEL_UEVENT_EXTRA_REG(0x01b7, - MSR_OFFCORE_RSP_0, 0x7f9ffbffffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x02b7, - MSR_OFFCORE_RSP_1, 0x3f9ffbffffull, RSP_1), - EVENT_EXTRA_END -}; - -static struct extra_reg intel_snb_extra_regs[] __read_mostly = { - /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), - INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), - EVENT_EXTRA_END -}; - -static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { - /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), - INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), - EVENT_EXTRA_END -}; - -static struct extra_reg intel_skl_extra_regs[] __read_mostly = { - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), - INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), - /* - * Note the low 8 bits eventsel code is not a continuous field, containing - * some #GPing bits. These are masked out. - */ - INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE), - EVENT_EXTRA_END -}; - -EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); -EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); -EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); - -struct attribute *nhm_events_attrs[] = { - EVENT_PTR(mem_ld_nhm), - NULL, -}; - -struct attribute *snb_events_attrs[] = { - EVENT_PTR(mem_ld_snb), - EVENT_PTR(mem_st_snb), - NULL, -}; - -static struct event_constraint intel_hsw_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ - INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ - /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), - /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ - INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), - /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ - INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), - - INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ - - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_bdw_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ - INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_UBIT_EVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */ - EVENT_CONSTRAINT_END -}; - -static u64 
intel_pmu_event_map(int hw_event) -{ - return intel_perfmon_event_map[hw_event]; -} - -/* - * Notes on the events: - * - data reads do not include code reads (comparable to earlier tables) - * - data counts include speculative execution (except L1 write, dtlb, bpu) - * - remote node access includes remote memory, remote cache, remote mmio. - * - prefetches are not included in the counts. - * - icache miss does not include decoded icache - */ - -#define SKL_DEMAND_DATA_RD BIT_ULL(0) -#define SKL_DEMAND_RFO BIT_ULL(1) -#define SKL_ANY_RESPONSE BIT_ULL(16) -#define SKL_SUPPLIER_NONE BIT_ULL(17) -#define SKL_L3_MISS_LOCAL_DRAM BIT_ULL(26) -#define SKL_L3_MISS_REMOTE_HOP0_DRAM BIT_ULL(27) -#define SKL_L3_MISS_REMOTE_HOP1_DRAM BIT_ULL(28) -#define SKL_L3_MISS_REMOTE_HOP2P_DRAM BIT_ULL(29) -#define SKL_L3_MISS (SKL_L3_MISS_LOCAL_DRAM| \ - SKL_L3_MISS_REMOTE_HOP0_DRAM| \ - SKL_L3_MISS_REMOTE_HOP1_DRAM| \ - SKL_L3_MISS_REMOTE_HOP2P_DRAM) -#define SKL_SPL_HIT BIT_ULL(30) -#define SKL_SNOOP_NONE BIT_ULL(31) -#define SKL_SNOOP_NOT_NEEDED BIT_ULL(32) -#define SKL_SNOOP_MISS BIT_ULL(33) -#define SKL_SNOOP_HIT_NO_FWD BIT_ULL(34) -#define SKL_SNOOP_HIT_WITH_FWD BIT_ULL(35) -#define SKL_SNOOP_HITM BIT_ULL(36) -#define SKL_SNOOP_NON_DRAM BIT_ULL(37) -#define SKL_ANY_SNOOP (SKL_SPL_HIT|SKL_SNOOP_NONE| \ - SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ - SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ - SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM) -#define SKL_DEMAND_READ SKL_DEMAND_DATA_RD -#define SKL_SNOOP_DRAM (SKL_SNOOP_NONE| \ - SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \ - SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \ - SKL_SNOOP_HITM|SKL_SPL_HIT) -#define SKL_DEMAND_WRITE SKL_DEMAND_RFO -#define SKL_LLC_ACCESS SKL_ANY_RESPONSE -#define SKL_L3_MISS_REMOTE (SKL_L3_MISS_REMOTE_HOP0_DRAM| \ - SKL_L3_MISS_REMOTE_HOP1_DRAM| \ - SKL_L3_MISS_REMOTE_HOP2P_DRAM) - -static __initconst const u64 skl_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x283, /* ICACHE_64B.MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ 
C(RESULT_ACCESS) ] = 0x2085, /* ITLB_MISSES.STLB_HIT */ - [ C(RESULT_MISS) ] = 0xe85, /* ITLB_MISSES.WALK_COMPLETED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, -}; - -static __initconst const u64 skl_hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| - SKL_LLC_ACCESS|SKL_ANY_SNOOP, - [ C(RESULT_MISS) ] = SKL_DEMAND_READ| - SKL_L3_MISS|SKL_ANY_SNOOP| - SKL_SUPPLIER_NONE, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| - SKL_LLC_ACCESS|SKL_ANY_SNOOP, - [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE| - SKL_L3_MISS|SKL_ANY_SNOOP| - SKL_SUPPLIER_NONE, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ| - SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, - [ C(RESULT_MISS) ] = SKL_DEMAND_READ| - SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE| - SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM, - [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE| - SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, -}; - -#define SNB_DMND_DATA_RD (1ULL << 0) -#define SNB_DMND_RFO (1ULL << 1) -#define SNB_DMND_IFETCH (1ULL << 2) -#define SNB_DMND_WB (1ULL << 3) -#define SNB_PF_DATA_RD (1ULL << 4) -#define SNB_PF_RFO (1ULL << 5) -#define SNB_PF_IFETCH (1ULL << 6) -#define SNB_LLC_DATA_RD (1ULL << 7) -#define SNB_LLC_RFO (1ULL << 8) -#define SNB_LLC_IFETCH (1ULL << 9) -#define SNB_BUS_LOCKS (1ULL << 10) -#define SNB_STRM_ST (1ULL << 11) -#define SNB_OTHER (1ULL << 15) -#define SNB_RESP_ANY (1ULL << 16) -#define SNB_NO_SUPP (1ULL << 17) -#define SNB_LLC_HITM (1ULL << 18) -#define SNB_LLC_HITE (1ULL << 19) -#define SNB_LLC_HITS (1ULL << 20) -#define SNB_LLC_HITF (1ULL << 21) -#define SNB_LOCAL (1ULL << 22) -#define SNB_REMOTE (0xffULL << 23) -#define SNB_SNP_NONE (1ULL << 31) -#define SNB_SNP_NOT_NEEDED (1ULL << 32) -#define SNB_SNP_MISS (1ULL << 33) -#define SNB_NO_FWD (1ULL << 34) -#define SNB_SNP_FWD (1ULL << 35) -#define SNB_HITM (1ULL << 36) -#define SNB_NON_DRAM (1ULL << 37) - -#define SNB_DMND_READ (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD) -#define SNB_DMND_WRITE (SNB_DMND_RFO|SNB_LLC_RFO) -#define SNB_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) - -#define SNB_SNP_ANY (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \ - SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \ - SNB_HITM) - -#define SNB_DRAM_ANY (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY) -#define SNB_DRAM_REMOTE (SNB_REMOTE|SNB_SNP_ANY) - -#define 
SNB_L3_ACCESS SNB_RESP_ANY -#define SNB_L3_MISS (SNB_DRAM_ANY|SNB_NON_DRAM) - -static __initconst const u64 snb_hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS, - [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_L3_MISS, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS, - [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_L3_MISS, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS, - [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_L3_MISS, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY, - [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_DRAM_REMOTE, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY, - [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY, - [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE, - }, - }, -}; - -static __initconst const u64 snb_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */ - [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */ - [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_WRITE) ] = { - /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */ - [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) 
] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, - -}; - -/* - * Notes on the events: - * - data reads do not include code reads (comparable to earlier tables) - * - data counts include speculative execution (except L1 write, dtlb, bpu) - * - remote node access includes remote memory, remote cache, remote mmio. - * - prefetches are not included in the counts because they are not - * reliably counted. - */ - -#define HSW_DEMAND_DATA_RD BIT_ULL(0) -#define HSW_DEMAND_RFO BIT_ULL(1) -#define HSW_ANY_RESPONSE BIT_ULL(16) -#define HSW_SUPPLIER_NONE BIT_ULL(17) -#define HSW_L3_MISS_LOCAL_DRAM BIT_ULL(22) -#define HSW_L3_MISS_REMOTE_HOP0 BIT_ULL(27) -#define HSW_L3_MISS_REMOTE_HOP1 BIT_ULL(28) -#define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29) -#define HSW_L3_MISS (HSW_L3_MISS_LOCAL_DRAM| \ - HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \ - HSW_L3_MISS_REMOTE_HOP2P) -#define HSW_SNOOP_NONE BIT_ULL(31) -#define HSW_SNOOP_NOT_NEEDED BIT_ULL(32) -#define HSW_SNOOP_MISS BIT_ULL(33) -#define HSW_SNOOP_HIT_NO_FWD BIT_ULL(34) -#define HSW_SNOOP_HIT_WITH_FWD BIT_ULL(35) -#define HSW_SNOOP_HITM BIT_ULL(36) -#define HSW_SNOOP_NON_DRAM BIT_ULL(37) -#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \ - HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \ - HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \ - HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM) -#define HSW_SNOOP_DRAM (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM) -#define HSW_DEMAND_READ HSW_DEMAND_DATA_RD -#define HSW_DEMAND_WRITE HSW_DEMAND_RFO -#define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\ - HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P) -#define HSW_LLC_ACCESS HSW_ANY_RESPONSE - -#define BDW_L3_MISS_LOCAL BIT(26) -#define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \ - HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \ - HSW_L3_MISS_REMOTE_HOP2P) - - -static __initconst const u64 hsw_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */ - }, - [ 
C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */ - [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, -}; - -static __initconst const u64 hsw_hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ| - HSW_LLC_ACCESS, - [ C(RESULT_MISS) ] = HSW_DEMAND_READ| - HSW_L3_MISS|HSW_ANY_SNOOP, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE| - HSW_LLC_ACCESS, - [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE| - HSW_L3_MISS|HSW_ANY_SNOOP, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ| - HSW_L3_MISS_LOCAL_DRAM| - HSW_SNOOP_DRAM, - [ C(RESULT_MISS) ] = HSW_DEMAND_READ| - HSW_L3_MISS_REMOTE| - HSW_SNOOP_DRAM, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE| - HSW_L3_MISS_LOCAL_DRAM| - HSW_SNOOP_DRAM, - [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE| - HSW_L3_MISS_REMOTE| - HSW_SNOOP_DRAM, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, -}; - -static __initconst const u64 westmere_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ - [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ - [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - /* - * Use RFO, not 
WRITEBACK, because a write miss would typically occur - * on RFO. - */ - [ C(OP_WRITE) ] = { - /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ - [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, -}; - -/* - * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits; - * See IA32 SDM Vol 3B 30.6.1.3 - */ - -#define NHM_DMND_DATA_RD (1 << 0) -#define NHM_DMND_RFO (1 << 1) -#define NHM_DMND_IFETCH (1 << 2) -#define NHM_DMND_WB (1 << 3) -#define NHM_PF_DATA_RD (1 << 4) -#define NHM_PF_DATA_RFO (1 << 5) -#define NHM_PF_IFETCH (1 << 6) -#define NHM_OFFCORE_OTHER (1 << 7) -#define NHM_UNCORE_HIT (1 << 8) -#define NHM_OTHER_CORE_HIT_SNP (1 << 9) -#define NHM_OTHER_CORE_HITM (1 << 10) - /* reserved */ -#define NHM_REMOTE_CACHE_FWD (1 << 12) -#define NHM_REMOTE_DRAM (1 << 13) -#define NHM_LOCAL_DRAM (1 << 14) -#define NHM_NON_DRAM (1 << 15) - -#define NHM_LOCAL (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD) -#define NHM_REMOTE (NHM_REMOTE_DRAM) - -#define NHM_DMND_READ (NHM_DMND_DATA_RD) -#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB) -#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO) - -#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM) -#define NHM_L3_MISS (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD) -#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS) - -static __initconst const u64 nehalem_hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS, - [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS, - [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, - [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ 
C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE, - [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE, - [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE, - [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE, - }, - }, -}; - -static __initconst const u64 nehalem_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ - [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ - [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - /* - * Use RFO, not WRITEBACK, because a write miss would typically occur - * on RFO. - */ - [ C(OP_WRITE) ] = { - /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, -}; - -static __initconst const u64 core2_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ 
C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static __initconst const u64 atom_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ - [ 
C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static struct extra_reg intel_slm_extra_regs[] __read_mostly = -{ - /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1), - EVENT_EXTRA_END -}; - -#define SLM_DMND_READ SNB_DMND_DATA_RD -#define SLM_DMND_WRITE SNB_DMND_RFO -#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) - -#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM) -#define SLM_LLC_ACCESS SNB_RESP_ANY -#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM) - -static __initconst const u64 slm_hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS, - [ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS, - [ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS, - }, - }, -}; - -static __initconst const u64 slm_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */ - [ C(RESULT_MISS) ] = 0x0280, /* ICACGE.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - [ C(OP_PREFETCH) ] = { - /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ - [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, 
- }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x40205, /* PAGE_WALKS.I_SIDE_WALKS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ -#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ -#define KNL_MCDRAM_LOCAL BIT_ULL(21) -#define KNL_MCDRAM_FAR BIT_ULL(22) -#define KNL_DDR_LOCAL BIT_ULL(23) -#define KNL_DDR_FAR BIT_ULL(24) -#define KNL_DRAM_ANY (KNL_MCDRAM_LOCAL | KNL_MCDRAM_FAR | \ - KNL_DDR_LOCAL | KNL_DDR_FAR) -#define KNL_L2_READ SLM_DMND_READ -#define KNL_L2_WRITE SLM_DMND_WRITE -#define KNL_L2_PREFETCH SLM_DMND_PREFETCH -#define KNL_L2_ACCESS SLM_LLC_ACCESS -#define KNL_L2_MISS (KNL_OT_L2_HITE | KNL_OT_L2_HITF | \ - KNL_DRAM_ANY | SNB_SNP_ANY | \ - SNB_NON_DRAM) - -static __initconst const u64 knl_hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = { - [C(LL)] = { - [C(OP_READ)] = { - [C(RESULT_ACCESS)] = KNL_L2_READ | KNL_L2_ACCESS, - [C(RESULT_MISS)] = 0, - }, - [C(OP_WRITE)] = { - [C(RESULT_ACCESS)] = KNL_L2_WRITE | KNL_L2_ACCESS, - [C(RESULT_MISS)] = KNL_L2_WRITE | KNL_L2_MISS, - }, - [C(OP_PREFETCH)] = { - [C(RESULT_ACCESS)] = KNL_L2_PREFETCH | KNL_L2_ACCESS, - [C(RESULT_MISS)] = KNL_L2_PREFETCH | KNL_L2_MISS, - }, - }, -}; - -/* - * Use from PMIs where the LBRs are already disabled. - */ -static void __intel_pmu_disable_all(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) - intel_pmu_disable_bts(); - else - intel_bts_disable_local(); - - intel_pmu_pebs_disable_all(); -} - -static void intel_pmu_disable_all(void) -{ - __intel_pmu_disable_all(); - intel_pmu_lbr_disable_all(); -} - -static void __intel_pmu_enable_all(int added, bool pmi) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - intel_pmu_pebs_enable_all(); - intel_pmu_lbr_enable_all(pmi); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, - x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); - - if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - struct perf_event *event = - cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; - - if (WARN_ON_ONCE(!event)) - return; - - intel_pmu_enable_bts(event->hw.config); - } else - intel_bts_enable_local(); -} - -static void intel_pmu_enable_all(int added) -{ - __intel_pmu_enable_all(added, false); -} - -/* - * Workaround for: - * Intel Errata AAK100 (model 26) - * Intel Errata AAP53 (model 30) - * Intel Errata BD53 (model 44) - * - * The official story: - * These chips need to be 'reset' when adding counters by programming the - * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either - * in sequence on the same PMC or on different PMCs. - * - * In practise it appears some of these events do in fact count, and - * we need to programm all 4 events. 
- */ -static void intel_pmu_nhm_workaround(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - static const unsigned long nhm_magic[4] = { - 0x4300B5, - 0x4300D2, - 0x4300B1, - 0x4300B1 - }; - struct perf_event *event; - int i; - - /* - * The Errata requires below steps: - * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL; - * 2) Configure 4 PERFEVTSELx with the magic events and clear - * the corresponding PMCx; - * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL; - * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL; - * 5) Clear 4 pairs of ERFEVTSELx and PMCx; - */ - - /* - * The real steps we choose are a little different from above. - * A) To reduce MSR operations, we don't run step 1) as they - * are already cleared before this function is called; - * B) Call x86_perf_event_update to save PMCx before configuring - * PERFEVTSELx with magic number; - * C) With step 5), we do clear only when the PERFEVTSELx is - * not used currently. - * D) Call x86_perf_event_set_period to restore PMCx; - */ - - /* We always operate 4 pairs of PERF Counters */ - for (i = 0; i < 4; i++) { - event = cpuc->events[i]; - if (event) - x86_perf_event_update(event); - } - - for (i = 0; i < 4; i++) { - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]); - wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0); - } - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); - - for (i = 0; i < 4; i++) { - event = cpuc->events[i]; - - if (event) { - x86_perf_event_set_period(event); - __x86_pmu_enable_event(&event->hw, - ARCH_PERFMON_EVENTSEL_ENABLE); - } else - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0); - } -} - -static void intel_pmu_nhm_enable_all(int added) -{ - if (added) - intel_pmu_nhm_workaround(); - intel_pmu_enable_all(added); -} - -static inline u64 intel_pmu_get_status(void) -{ - u64 status; - - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - - return status; -} - -static inline void intel_pmu_ack_status(u64 ack) -{ - wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); -} - -static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) -{ - int idx = hwc->idx - INTEL_PMC_IDX_FIXED; - u64 ctrl_val, mask; - - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - wrmsrl(hwc->config_base, ctrl_val); -} - -static inline bool event_is_checkpointed(struct perf_event *event) -{ - return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0; -} - -static void intel_pmu_disable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { - intel_pmu_disable_bts(); - intel_pmu_drain_bts_buffer(); - return; - } - - cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); - cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); - cpuc->intel_cp_status &= ~(1ull << hwc->idx); - - /* - * must disable before any actual event - * because any event may be combined with LBR - */ - if (needs_branch_stack(event)) - intel_pmu_lbr_disable(event); - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_disable_fixed(hwc); - return; - } - - x86_pmu_disable_event(event); - - if (unlikely(event->attr.precise_ip)) - intel_pmu_pebs_disable(event); -} - -static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) -{ - int idx = hwc->idx - INTEL_PMC_IDX_FIXED; - u64 ctrl_val, bits, mask; - - /* - * Enable IRQ generation (0x8), - * and enable ring-3 counting (0x2) and ring-0 counting (0x1) - * if requested: - */ - bits = 
0x8ULL; - if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) - bits |= 0x2; - if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) - bits |= 0x1; - - /* - * ANY bit is supported in v3 and up - */ - if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) - bits |= 0x4; - - bits <<= (idx * 4); - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - ctrl_val |= bits; - wrmsrl(hwc->config_base, ctrl_val); -} - -static void intel_pmu_enable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { - if (!__this_cpu_read(cpu_hw_events.enabled)) - return; - - intel_pmu_enable_bts(hwc->config); - return; - } - /* - * must enabled before any actual event - * because any event may be combined with LBR - */ - if (needs_branch_stack(event)) - intel_pmu_lbr_enable(event); - - if (event->attr.exclude_host) - cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); - if (event->attr.exclude_guest) - cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx); - - if (unlikely(event_is_checkpointed(event))) - cpuc->intel_cp_status |= (1ull << hwc->idx); - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_enable_fixed(hwc); - return; - } - - if (unlikely(event->attr.precise_ip)) - intel_pmu_pebs_enable(event); - - __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); -} - -/* - * Save and restart an expired event. Called by NMI contexts, - * so it has to be careful about preempting normal event ops: - */ -int intel_pmu_save_and_restart(struct perf_event *event) -{ - x86_perf_event_update(event); - /* - * For a checkpointed counter always reset back to 0. This - * avoids a situation where the counter overflows, aborts the - * transaction and is then set back to shortly before the - * overflow, and overflows and aborts again. - */ - if (unlikely(event_is_checkpointed(event))) { - /* No race with NMIs because the counter should not be armed */ - wrmsrl(event->hw.event_base, 0); - local64_set(&event->hw.prev_count, 0); - } - return x86_perf_event_set_period(event); -} - -static void intel_pmu_reset(void) -{ - struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); - unsigned long flags; - int idx; - - if (!x86_pmu.num_counters) - return; - - local_irq_save(flags); - - pr_info("clearing PMU state on CPU#%d\n", smp_processor_id()); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); - wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); - } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) - wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); - - if (ds) - ds->bts_index = ds->bts_buffer_base; - - /* Ack all overflows and disable fixed counters */ - if (x86_pmu.version >= 2) { - intel_pmu_ack_status(intel_pmu_get_status()); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - } - - /* Reset LBRs and LBR freezing */ - if (x86_pmu.lbr_nr) { - update_debugctlmsr(get_debugctlmsr() & - ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR)); - } - - local_irq_restore(flags); -} - -/* - * This handler is triggered by the local APIC, so the APIC IRQ handling - * rules apply: - */ -static int intel_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - int bit, loops; - u64 status; - int handled; - - cpuc = this_cpu_ptr(&cpu_hw_events); - - /* - * No known reason to not always do late ACK, - * but just in case do it opt-in. 
- */ - if (!x86_pmu.late_ack) - apic_write(APIC_LVTPC, APIC_DM_NMI); - __intel_pmu_disable_all(); - handled = intel_pmu_drain_bts_buffer(); - handled += intel_bts_interrupt(); - status = intel_pmu_get_status(); - if (!status) - goto done; - - loops = 0; -again: - intel_pmu_lbr_read(); - intel_pmu_ack_status(status); - if (++loops > 100) { - static bool warned = false; - if (!warned) { - WARN(1, "perfevents: irq loop stuck!\n"); - perf_event_print_debug(); - warned = true; - } - intel_pmu_reset(); - goto done; - } - - inc_irq_stat(apic_perf_irqs); - - - /* - * Ignore a range of extra bits in status that do not indicate - * overflow by themselves. - */ - status &= ~(GLOBAL_STATUS_COND_CHG | - GLOBAL_STATUS_ASIF | - GLOBAL_STATUS_LBRS_FROZEN); - if (!status) - goto done; - - /* - * PEBS overflow sets bit 62 in the global status register - */ - if (__test_and_clear_bit(62, (unsigned long *)&status)) { - handled++; - x86_pmu.drain_pebs(regs); - } - - /* - * Intel PT - */ - if (__test_and_clear_bit(55, (unsigned long *)&status)) { - handled++; - intel_pt_interrupt(); - } - - /* - * Checkpointed counters can lead to 'spurious' PMIs because the - * rollback caused by the PMI will have cleared the overflow status - * bit. Therefore always force probe these counters. - */ - status |= cpuc->intel_cp_status; - - for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { - struct perf_event *event = cpuc->events[bit]; - - handled++; - - if (!test_bit(bit, cpuc->active_mask)) - continue; - - if (!intel_pmu_save_and_restart(event)) - continue; - - perf_sample_data_init(&data, 0, event->hw.last_period); - - if (has_branch_stack(event)) - data.br_stack = &cpuc->lbr_stack; - - if (perf_event_overflow(event, &data, regs)) - x86_pmu_stop(event, 0); - } - - /* - * Repeat if there is more work to be done: - */ - status = intel_pmu_get_status(); - if (status) - goto again; - -done: - __intel_pmu_enable_all(0, true); - /* - * Only unmask the NMI after the overflow counters - * have been reset. This avoids spurious NMIs on - * Haswell CPUs. 
- */ - if (x86_pmu.late_ack) - apic_write(APIC_LVTPC, APIC_DM_NMI); - return handled; -} - -static struct event_constraint * -intel_bts_constraints(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - unsigned int hw_event, bts_event; - - if (event->attr.freq) - return NULL; - - hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; - bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); - - if (unlikely(hw_event == bts_event && hwc->sample_period == 1)) - return &bts_constraint; - - return NULL; -} - -static int intel_alt_er(int idx, u64 config) -{ - int alt_idx = idx; - - if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) - return idx; - - if (idx == EXTRA_REG_RSP_0) - alt_idx = EXTRA_REG_RSP_1; - - if (idx == EXTRA_REG_RSP_1) - alt_idx = EXTRA_REG_RSP_0; - - if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask) - return idx; - - return alt_idx; -} - -static void intel_fixup_er(struct perf_event *event, int idx) -{ - event->hw.extra_reg.idx = idx; - - if (idx == EXTRA_REG_RSP_0) { - event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event; - event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; - } else if (idx == EXTRA_REG_RSP_1) { - event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event; - event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; - } -} - -/* - * manage allocation of shared extra msr for certain events - * - * sharing can be: - * per-cpu: to be shared between the various events on a single PMU - * per-core: per-cpu + shared by HT threads - */ -static struct event_constraint * -__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event, - struct hw_perf_event_extra *reg) -{ - struct event_constraint *c = &emptyconstraint; - struct er_account *era; - unsigned long flags; - int idx = reg->idx; - - /* - * reg->alloc can be set due to existing state, so for fake cpuc we - * need to ignore this, otherwise we might fail to allocate proper fake - * state for this extra reg constraint. Also see the comment below. - */ - if (reg->alloc && !cpuc->is_fake) - return NULL; /* call x86_get_event_constraint() */ - -again: - era = &cpuc->shared_regs->regs[idx]; - /* - * we use spin_lock_irqsave() to avoid lockdep issues when - * passing a fake cpuc - */ - raw_spin_lock_irqsave(&era->lock, flags); - - if (!atomic_read(&era->ref) || era->config == reg->config) { - - /* - * If its a fake cpuc -- as per validate_{group,event}() we - * shouldn't touch event state and we can avoid doing so - * since both will only call get_event_constraints() once - * on each event, this avoids the need for reg->alloc. - * - * Not doing the ER fixup will only result in era->reg being - * wrong, but since we won't actually try and program hardware - * this isn't a problem either. - */ - if (!cpuc->is_fake) { - if (idx != reg->idx) - intel_fixup_er(event, idx); - - /* - * x86_schedule_events() can call get_event_constraints() - * multiple times on events in the case of incremental - * scheduling(). reg->alloc ensures we only do the ER - * allocation once. 
- */ - reg->alloc = 1; - } - - /* lock in msr value */ - era->config = reg->config; - era->reg = reg->reg; - - /* one more user */ - atomic_inc(&era->ref); - - /* - * need to call x86_get_event_constraint() - * to check if associated event has constraints - */ - c = NULL; - } else { - idx = intel_alt_er(idx, reg->config); - if (idx != reg->idx) { - raw_spin_unlock_irqrestore(&era->lock, flags); - goto again; - } - } - raw_spin_unlock_irqrestore(&era->lock, flags); - - return c; -} - -static void -__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc, - struct hw_perf_event_extra *reg) -{ - struct er_account *era; - - /* - * Only put constraint if extra reg was actually allocated. Also takes - * care of event which do not use an extra shared reg. - * - * Also, if this is a fake cpuc we shouldn't touch any event state - * (reg->alloc) and we don't care about leaving inconsistent cpuc state - * either since it'll be thrown out. - */ - if (!reg->alloc || cpuc->is_fake) - return; - - era = &cpuc->shared_regs->regs[reg->idx]; - - /* one fewer user */ - atomic_dec(&era->ref); - - /* allocate again next time */ - reg->alloc = 0; -} - -static struct event_constraint * -intel_shared_regs_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - struct event_constraint *c = NULL, *d; - struct hw_perf_event_extra *xreg, *breg; - - xreg = &event->hw.extra_reg; - if (xreg->idx != EXTRA_REG_NONE) { - c = __intel_shared_reg_get_constraints(cpuc, event, xreg); - if (c == &emptyconstraint) - return c; - } - breg = &event->hw.branch_reg; - if (breg->idx != EXTRA_REG_NONE) { - d = __intel_shared_reg_get_constraints(cpuc, event, breg); - if (d == &emptyconstraint) { - __intel_shared_reg_put_constraints(cpuc, xreg); - c = d; - } - } - return c; -} - -struct event_constraint * -x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event) -{ - struct event_constraint *c; - - if (x86_pmu.event_constraints) { - for_each_event_constraint(c, x86_pmu.event_constraints) { - if ((event->hw.config & c->cmask) == c->code) { - event->hw.flags |= c->flags; - return c; - } - } - } - - return &unconstrained; -} - -static struct event_constraint * -__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event) -{ - struct event_constraint *c; - - c = intel_bts_constraints(event); - if (c) - return c; - - c = intel_shared_regs_constraints(cpuc, event); - if (c) - return c; - - c = intel_pebs_constraints(event); - if (c) - return c; - - return x86_get_event_constraints(cpuc, idx, event); -} - -static void -intel_start_scheduling(struct cpu_hw_events *cpuc) -{ - struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xl; - int tid = cpuc->excl_thread_id; - - /* - * nothing needed if in group validation mode - */ - if (cpuc->is_fake || !is_ht_workaround_enabled()) - return; - - /* - * no exclusion needed - */ - if (WARN_ON_ONCE(!excl_cntrs)) - return; - - xl = &excl_cntrs->states[tid]; - - xl->sched_started = true; - /* - * lock shared state until we are done scheduling - * in stop_event_scheduling() - * makes scheduling appear as a transaction - */ - raw_spin_lock(&excl_cntrs->lock); -} - -static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) -{ - struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct event_constraint *c = cpuc->event_constraint[idx]; - struct intel_excl_states *xl; - int tid = cpuc->excl_thread_id; - - if (cpuc->is_fake || !is_ht_workaround_enabled()) - return; - 
- if (WARN_ON_ONCE(!excl_cntrs)) - return; - - if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) - return; - - xl = &excl_cntrs->states[tid]; - - lockdep_assert_held(&excl_cntrs->lock); - - if (c->flags & PERF_X86_EVENT_EXCL) - xl->state[cntr] = INTEL_EXCL_EXCLUSIVE; - else - xl->state[cntr] = INTEL_EXCL_SHARED; -} - -static void -intel_stop_scheduling(struct cpu_hw_events *cpuc) -{ - struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xl; - int tid = cpuc->excl_thread_id; - - /* - * nothing needed if in group validation mode - */ - if (cpuc->is_fake || !is_ht_workaround_enabled()) - return; - /* - * no exclusion needed - */ - if (WARN_ON_ONCE(!excl_cntrs)) - return; - - xl = &excl_cntrs->states[tid]; - - xl->sched_started = false; - /* - * release shared state lock (acquired in intel_start_scheduling()) - */ - raw_spin_unlock(&excl_cntrs->lock); -} - -static struct event_constraint * -intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, - int idx, struct event_constraint *c) -{ - struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xlo; - int tid = cpuc->excl_thread_id; - int is_excl, i; - - /* - * validating a group does not require - * enforcing cross-thread exclusion - */ - if (cpuc->is_fake || !is_ht_workaround_enabled()) - return c; - - /* - * no exclusion needed - */ - if (WARN_ON_ONCE(!excl_cntrs)) - return c; - - /* - * because we modify the constraint, we need - * to make a copy. Static constraints come - * from static const tables. - * - * only needed when constraint has not yet - * been cloned (marked dynamic) - */ - if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) { - struct event_constraint *cx; - - /* - * grab pre-allocated constraint entry - */ - cx = &cpuc->constraint_list[idx]; - - /* - * initialize dynamic constraint - * with static constraint - */ - *cx = *c; - - /* - * mark constraint as dynamic, so we - * can free it later on - */ - cx->flags |= PERF_X86_EVENT_DYNAMIC; - c = cx; - } - - /* - * From here on, the constraint is dynamic. 
- * Either it was just allocated above, or it - * was allocated during a earlier invocation - * of this function - */ - - /* - * state of sibling HT - */ - xlo = &excl_cntrs->states[tid ^ 1]; - - /* - * event requires exclusive counter access - * across HT threads - */ - is_excl = c->flags & PERF_X86_EVENT_EXCL; - if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) { - event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT; - if (!cpuc->n_excl++) - WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1); - } - - /* - * Modify static constraint with current dynamic - * state of thread - * - * EXCLUSIVE: sibling counter measuring exclusive event - * SHARED : sibling counter measuring non-exclusive event - * UNUSED : sibling counter unused - */ - for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) { - /* - * exclusive event in sibling counter - * our corresponding counter cannot be used - * regardless of our event - */ - if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) - __clear_bit(i, c->idxmsk); - /* - * if measuring an exclusive event, sibling - * measuring non-exclusive, then counter cannot - * be used - */ - if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) - __clear_bit(i, c->idxmsk); - } - - /* - * recompute actual bit weight for scheduling algorithm - */ - c->weight = hweight64(c->idxmsk64); - - /* - * if we return an empty mask, then switch - * back to static empty constraint to avoid - * the cost of freeing later on - */ - if (c->weight == 0) - c = &emptyconstraint; - - return c; -} - -static struct event_constraint * -intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event) -{ - struct event_constraint *c1 = NULL; - struct event_constraint *c2; - - if (idx >= 0) /* fake does < 0 */ - c1 = cpuc->event_constraint[idx]; - - /* - * first time only - * - static constraint: no change across incremental scheduling calls - * - dynamic constraint: handled by intel_get_excl_constraints() - */ - c2 = __intel_get_event_constraints(cpuc, idx, event); - if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) { - bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX); - c1->weight = c2->weight; - c2 = c1; - } - - if (cpuc->excl_cntrs) - return intel_get_excl_constraints(cpuc, event, idx, c2); - - return c2; -} - -static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - int tid = cpuc->excl_thread_id; - struct intel_excl_states *xl; - - /* - * nothing needed if in group validation mode - */ - if (cpuc->is_fake) - return; - - if (WARN_ON_ONCE(!excl_cntrs)) - return; - - if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) { - hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT; - if (!--cpuc->n_excl) - WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0); - } - - /* - * If event was actually assigned, then mark the counter state as - * unused now. - */ - if (hwc->idx >= 0) { - xl = &excl_cntrs->states[tid]; - - /* - * put_constraint may be called from x86_schedule_events() - * which already has the lock held so here make locking - * conditional. 
- */ - if (!xl->sched_started) - raw_spin_lock(&excl_cntrs->lock); - - xl->state[hwc->idx] = INTEL_EXCL_UNUSED; - - if (!xl->sched_started) - raw_spin_unlock(&excl_cntrs->lock); - } -} - -static void -intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - struct hw_perf_event_extra *reg; - - reg = &event->hw.extra_reg; - if (reg->idx != EXTRA_REG_NONE) - __intel_shared_reg_put_constraints(cpuc, reg); - - reg = &event->hw.branch_reg; - if (reg->idx != EXTRA_REG_NONE) - __intel_shared_reg_put_constraints(cpuc, reg); -} - -static void intel_put_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) -{ - intel_put_shared_regs_event_constraints(cpuc, event); - - /* - * is PMU has exclusive counter restrictions, then - * all events are subject to and must call the - * put_excl_constraints() routine - */ - if (cpuc->excl_cntrs) - intel_put_excl_constraints(cpuc, event); -} - -static void intel_pebs_aliases_core2(struct perf_event *event) -{ - if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { - /* - * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P - * (0x003c) so that we can use it with PEBS. - * - * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't - * PEBS capable. However we can use INST_RETIRED.ANY_P - * (0x00c0), which is a PEBS capable event, to get the same - * count. - * - * INST_RETIRED.ANY_P counts the number of cycles that retires - * CNTMASK instructions. By setting CNTMASK to a value (16) - * larger than the maximum number of instructions that can be - * retired per cycle (4) and then inverting the condition, we - * count all cycles that retire 16 or less instructions, which - * is every cycle. - * - * Thereby we gain a PEBS capable cycle counter. - */ - u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); - - alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); - event->hw.config = alt_config; - } -} - -static void intel_pebs_aliases_snb(struct perf_event *event) -{ - if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { - /* - * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P - * (0x003c) so that we can use it with PEBS. - * - * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't - * PEBS capable. However we can use UOPS_RETIRED.ALL - * (0x01c2), which is a PEBS capable event, to get the same - * count. - * - * UOPS_RETIRED.ALL counts the number of cycles that retires - * CNTMASK micro-ops. By setting CNTMASK to a value (16) - * larger than the maximum number of micro-ops that can be - * retired per cycle (4) and then inverting the condition, we - * count all cycles that retire 16 or less micro-ops, which - * is every cycle. - * - * Thereby we gain a PEBS capable cycle counter. - */ - u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16); - - alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); - event->hw.config = alt_config; - } -} - -static void intel_pebs_aliases_precdist(struct perf_event *event) -{ - if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { - /* - * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P - * (0x003c) so that we can use it with PEBS. - * - * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't - * PEBS capable. However we can use INST_RETIRED.PREC_DIST - * (0x01c0), which is a PEBS capable event, to get the same - * count. - * - * The PREC_DIST event has special support to minimize sample - * shadowing effects. 
One drawback is that it can be - * only programmed on counter 1, but that seems like an - * acceptable trade off. - */ - u64 alt_config = X86_CONFIG(.event=0xc0, .umask=0x01, .inv=1, .cmask=16); - - alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); - event->hw.config = alt_config; - } -} - -static void intel_pebs_aliases_ivb(struct perf_event *event) -{ - if (event->attr.precise_ip < 3) - return intel_pebs_aliases_snb(event); - return intel_pebs_aliases_precdist(event); -} - -static void intel_pebs_aliases_skl(struct perf_event *event) -{ - if (event->attr.precise_ip < 3) - return intel_pebs_aliases_core2(event); - return intel_pebs_aliases_precdist(event); -} - -static unsigned long intel_pmu_free_running_flags(struct perf_event *event) -{ - unsigned long flags = x86_pmu.free_running_flags; - - if (event->attr.use_clockid) - flags &= ~PERF_SAMPLE_TIME; - return flags; -} - -static int intel_pmu_hw_config(struct perf_event *event) -{ - int ret = x86_pmu_hw_config(event); - - if (ret) - return ret; - - if (event->attr.precise_ip) { - if (!event->attr.freq) { - event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; - if (!(event->attr.sample_type & - ~intel_pmu_free_running_flags(event))) - event->hw.flags |= PERF_X86_EVENT_FREERUNNING; - } - if (x86_pmu.pebs_aliases) - x86_pmu.pebs_aliases(event); - } - - if (needs_branch_stack(event)) { - ret = intel_pmu_setup_lbr_filter(event); - if (ret) - return ret; - - /* - * BTS is set up earlier in this path, so don't account twice - */ - if (!intel_pmu_has_bts(event)) { - /* disallow lbr if conflicting events are present */ - if (x86_add_exclusive(x86_lbr_exclusive_lbr)) - return -EBUSY; - - event->destroy = hw_perf_lbr_event_destroy; - } - } - - if (event->attr.type != PERF_TYPE_RAW) - return 0; - - if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY)) - return 0; - - if (x86_pmu.version < 3) - return -EINVAL; - - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return -EACCES; - - event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY; - - return 0; -} - -struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) -{ - if (x86_pmu.guest_get_msrs) - return x86_pmu.guest_get_msrs(nr); - *nr = 0; - return NULL; -} -EXPORT_SYMBOL_GPL(perf_guest_get_msrs); - -static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; - - arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; - arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask; - arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask; - /* - * If PMU counter has PEBS enabled it is not enough to disable counter - * on a guest entry since PEBS memory write can overshoot guest entry - * and corrupt guest memory. Disabling PEBS solves the problem. 
- */ - arr[1].msr = MSR_IA32_PEBS_ENABLE; - arr[1].host = cpuc->pebs_enabled; - arr[1].guest = 0; - - *nr = 2; - return arr; -} - -static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; - int idx; - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_event *event = cpuc->events[idx]; - - arr[idx].msr = x86_pmu_config_addr(idx); - arr[idx].host = arr[idx].guest = 0; - - if (!test_bit(idx, cpuc->active_mask)) - continue; - - arr[idx].host = arr[idx].guest = - event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE; - - if (event->attr.exclude_host) - arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - else if (event->attr.exclude_guest) - arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - } - - *nr = x86_pmu.num_counters; - return arr; -} - -static void core_pmu_enable_event(struct perf_event *event) -{ - if (!event->attr.exclude_host) - x86_pmu_enable_event(event); -} - -static void core_pmu_enable_all(int added) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int idx; - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct hw_perf_event *hwc = &cpuc->events[idx]->hw; - - if (!test_bit(idx, cpuc->active_mask) || - cpuc->events[idx]->attr.exclude_host) - continue; - - __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); - } -} - -static int hsw_hw_config(struct perf_event *event) -{ - int ret = intel_pmu_hw_config(event); - - if (ret) - return ret; - if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE)) - return 0; - event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); - - /* - * IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with - * PEBS or in ANY thread mode. Since the results are non-sensical forbid - * this combination. - */ - if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) && - ((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) || - event->attr.precise_ip > 0)) - return -EOPNOTSUPP; - - if (event_is_checkpointed(event)) { - /* - * Sampling of checkpointed events can cause situations where - * the CPU constantly aborts because of a overflow, which is - * then checkpointed back and ignored. Forbid checkpointing - * for sampling. - * - * But still allow a long sampling period, so that perf stat - * from KVM works. - */ - if (event->attr.sample_period > 0 && - event->attr.sample_period < 0x7fffffff) - return -EOPNOTSUPP; - } - return 0; -} - -static struct event_constraint counter2_constraint = - EVENT_CONSTRAINT(0, 0x4, 0); - -static struct event_constraint * -hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event) -{ - struct event_constraint *c; - - c = intel_get_event_constraints(cpuc, idx, event); - - /* Handle special quirk on in_tx_checkpointed only in counter 2 */ - if (event->hw.config & HSW_IN_TX_CHECKPOINTED) { - if (c->idxmsk64 & (1U << 2)) - return &counter2_constraint; - return &emptyconstraint; - } - - return c; -} - -/* - * Broadwell: - * - * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared - * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine - * the two to enforce a minimum period of 128 (the smallest value that has bits - * 0-5 cleared and >= 100). 
- * - * Because of how the code in x86_perf_event_set_period() works, the truncation - * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period - * to make up for the 'lost' events due to carrying the 'error' in period_left. - * - * Therefore the effective (average) period matches the requested period, - * despite coarser hardware granularity. - */ -static unsigned bdw_limit_period(struct perf_event *event, unsigned left) -{ - if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == - X86_CONFIG(.event=0xc0, .umask=0x01)) { - if (left < 128) - left = 128; - left &= ~0x3fu; - } - return left; -} - -PMU_FORMAT_ATTR(event, "config:0-7" ); -PMU_FORMAT_ATTR(umask, "config:8-15" ); -PMU_FORMAT_ATTR(edge, "config:18" ); -PMU_FORMAT_ATTR(pc, "config:19" ); -PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */ -PMU_FORMAT_ATTR(inv, "config:23" ); -PMU_FORMAT_ATTR(cmask, "config:24-31" ); -PMU_FORMAT_ATTR(in_tx, "config:32"); -PMU_FORMAT_ATTR(in_tx_cp, "config:33"); - -static struct attribute *intel_arch_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_pc.attr, - &format_attr_inv.attr, - &format_attr_cmask.attr, - NULL, -}; - -ssize_t intel_event_sysfs_show(char *page, u64 config) -{ - u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); - - return x86_event_sysfs_show(page, config, event); -} - -struct intel_shared_regs *allocate_shared_regs(int cpu) -{ - struct intel_shared_regs *regs; - int i; - - regs = kzalloc_node(sizeof(struct intel_shared_regs), - GFP_KERNEL, cpu_to_node(cpu)); - if (regs) { - /* - * initialize the locks to keep lockdep happy - */ - for (i = 0; i < EXTRA_REG_MAX; i++) - raw_spin_lock_init(®s->regs[i].lock); - - regs->core_id = -1; - } - return regs; -} - -static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu) -{ - struct intel_excl_cntrs *c; - - c = kzalloc_node(sizeof(struct intel_excl_cntrs), - GFP_KERNEL, cpu_to_node(cpu)); - if (c) { - raw_spin_lock_init(&c->lock); - c->core_id = -1; - } - return c; -} - -static int intel_pmu_cpu_prepare(int cpu) -{ - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - - if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { - cpuc->shared_regs = allocate_shared_regs(cpu); - if (!cpuc->shared_regs) - goto err; - } - - if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { - size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint); - - cpuc->constraint_list = kzalloc(sz, GFP_KERNEL); - if (!cpuc->constraint_list) - goto err_shared_regs; - - cpuc->excl_cntrs = allocate_excl_cntrs(cpu); - if (!cpuc->excl_cntrs) - goto err_constraint_list; - - cpuc->excl_thread_id = 0; - } - - return NOTIFY_OK; - -err_constraint_list: - kfree(cpuc->constraint_list); - cpuc->constraint_list = NULL; - -err_shared_regs: - kfree(cpuc->shared_regs); - cpuc->shared_regs = NULL; - -err: - return NOTIFY_BAD; -} - -static void intel_pmu_cpu_starting(int cpu) -{ - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - int core_id = topology_core_id(cpu); - int i; - - init_debug_store_on_cpu(cpu); - /* - * Deal with CPUs that don't clear their LBRs on power-up. 
- */ - intel_pmu_lbr_reset(); - - cpuc->lbr_sel = NULL; - - if (!cpuc->shared_regs) - return; - - if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) { - for_each_cpu(i, topology_sibling_cpumask(cpu)) { - struct intel_shared_regs *pc; - - pc = per_cpu(cpu_hw_events, i).shared_regs; - if (pc && pc->core_id == core_id) { - cpuc->kfree_on_online[0] = cpuc->shared_regs; - cpuc->shared_regs = pc; - break; - } - } - cpuc->shared_regs->core_id = core_id; - cpuc->shared_regs->refcnt++; - } - - if (x86_pmu.lbr_sel_map) - cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; - - if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { - for_each_cpu(i, topology_sibling_cpumask(cpu)) { - struct intel_excl_cntrs *c; - - c = per_cpu(cpu_hw_events, i).excl_cntrs; - if (c && c->core_id == core_id) { - cpuc->kfree_on_online[1] = cpuc->excl_cntrs; - cpuc->excl_cntrs = c; - cpuc->excl_thread_id = 1; - break; - } - } - cpuc->excl_cntrs->core_id = core_id; - cpuc->excl_cntrs->refcnt++; - } -} - -static void free_excl_cntrs(int cpu) -{ - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - struct intel_excl_cntrs *c; - - c = cpuc->excl_cntrs; - if (c) { - if (c->core_id == -1 || --c->refcnt == 0) - kfree(c); - cpuc->excl_cntrs = NULL; - kfree(cpuc->constraint_list); - cpuc->constraint_list = NULL; - } -} - -static void intel_pmu_cpu_dying(int cpu) -{ - struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - struct intel_shared_regs *pc; - - pc = cpuc->shared_regs; - if (pc) { - if (pc->core_id == -1 || --pc->refcnt == 0) - kfree(pc); - cpuc->shared_regs = NULL; - } - - free_excl_cntrs(cpu); - - fini_debug_store_on_cpu(cpu); -} - -static void intel_pmu_sched_task(struct perf_event_context *ctx, - bool sched_in) -{ - if (x86_pmu.pebs_active) - intel_pmu_pebs_sched_task(ctx, sched_in); - if (x86_pmu.lbr_nr) - intel_pmu_lbr_sched_task(ctx, sched_in); -} - -PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); - -PMU_FORMAT_ATTR(ldlat, "config1:0-15"); - -PMU_FORMAT_ATTR(frontend, "config1:0-23"); - -static struct attribute *intel_arch3_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_pc.attr, - &format_attr_any.attr, - &format_attr_inv.attr, - &format_attr_cmask.attr, - &format_attr_in_tx.attr, - &format_attr_in_tx_cp.attr, - - &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */ - &format_attr_ldlat.attr, /* PEBS load latency */ - NULL, -}; - -static struct attribute *skl_format_attr[] = { - &format_attr_frontend.attr, - NULL, -}; - -static __initconst const struct x86_pmu core_pmu = { - .name = "core", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = core_pmu_enable_all, - .enable = core_pmu_enable_event, - .disable = x86_pmu_disable_event, - .hw_config = x86_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - .free_running_flags = PEBS_FREERUNNING_FLAGS, - - /* - * Intel PMCs cannot be accessed sanely above 32-bit width, - * so we install an artificial 1<<31 period regardless of - * the generic event period: - */ - .max_period = (1ULL<<31) - 1, - .get_event_constraints = intel_get_event_constraints, - .put_event_constraints = intel_put_event_constraints, - .event_constraints = intel_core_event_constraints, - .guest_get_msrs = core_guest_get_msrs, - .format_attrs = intel_arch_formats_attr, - .events_sysfs_show = 
intel_event_sysfs_show, - - /* - * Virtual (or funny metal) CPU can define x86_pmu.extra_regs - * together with PMU version 1 and thus be using core_pmu with - * shared_regs. We need following callbacks here to allocate - * it properly. - */ - .cpu_prepare = intel_pmu_cpu_prepare, - .cpu_starting = intel_pmu_cpu_starting, - .cpu_dying = intel_pmu_cpu_dying, -}; - -static __initconst const struct x86_pmu intel_pmu = { - .name = "Intel", - .handle_irq = intel_pmu_handle_irq, - .disable_all = intel_pmu_disable_all, - .enable_all = intel_pmu_enable_all, - .enable = intel_pmu_enable_event, - .disable = intel_pmu_disable_event, - .hw_config = intel_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - .free_running_flags = PEBS_FREERUNNING_FLAGS, - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic event period: - */ - .max_period = (1ULL << 31) - 1, - .get_event_constraints = intel_get_event_constraints, - .put_event_constraints = intel_put_event_constraints, - .pebs_aliases = intel_pebs_aliases_core2, - - .format_attrs = intel_arch3_formats_attr, - .events_sysfs_show = intel_event_sysfs_show, - - .cpu_prepare = intel_pmu_cpu_prepare, - .cpu_starting = intel_pmu_cpu_starting, - .cpu_dying = intel_pmu_cpu_dying, - .guest_get_msrs = intel_guest_get_msrs, - .sched_task = intel_pmu_sched_task, -}; - -static __init void intel_clovertown_quirk(void) -{ - /* - * PEBS is unreliable due to: - * - * AJ67 - PEBS may experience CPL leaks - * AJ68 - PEBS PMI may be delayed by one event - * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12] - * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS - * - * AJ67 could be worked around by restricting the OS/USR flags. - * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI. - * - * AJ106 could possibly be worked around by not allowing LBR - * usage from PEBS, including the fixup. - * AJ68 could possibly be worked around by always programming - * a pebs_event_reset[0] value and coping with the lost events. - * - * But taken together it might just make sense to not enable PEBS on - * these chips. - */ - pr_warn("PEBS disabled due to CPU errata\n"); - x86_pmu.pebs = 0; - x86_pmu.pebs_constraints = NULL; -} - -static int intel_snb_pebs_broken(int cpu) -{ - u32 rev = UINT_MAX; /* default to broken for unknown models */ - - switch (cpu_data(cpu).x86_model) { - case 42: /* SNB */ - rev = 0x28; - break; - - case 45: /* SNB-EP */ - switch (cpu_data(cpu).x86_mask) { - case 6: rev = 0x618; break; - case 7: rev = 0x70c; break; - } - } - - return (cpu_data(cpu).microcode < rev); -} - -static void intel_snb_check_microcode(void) -{ - int pebs_broken = 0; - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) { - if ((pebs_broken = intel_snb_pebs_broken(cpu))) - break; - } - put_online_cpus(); - - if (pebs_broken == x86_pmu.pebs_broken) - return; - - /* - * Serialized by the microcode lock.. - */ - if (x86_pmu.pebs_broken) { - pr_info("PEBS enabled due to microcode update\n"); - x86_pmu.pebs_broken = 0; - } else { - pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n"); - x86_pmu.pebs_broken = 1; - } -} - -/* - * Under certain circumstances, access certain MSR may cause #GP. - * The function tests if the input MSR can be safely accessed. 
- */ -static bool check_msr(unsigned long msr, u64 mask) -{ - u64 val_old, val_new, val_tmp; - - /* - * Read the current value, change it and read it back to see if it - * matches, this is needed to detect certain hardware emulators - * (qemu/kvm) that don't trap on the MSR access and always return 0s. - */ - if (rdmsrl_safe(msr, &val_old)) - return false; - - /* - * Only change the bits which can be updated by wrmsrl. - */ - val_tmp = val_old ^ mask; - if (wrmsrl_safe(msr, val_tmp) || - rdmsrl_safe(msr, &val_new)) - return false; - - if (val_new != val_tmp) - return false; - - /* Here it's sure that the MSR can be safely accessed. - * Restore the old value and return. - */ - wrmsrl(msr, val_old); - - return true; -} - -static __init void intel_sandybridge_quirk(void) -{ - x86_pmu.check_microcode = intel_snb_check_microcode; - intel_snb_check_microcode(); -} - -static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { - { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" }, - { PERF_COUNT_HW_INSTRUCTIONS, "instructions" }, - { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" }, - { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" }, - { PERF_COUNT_HW_CACHE_MISSES, "cache misses" }, - { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" }, - { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" }, -}; - -static __init void intel_arch_events_quirk(void) -{ - int bit; - - /* disable event that reported as not presend by cpuid */ - for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { - intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; - pr_warn("CPUID marked event: \'%s\' unavailable\n", - intel_arch_events_map[bit].name); - } -} - -static __init void intel_nehalem_quirk(void) -{ - union cpuid10_ebx ebx; - - ebx.full = x86_pmu.events_maskl; - if (ebx.split.no_branch_misses_retired) { - /* - * Erratum AAJ80 detected, we work it around by using - * the BR_MISP_EXEC.ANY event. This will over-count - * branch-misses, but it's still much better than the - * architectural event which is often completely bogus: - */ - intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; - ebx.split.no_branch_misses_retired = 0; - x86_pmu.events_maskl = ebx.full; - pr_info("CPU erratum AAJ80 worked around\n"); - } -} - -/* - * enable software workaround for errata: - * SNB: BJ122 - * IVB: BV98 - * HSW: HSD29 - * - * Only needed when HT is enabled. However detecting - * if HT is enabled is difficult (model specific). 
So instead, - * we enable the workaround in the early boot, and verify if - * it is needed in a later initcall phase once we have valid - * topology information to check if HT is actually enabled - */ -static __init void intel_ht_bug(void) -{ - x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED; - - x86_pmu.start_scheduling = intel_start_scheduling; - x86_pmu.commit_scheduling = intel_commit_scheduling; - x86_pmu.stop_scheduling = intel_stop_scheduling; -} - -EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3"); -EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82") - -/* Haswell special events */ -EVENT_ATTR_STR(tx-start, tx_start, "event=0xc9,umask=0x1"); -EVENT_ATTR_STR(tx-commit, tx_commit, "event=0xc9,umask=0x2"); -EVENT_ATTR_STR(tx-abort, tx_abort, "event=0xc9,umask=0x4"); -EVENT_ATTR_STR(tx-capacity, tx_capacity, "event=0x54,umask=0x2"); -EVENT_ATTR_STR(tx-conflict, tx_conflict, "event=0x54,umask=0x1"); -EVENT_ATTR_STR(el-start, el_start, "event=0xc8,umask=0x1"); -EVENT_ATTR_STR(el-commit, el_commit, "event=0xc8,umask=0x2"); -EVENT_ATTR_STR(el-abort, el_abort, "event=0xc8,umask=0x4"); -EVENT_ATTR_STR(el-capacity, el_capacity, "event=0x54,umask=0x2"); -EVENT_ATTR_STR(el-conflict, el_conflict, "event=0x54,umask=0x1"); -EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1"); -EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1"); - -static struct attribute *hsw_events_attrs[] = { - EVENT_PTR(tx_start), - EVENT_PTR(tx_commit), - EVENT_PTR(tx_abort), - EVENT_PTR(tx_capacity), - EVENT_PTR(tx_conflict), - EVENT_PTR(el_start), - EVENT_PTR(el_commit), - EVENT_PTR(el_abort), - EVENT_PTR(el_capacity), - EVENT_PTR(el_conflict), - EVENT_PTR(cycles_t), - EVENT_PTR(cycles_ct), - EVENT_PTR(mem_ld_hsw), - EVENT_PTR(mem_st_hsw), - NULL -}; - -__init int intel_pmu_init(void) -{ - union cpuid10_edx edx; - union cpuid10_eax eax; - union cpuid10_ebx ebx; - struct event_constraint *c; - unsigned int unused; - struct extra_reg *er; - int version, i; - - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - switch (boot_cpu_data.x86) { - case 0x6: - return p6_pmu_init(); - case 0xb: - return knc_pmu_init(); - case 0xf: - return p4_pmu_init(); - } - return -ENODEV; - } - - /* - * Check whether the Architectural PerfMon supports - * Branch Misses Retired hw_event or not. 
- */ - cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); - if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) - return -ENODEV; - - version = eax.split.version_id; - if (version < 2) - x86_pmu = core_pmu; - else - x86_pmu = intel_pmu; - - x86_pmu.version = version; - x86_pmu.num_counters = eax.split.num_counters; - x86_pmu.cntval_bits = eax.split.bit_width; - x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; - - x86_pmu.events_maskl = ebx.full; - x86_pmu.events_mask_len = eax.split.mask_length; - - x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); - - /* - * Quirk: v2 perfmon does not report fixed-purpose events, so - * assume at least 3 events: - */ - if (version > 1) - x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); - - if (boot_cpu_has(X86_FEATURE_PDCM)) { - u64 capabilities; - - rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); - x86_pmu.intel_cap.capabilities = capabilities; - } - - intel_ds_init(); - - x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ - - /* - * Install the hw-cache-events table: - */ - switch (boot_cpu_data.x86_model) { - case 14: /* 65nm Core "Yonah" */ - pr_cont("Core events, "); - break; - - case 15: /* 65nm Core2 "Merom" */ - x86_add_quirk(intel_clovertown_quirk); - case 22: /* 65nm Core2 "Merom-L" */ - case 23: /* 45nm Core2 "Penryn" */ - case 29: /* 45nm Core2 "Dunnington (MP) */ - memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - intel_pmu_lbr_init_core(); - - x86_pmu.event_constraints = intel_core2_event_constraints; - x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints; - pr_cont("Core2 events, "); - break; - - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, - sizeof(hw_cache_extra_regs)); - - intel_pmu_lbr_init_nhm(); - - x86_pmu.event_constraints = intel_nehalem_event_constraints; - x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; - x86_pmu.enable_all = intel_pmu_nhm_enable_all; - x86_pmu.extra_regs = intel_nehalem_extra_regs; - - x86_pmu.cpu_events = nhm_events_attrs; - - /* UOPS_ISSUED.STALLED_CYCLES */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = - X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); - /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = - X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); - - x86_add_quirk(intel_nehalem_quirk); - - pr_cont("Nehalem events, "); - break; - - case 28: /* 45nm Atom "Pineview" */ - case 38: /* 45nm Atom "Lincroft" */ - case 39: /* 32nm Atom "Penwell" */ - case 53: /* 32nm Atom "Cloverview" */ - case 54: /* 32nm Atom "Cedarview" */ - memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - intel_pmu_lbr_init_atom(); - - x86_pmu.event_constraints = intel_gen_event_constraints; - x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints; - x86_pmu.pebs_aliases = intel_pebs_aliases_core2; - pr_cont("Atom events, "); - break; - - case 55: /* 22nm Atom "Silvermont" */ - case 76: /* 14nm Atom "Airmont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, - sizeof(hw_cache_extra_regs)); - - 
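For reference, the architectural-perfmon CPUID leaf that intel_pmu_init() decodes above (version, counter count and width from EAX, fixed-counter count from EDX, the unavailable-event mask from EBX) can be inspected from user space as well. A minimal sketch using the compiler's <cpuid.h> helper; the printed fields correspond to the cpuid10_* bitfield splits used above:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 0xA: architectural performance monitoring. */
	if (!__get_cpuid(0x0a, &eax, &ebx, &ecx, &edx))
		return 1;

	printf("perfmon version:    %u\n", eax & 0xff);		/* eax.split.version_id */
	printf("general counters:   %u\n", (eax >> 8) & 0xff);	/* eax.split.num_counters */
	printf("counter bit width:  %u\n", (eax >> 16) & 0xff);	/* eax.split.bit_width */
	printf("event mask length:  %u\n", (eax >> 24) & 0xff);	/* eax.split.mask_length */
	printf("fixed counters:     %u\n", edx & 0x1f);		/* edx.split.num_counters_fixed */
	printf("unavailable events: 0x%x\n", ebx);		/* becomes x86_pmu.events_maskl */
	return 0;
}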
intel_pmu_lbr_init_atom(); - - x86_pmu.event_constraints = intel_slm_event_constraints; - x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; - x86_pmu.extra_regs = intel_slm_extra_regs; - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - pr_cont("Silvermont events, "); - break; - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, - sizeof(hw_cache_extra_regs)); - - intel_pmu_lbr_init_nhm(); - - x86_pmu.event_constraints = intel_westmere_event_constraints; - x86_pmu.enable_all = intel_pmu_nhm_enable_all; - x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; - x86_pmu.extra_regs = intel_westmere_extra_regs; - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - - x86_pmu.cpu_events = nhm_events_attrs; - - /* UOPS_ISSUED.STALLED_CYCLES */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = - X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); - /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = - X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); - - pr_cont("Westmere events, "); - break; - - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - x86_add_quirk(intel_sandybridge_quirk); - x86_add_quirk(intel_ht_bug); - memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, - sizeof(hw_cache_extra_regs)); - - intel_pmu_lbr_init_snb(); - - x86_pmu.event_constraints = intel_snb_event_constraints; - x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; - x86_pmu.pebs_aliases = intel_pebs_aliases_snb; - if (boot_cpu_data.x86_model == 45) - x86_pmu.extra_regs = intel_snbep_extra_regs; - else - x86_pmu.extra_regs = intel_snb_extra_regs; - - - /* all extra regs are per-cpu when HT is on */ - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - - x86_pmu.cpu_events = snb_events_attrs; - - /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = - X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); - /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = - X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1); - - pr_cont("SandyBridge events, "); - break; - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - x86_add_quirk(intel_ht_bug); - memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - /* dTLB-load-misses on IVB is different than SNB */ - hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */ - - memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, - sizeof(hw_cache_extra_regs)); - - intel_pmu_lbr_init_snb(); - - x86_pmu.event_constraints = intel_ivb_event_constraints; - x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints; - x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; - x86_pmu.pebs_prec_dist = true; - if (boot_cpu_data.x86_model == 62) - x86_pmu.extra_regs = intel_snbep_extra_regs; - else - x86_pmu.extra_regs = intel_snb_extra_regs; - /* all extra regs are per-cpu when HT is on */ - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - - x86_pmu.cpu_events = snb_events_attrs; - - /* UOPS_ISSUED.ANY,c=1,i=1 to 
count stall cycles */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = - X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); - - pr_cont("IvyBridge events, "); - break; - - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 69: /* 22nm Haswell ULT */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - x86_add_quirk(intel_ht_bug); - x86_pmu.late_ack = true; - memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - - intel_pmu_lbr_init_hsw(); - - x86_pmu.event_constraints = intel_hsw_event_constraints; - x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; - x86_pmu.extra_regs = intel_snbep_extra_regs; - x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; - x86_pmu.pebs_prec_dist = true; - /* all extra regs are per-cpu when HT is on */ - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - - x86_pmu.hw_config = hsw_hw_config; - x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.cpu_events = hsw_events_attrs; - x86_pmu.lbr_double_abort = true; - pr_cont("Haswell events, "); - break; - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ - x86_pmu.late_ack = true; - memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - - /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */ - hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ | - BDW_L3_MISS|HSW_SNOOP_DRAM; - hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS| - HSW_SNOOP_DRAM; - hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ| - BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM; - hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE| - BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM; - - intel_pmu_lbr_init_hsw(); - - x86_pmu.event_constraints = intel_bdw_event_constraints; - x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; - x86_pmu.extra_regs = intel_snbep_extra_regs; - x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; - x86_pmu.pebs_prec_dist = true; - /* all extra regs are per-cpu when HT is on */ - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - - x86_pmu.hw_config = hsw_hw_config; - x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.cpu_events = hsw_events_attrs; - x86_pmu.limit_period = bdw_limit_period; - pr_cont("Broadwell events, "); - break; - - case 87: /* Knights Landing Xeon Phi */ - memcpy(hw_cache_event_ids, - slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, - knl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - intel_pmu_lbr_init_knl(); - - x86_pmu.event_constraints = intel_slm_event_constraints; - x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; - x86_pmu.extra_regs = intel_knl_extra_regs; - - /* all extra regs are per-cpu when HT is on */ - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - - pr_cont("Knights Landing events, "); - break; - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - x86_pmu.late_ack = true; - memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - memcpy(hw_cache_extra_regs, 
skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); - intel_pmu_lbr_init_skl(); - - x86_pmu.event_constraints = intel_skl_event_constraints; - x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints; - x86_pmu.extra_regs = intel_skl_extra_regs; - x86_pmu.pebs_aliases = intel_pebs_aliases_skl; - x86_pmu.pebs_prec_dist = true; - /* all extra regs are per-cpu when HT is on */ - x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - - x86_pmu.hw_config = hsw_hw_config; - x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, - skl_format_attr); - WARN_ON(!x86_pmu.format_attrs); - x86_pmu.cpu_events = hsw_events_attrs; - pr_cont("Skylake events, "); - break; - - default: - switch (x86_pmu.version) { - case 1: - x86_pmu.event_constraints = intel_v1_event_constraints; - pr_cont("generic architected perfmon v1, "); - break; - default: - /* - * default constraints for v2 and up - */ - x86_pmu.event_constraints = intel_gen_event_constraints; - pr_cont("generic architected perfmon, "); - break; - } - } - - if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", - x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); - x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; - } - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; - - if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; - } - - x86_pmu.intel_ctrl |= - ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; - - if (x86_pmu.event_constraints) { - /* - * event on fixed counter2 (REF_CYCLES) only works on this - * counter, so do not extend mask to generic counters - */ - for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask == FIXED_EVENT_FLAGS - && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) { - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; - } - c->idxmsk64 &= - ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); - c->weight = hweight64(c->idxmsk64); - } - } - - /* - * Access LBR MSR may cause #GP under certain circumstances. - * E.g. KVM doesn't support LBR MSR - * Check all LBT MSR here. - * Disable LBR access if any LBR MSRs can not be accessed. - */ - if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL)) - x86_pmu.lbr_nr = 0; - for (i = 0; i < x86_pmu.lbr_nr; i++) { - if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) && - check_msr(x86_pmu.lbr_to + i, 0xffffUL))) - x86_pmu.lbr_nr = 0; - } - - /* - * Access extra MSR may cause #GP under certain circumstances. - * E.g. KVM doesn't support offcore event - * Check all extra_regs here. 
- */ - if (x86_pmu.extra_regs) { - for (er = x86_pmu.extra_regs; er->msr; er++) { - er->extra_msr_access = check_msr(er->msr, 0x11UL); - /* Disable LBR select mapping */ - if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) - x86_pmu.lbr_sel_map = NULL; - } - } - - /* Support full width counters using alternative MSR range */ - if (x86_pmu.intel_cap.full_width_write) { - x86_pmu.max_period = x86_pmu.cntval_mask; - x86_pmu.perfctr = MSR_IA32_PMC0; - pr_cont("full-width counters, "); - } - - return 0; -} - -/* - * HT bug: phase 2 init - * Called once we have valid topology information to check - * whether or not HT is enabled - * If HT is off, then we disable the workaround - */ -static __init int fixup_ht_bug(void) -{ - int cpu = smp_processor_id(); - int w, c; - /* - * problem not present on this CPU model, nothing to do - */ - if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED)) - return 0; - - w = cpumask_weight(topology_sibling_cpumask(cpu)); - if (w > 1) { - pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n"); - return 0; - } - - if (lockup_detector_suspend() != 0) { - pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n"); - return 0; - } - - x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); - - x86_pmu.start_scheduling = NULL; - x86_pmu.commit_scheduling = NULL; - x86_pmu.stop_scheduling = NULL; - - lockup_detector_resume(); - - get_online_cpus(); - - for_each_online_cpu(c) { - free_excl_cntrs(c); - } - - put_online_cpus(); - pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n"); - return 0; -} -subsys_initcall(fixup_ht_bug) -- cgit v0.10.2 From 5c781a3daa3053c355259894f9e7a478deb0cb46 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:09 +0100 Subject: perf/x86: Move perf_event_intel_cqm.c ........ => x86/events/intel/cqm.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 834e9ae..f97c283 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -5,4 +5,4 @@ obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif -obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c new file mode 100644 index 0000000..d1b623a --- /dev/null +++ b/arch/x86/events/intel/cqm.c @@ -0,0 +1,1391 @@ +/* + * Intel Cache Quality-of-Service Monitoring (CQM) support. + * + * Based very, very heavily on work by Peter Zijlstra. + */ + +#include +#include +#include +#include "../../kernel/cpu/perf_event.h" + +#define MSR_IA32_PQR_ASSOC 0x0c8f +#define MSR_IA32_QM_CTR 0x0c8e +#define MSR_IA32_QM_EVTSEL 0x0c8d + +static u32 cqm_max_rmid = -1; +static unsigned int cqm_l3_scale; /* supposedly cacheline size */ + +/** + * struct intel_pqr_state - State cache for the PQR MSR + * @rmid: The cached Resource Monitoring ID + * @closid: The cached Class Of Service ID + * @rmid_usecnt: The usage counter for rmid + * + * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always + * contains both parts, so we need to cache them. 
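The layout described above fixes the value that eventually lands in the MSR: the RMID in the low bits, the CLOSID in the upper 32 bits, always written together. A minimal sketch of that composition (illustrative only, mirroring the wrmsr(MSR_IA32_PQR_ASSOC, rmid, closid) calls in intel_cqm_event_start()/_stop() further down):

#include <stdint.h>
#include <stdio.h>

/* RMID in the low 10 bits, CLOSID in the upper 32 bits. */
static uint64_t pqr_assoc_value(uint32_t rmid, uint32_t closid)
{
	return ((uint64_t)closid << 32) | (rmid & 0x3ff);
}

int main(void)
{
	/* Example: RMID 5, CLOSID 0 (CQM itself only ever caches closid). */
	printf("PQR_ASSOC = %#llx\n",
	       (unsigned long long)pqr_assoc_value(5, 0));
	return 0;
}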
+ * + * The cache also helps to avoid pointless updates if the value does + * not change. + */ +struct intel_pqr_state { + u32 rmid; + u32 closid; + int rmid_usecnt; +}; + +/* + * The cached intel_pqr_state is strictly per CPU and can never be + * updated from a remote CPU. Both functions which modify the state + * (intel_cqm_event_start and intel_cqm_event_stop) are called with + * interrupts disabled, which is sufficient for the protection. + */ +static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); + +/* + * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. + * Also protects event->hw.cqm_rmid + * + * Hold either for stability, both for modification of ->hw.cqm_rmid. + */ +static DEFINE_MUTEX(cache_mutex); +static DEFINE_RAW_SPINLOCK(cache_lock); + +/* + * Groups of events that have the same target(s), one RMID per group. + */ +static LIST_HEAD(cache_groups); + +/* + * Mask of CPUs for reading CQM values. We only need one per-socket. + */ +static cpumask_t cqm_cpumask; + +#define RMID_VAL_ERROR (1ULL << 63) +#define RMID_VAL_UNAVAIL (1ULL << 62) + +#define QOS_L3_OCCUP_EVENT_ID (1 << 0) + +#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID + +/* + * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). + * + * This rmid is always free and is guaranteed to have an associated + * near-zero occupancy value, i.e. no cachelines are tagged with this + * RMID, once __intel_cqm_rmid_rotate() returns. + */ +static u32 intel_cqm_rotation_rmid; + +#define INVALID_RMID (-1) + +/* + * Is @rmid valid for programming the hardware? + * + * rmid 0 is reserved by the hardware for all non-monitored tasks, which + * means that we should never come across an rmid with that value. + * Likewise, an rmid value of -1 is used to indicate "no rmid currently + * assigned" and is used as part of the rotation code. + */ +static inline bool __rmid_valid(u32 rmid) +{ + if (!rmid || rmid == INVALID_RMID) + return false; + + return true; +} + +static u64 __rmid_read(u32 rmid) +{ + u64 val; + + /* + * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt, + * it just says that to increase confusion. + */ + wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid); + rdmsrl(MSR_IA32_QM_CTR, val); + + /* + * Aside from the ERROR and UNAVAIL bits, assume this thing returns + * the number of cachelines tagged with @rmid. + */ + return val; +} + +enum rmid_recycle_state { + RMID_YOUNG = 0, + RMID_AVAILABLE, + RMID_DIRTY, +}; + +struct cqm_rmid_entry { + u32 rmid; + enum rmid_recycle_state state; + struct list_head list; + unsigned long queue_time; +}; + +/* + * cqm_rmid_free_lru - A least recently used list of RMIDs. + * + * Oldest entry at the head, newest (most recently used) entry at the + * tail. This list is never traversed, it's only used to keep track of + * the lru order. That is, we only pick entries of the head or insert + * them on the tail. + * + * All entries on the list are 'free', and their RMIDs are not currently + * in use. To mark an RMID as in use, remove its entry from the lru + * list. + * + * + * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs. + * + * This list is contains RMIDs that no one is currently using but that + * may have a non-zero occupancy value associated with them. The + * rotation worker moves RMIDs from the limbo list to the free list once + * the occupancy value drops below __intel_cqm_threshold. + * + * Both lists are protected by cache_mutex. 
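A raw MSR_IA32_QM_CTR value, as returned by __rmid_read() above, carries two status bits plus an occupancy count that only becomes bytes after multiplying by the CPUID-reported scale (cqm_l3_scale). A minimal decoding sketch; the sample reading and the scale of 64 are assumptions for illustration:

#include <stdint.h>
#include <stdio.h>

#define RMID_VAL_ERROR		(1ULL << 63)
#define RMID_VAL_UNAVAIL	(1ULL << 62)

/* Reject error/unavailable readings, otherwise scale the count to bytes. */
static int qm_ctr_to_bytes(uint64_t val, unsigned int scale, uint64_t *bytes)
{
	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
		return -1;	/* discard the reading, as the driver does */

	*bytes = val * scale;
	return 0;
}

int main(void)
{
	uint64_t bytes;

	/* Hypothetical reading of 1024 units at a 64-byte scale. */
	if (!qm_ctr_to_bytes(1024, 64, &bytes))
		printf("occupancy: %llu bytes\n", (unsigned long long)bytes);
	return 0;
}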
+ */ +static LIST_HEAD(cqm_rmid_free_lru); +static LIST_HEAD(cqm_rmid_limbo_lru); + +/* + * We use a simple array of pointers so that we can lookup a struct + * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid() + * and __put_rmid() from having to worry about dealing with struct + * cqm_rmid_entry - they just deal with rmids, i.e. integers. + * + * Once this array is initialized it is read-only. No locks are required + * to access it. + * + * All entries for all RMIDs can be looked up in the this array at all + * times. + */ +static struct cqm_rmid_entry **cqm_rmid_ptrs; + +static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid) +{ + struct cqm_rmid_entry *entry; + + entry = cqm_rmid_ptrs[rmid]; + WARN_ON(entry->rmid != rmid); + + return entry; +} + +/* + * Returns < 0 on fail. + * + * We expect to be called with cache_mutex held. + */ +static u32 __get_rmid(void) +{ + struct cqm_rmid_entry *entry; + + lockdep_assert_held(&cache_mutex); + + if (list_empty(&cqm_rmid_free_lru)) + return INVALID_RMID; + + entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list); + list_del(&entry->list); + + return entry->rmid; +} + +static void __put_rmid(u32 rmid) +{ + struct cqm_rmid_entry *entry; + + lockdep_assert_held(&cache_mutex); + + WARN_ON(!__rmid_valid(rmid)); + entry = __rmid_entry(rmid); + + entry->queue_time = jiffies; + entry->state = RMID_YOUNG; + + list_add_tail(&entry->list, &cqm_rmid_limbo_lru); +} + +static int intel_cqm_setup_rmid_cache(void) +{ + struct cqm_rmid_entry *entry; + unsigned int nr_rmids; + int r = 0; + + nr_rmids = cqm_max_rmid + 1; + cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) * + nr_rmids, GFP_KERNEL); + if (!cqm_rmid_ptrs) + return -ENOMEM; + + for (; r <= cqm_max_rmid; r++) { + struct cqm_rmid_entry *entry; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto fail; + + INIT_LIST_HEAD(&entry->list); + entry->rmid = r; + cqm_rmid_ptrs[r] = entry; + + list_add_tail(&entry->list, &cqm_rmid_free_lru); + } + + /* + * RMID 0 is special and is always allocated. It's used for all + * tasks that are not monitored. + */ + entry = __rmid_entry(0); + list_del(&entry->list); + + mutex_lock(&cache_mutex); + intel_cqm_rotation_rmid = __get_rmid(); + mutex_unlock(&cache_mutex); + + return 0; +fail: + while (r--) + kfree(cqm_rmid_ptrs[r]); + + kfree(cqm_rmid_ptrs); + return -ENOMEM; +} + +/* + * Determine if @a and @b measure the same set of tasks. + * + * If @a and @b measure the same set of tasks then we want to share a + * single RMID. + */ +static bool __match_event(struct perf_event *a, struct perf_event *b) +{ + /* Per-cpu and task events don't mix */ + if ((a->attach_state & PERF_ATTACH_TASK) != + (b->attach_state & PERF_ATTACH_TASK)) + return false; + +#ifdef CONFIG_CGROUP_PERF + if (a->cgrp != b->cgrp) + return false; +#endif + + /* If not task event, we're machine wide */ + if (!(b->attach_state & PERF_ATTACH_TASK)) + return true; + + /* + * Events that target same task are placed into the same cache group. + */ + if (a->hw.target == b->hw.target) + return true; + + /* + * Are we an inherited event? 
+ */ + if (b->parent == a) + return true; + + return false; +} + +#ifdef CONFIG_CGROUP_PERF +static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) +{ + if (event->attach_state & PERF_ATTACH_TASK) + return perf_cgroup_from_task(event->hw.target, event->ctx); + + return event->cgrp; +} +#endif + +/* + * Determine if @a's tasks intersect with @b's tasks + * + * There are combinations of events that we explicitly prohibit, + * + * PROHIBITS + * system-wide -> cgroup and task + * cgroup -> system-wide + * -> task in cgroup + * task -> system-wide + * -> task in cgroup + * + * Call this function before allocating an RMID. + */ +static bool __conflict_event(struct perf_event *a, struct perf_event *b) +{ +#ifdef CONFIG_CGROUP_PERF + /* + * We can have any number of cgroups but only one system-wide + * event at a time. + */ + if (a->cgrp && b->cgrp) { + struct perf_cgroup *ac = a->cgrp; + struct perf_cgroup *bc = b->cgrp; + + /* + * This condition should have been caught in + * __match_event() and we should be sharing an RMID. + */ + WARN_ON_ONCE(ac == bc); + + if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || + cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) + return true; + + return false; + } + + if (a->cgrp || b->cgrp) { + struct perf_cgroup *ac, *bc; + + /* + * cgroup and system-wide events are mutually exclusive + */ + if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || + (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) + return true; + + /* + * Ensure neither event is part of the other's cgroup + */ + ac = event_to_cgroup(a); + bc = event_to_cgroup(b); + if (ac == bc) + return true; + + /* + * Must have cgroup and non-intersecting task events. + */ + if (!ac || !bc) + return false; + + /* + * We have cgroup and task events, and the task belongs + * to a cgroup. Check for for overlap. + */ + if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || + cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) + return true; + + return false; + } +#endif + /* + * If one of them is not a task, same story as above with cgroups. + */ + if (!(a->attach_state & PERF_ATTACH_TASK) || + !(b->attach_state & PERF_ATTACH_TASK)) + return true; + + /* + * Must be non-overlapping. + */ + return false; +} + +struct rmid_read { + u32 rmid; + atomic64_t value; +}; + +static void __intel_cqm_event_count(void *info); + +/* + * Exchange the RMID of a group of events. + */ +static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) +{ + struct perf_event *event; + struct list_head *head = &group->hw.cqm_group_entry; + u32 old_rmid = group->hw.cqm_rmid; + + lockdep_assert_held(&cache_mutex); + + /* + * If our RMID is being deallocated, perform a read now. + */ + if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { + struct rmid_read rr = { + .value = ATOMIC64_INIT(0), + .rmid = old_rmid, + }; + + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, + &rr, 1); + local64_set(&group->count, atomic64_read(&rr.value)); + } + + raw_spin_lock_irq(&cache_lock); + + group->hw.cqm_rmid = rmid; + list_for_each_entry(event, head, hw.cqm_group_entry) + event->hw.cqm_rmid = rmid; + + raw_spin_unlock_irq(&cache_lock); + + return old_rmid; +} + +/* + * If we fail to assign a new RMID for intel_cqm_rotation_rmid because + * cachelines are still tagged with RMIDs in limbo, we progressively + * increment the threshold until we find an RMID in limbo with <= + * __intel_cqm_threshold lines tagged. 
This is designed to mitigate the + * problem where cachelines tagged with an RMID are not steadily being + * evicted. + * + * On successful rotations we decrease the threshold back towards zero. + * + * __intel_cqm_max_threshold provides an upper bound on the threshold, + * and is measured in bytes because it's exposed to userland. + */ +static unsigned int __intel_cqm_threshold; +static unsigned int __intel_cqm_max_threshold; + +/* + * Test whether an RMID has a zero occupancy value on this cpu. + */ +static void intel_cqm_stable(void *arg) +{ + struct cqm_rmid_entry *entry; + + list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { + if (entry->state != RMID_AVAILABLE) + break; + + if (__rmid_read(entry->rmid) > __intel_cqm_threshold) + entry->state = RMID_DIRTY; + } +} + +/* + * If we have group events waiting for an RMID that don't conflict with + * events already running, assign @rmid. + */ +static bool intel_cqm_sched_in_event(u32 rmid) +{ + struct perf_event *leader, *event; + + lockdep_assert_held(&cache_mutex); + + leader = list_first_entry(&cache_groups, struct perf_event, + hw.cqm_groups_entry); + event = leader; + + list_for_each_entry_continue(event, &cache_groups, + hw.cqm_groups_entry) { + if (__rmid_valid(event->hw.cqm_rmid)) + continue; + + if (__conflict_event(event, leader)) + continue; + + intel_cqm_xchg_rmid(event, rmid); + return true; + } + + return false; +} + +/* + * Initially use this constant for both the limbo queue time and the + * rotation timer interval, pmu::hrtimer_interval_ms. + * + * They don't need to be the same, but the two are related since if you + * rotate faster than you recycle RMIDs, you may run out of available + * RMIDs. + */ +#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */ + +static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME; + +/* + * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list + * @nr_available: number of freeable RMIDs on the limbo list + * + * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no + * cachelines are tagged with those RMIDs. After this we can reuse them + * and know that the current set of active RMIDs is stable. + * + * Return %true or %false depending on whether stabilization needs to be + * reattempted. + * + * If we return %true then @nr_available is updated to indicate the + * number of RMIDs on the limbo list that have been queued for the + * minimum queue time (RMID_AVAILABLE), but whose data occupancy values + * are above __intel_cqm_threshold. + */ +static bool intel_cqm_rmid_stabilize(unsigned int *available) +{ + struct cqm_rmid_entry *entry, *tmp; + + lockdep_assert_held(&cache_mutex); + + *available = 0; + list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { + unsigned long min_queue_time; + unsigned long now = jiffies; + + /* + * We hold RMIDs placed into limbo for a minimum queue + * time. Before the minimum queue time has elapsed we do + * not recycle RMIDs. + * + * The reasoning is that until a sufficient time has + * passed since we stopped using an RMID, any RMID + * placed onto the limbo list will likely still have + * data tagged in the cache, which means we'll probably + * fail to recycle it anyway. + * + * We can save ourselves an expensive IPI by skipping + * any RMIDs that have not been queued for the minimum + * time. 
+ */ + min_queue_time = entry->queue_time + + msecs_to_jiffies(__rmid_queue_time_ms); + + if (time_after(min_queue_time, now)) + break; + + entry->state = RMID_AVAILABLE; + (*available)++; + } + + /* + * Fast return if none of the RMIDs on the limbo list have been + * sitting on the queue for the minimum queue time. + */ + if (!*available) + return false; + + /* + * Test whether an RMID is free for each package. + */ + on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true); + + list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) { + /* + * Exhausted all RMIDs that have waited min queue time. + */ + if (entry->state == RMID_YOUNG) + break; + + if (entry->state == RMID_DIRTY) + continue; + + list_del(&entry->list); /* remove from limbo */ + + /* + * The rotation RMID gets priority if it's + * currently invalid. In which case, skip adding + * the RMID to the the free lru. + */ + if (!__rmid_valid(intel_cqm_rotation_rmid)) { + intel_cqm_rotation_rmid = entry->rmid; + continue; + } + + /* + * If we have groups waiting for RMIDs, hand + * them one now provided they don't conflict. + */ + if (intel_cqm_sched_in_event(entry->rmid)) + continue; + + /* + * Otherwise place it onto the free list. + */ + list_add_tail(&entry->list, &cqm_rmid_free_lru); + } + + + return __rmid_valid(intel_cqm_rotation_rmid); +} + +/* + * Pick a victim group and move it to the tail of the group list. + * @next: The first group without an RMID + */ +static void __intel_cqm_pick_and_rotate(struct perf_event *next) +{ + struct perf_event *rotor; + u32 rmid; + + lockdep_assert_held(&cache_mutex); + + rotor = list_first_entry(&cache_groups, struct perf_event, + hw.cqm_groups_entry); + + /* + * The group at the front of the list should always have a valid + * RMID. If it doesn't then no groups have RMIDs assigned and we + * don't need to rotate the list. + */ + if (next == rotor) + return; + + rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); + __put_rmid(rmid); + + list_rotate_left(&cache_groups); +} + +/* + * Deallocate the RMIDs from any events that conflict with @event, and + * place them on the back of the group list. + */ +static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) +{ + struct perf_event *group, *g; + u32 rmid; + + lockdep_assert_held(&cache_mutex); + + list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) { + if (group == event) + continue; + + rmid = group->hw.cqm_rmid; + + /* + * Skip events that don't have a valid RMID. + */ + if (!__rmid_valid(rmid)) + continue; + + /* + * No conflict? No problem! Leave the event alone. + */ + if (!__conflict_event(group, event)) + continue; + + intel_cqm_xchg_rmid(group, INVALID_RMID); + __put_rmid(rmid); + } +} + +/* + * Attempt to rotate the groups and assign new RMIDs. + * + * We rotate for two reasons, + * 1. To handle the scheduling of conflicting events + * 2. To recycle RMIDs + * + * Rotating RMIDs is complicated because the hardware doesn't give us + * any clues. + * + * There's problems with the hardware interface; when you change the + * task:RMID map cachelines retain their 'old' tags, giving a skewed + * picture. In order to work around this, we must always keep one free + * RMID - intel_cqm_rotation_rmid. + * + * Rotation works by taking away an RMID from a group (the old RMID), + * and assigning the free RMID to another group (the new RMID). We must + * then wait for the old RMID to not be used (no cachelines tagged). + * This ensure that all cachelines are tagged with 'active' RMIDs. 
At + * this point we can start reading values for the new RMID and treat the + * old RMID as the free RMID for the next rotation. + * + * Return %true or %false depending on whether we did any rotating. + */ +static bool __intel_cqm_rmid_rotate(void) +{ + struct perf_event *group, *start = NULL; + unsigned int threshold_limit; + unsigned int nr_needed = 0; + unsigned int nr_available; + bool rotated = false; + + mutex_lock(&cache_mutex); + +again: + /* + * Fast path through this function if there are no groups and no + * RMIDs that need cleaning. + */ + if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru)) + goto out; + + list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) { + if (!__rmid_valid(group->hw.cqm_rmid)) { + if (!start) + start = group; + nr_needed++; + } + } + + /* + * We have some event groups, but they all have RMIDs assigned + * and no RMIDs need cleaning. + */ + if (!nr_needed && list_empty(&cqm_rmid_limbo_lru)) + goto out; + + if (!nr_needed) + goto stabilize; + + /* + * We have more event groups without RMIDs than available RMIDs, + * or we have event groups that conflict with the ones currently + * scheduled. + * + * We force deallocate the rmid of the group at the head of + * cache_groups. The first event group without an RMID then gets + * assigned intel_cqm_rotation_rmid. This ensures we always make + * forward progress. + * + * Rotate the cache_groups list so the previous head is now the + * tail. + */ + __intel_cqm_pick_and_rotate(start); + + /* + * If the rotation is going to succeed, reduce the threshold so + * that we don't needlessly reuse dirty RMIDs. + */ + if (__rmid_valid(intel_cqm_rotation_rmid)) { + intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid); + intel_cqm_rotation_rmid = __get_rmid(); + + intel_cqm_sched_out_conflicting_events(start); + + if (__intel_cqm_threshold) + __intel_cqm_threshold--; + } + + rotated = true; + +stabilize: + /* + * We now need to stablize the RMID we freed above (if any) to + * ensure that the next time we rotate we have an RMID with zero + * occupancy value. + * + * Alternatively, if we didn't need to perform any rotation, + * we'll have a bunch of RMIDs in limbo that need stabilizing. + */ + threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale; + + while (intel_cqm_rmid_stabilize(&nr_available) && + __intel_cqm_threshold < threshold_limit) { + unsigned int steal_limit; + + /* + * Don't spin if nobody is actively waiting for an RMID, + * the rotation worker will be kicked as soon as an + * event needs an RMID anyway. + */ + if (!nr_needed) + break; + + /* Allow max 25% of RMIDs to be in limbo. */ + steal_limit = (cqm_max_rmid + 1) / 4; + + /* + * We failed to stabilize any RMIDs so our rotation + * logic is now stuck. In order to make forward progress + * we have a few options: + * + * 1. rotate ("steal") another RMID + * 2. increase the threshold + * 3. do nothing + * + * We do both of 1. and 2. until we hit the steal limit. + * + * The steal limit prevents all RMIDs ending up on the + * limbo list. This can happen if every RMID has a + * non-zero occupancy above threshold_limit, and the + * occupancy values aren't dropping fast enough. + * + * Note that there is prioritisation at work here - we'd + * rather increase the number of RMIDs on the limbo list + * than increase the threshold, because increasing the + * threshold skews the event data (because we reuse + * dirty RMIDs) - threshold bumps are a last resort. 
+ */ + if (nr_available < steal_limit) + goto again; + + __intel_cqm_threshold++; + } + +out: + mutex_unlock(&cache_mutex); + return rotated; +} + +static void intel_cqm_rmid_rotate(struct work_struct *work); + +static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate); + +static struct pmu intel_cqm_pmu; + +static void intel_cqm_rmid_rotate(struct work_struct *work) +{ + unsigned long delay; + + __intel_cqm_rmid_rotate(); + + delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms); + schedule_delayed_work(&intel_cqm_rmid_work, delay); +} + +/* + * Find a group and setup RMID. + * + * If we're part of a group, we use the group's RMID. + */ +static void intel_cqm_setup_event(struct perf_event *event, + struct perf_event **group) +{ + struct perf_event *iter; + bool conflict = false; + u32 rmid; + + list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { + rmid = iter->hw.cqm_rmid; + + if (__match_event(iter, event)) { + /* All tasks in a group share an RMID */ + event->hw.cqm_rmid = rmid; + *group = iter; + return; + } + + /* + * We only care about conflicts for events that are + * actually scheduled in (and hence have a valid RMID). + */ + if (__conflict_event(iter, event) && __rmid_valid(rmid)) + conflict = true; + } + + if (conflict) + rmid = INVALID_RMID; + else + rmid = __get_rmid(); + + event->hw.cqm_rmid = rmid; +} + +static void intel_cqm_event_read(struct perf_event *event) +{ + unsigned long flags; + u32 rmid; + u64 val; + + /* + * Task events are handled by intel_cqm_event_count(). + */ + if (event->cpu == -1) + return; + + raw_spin_lock_irqsave(&cache_lock, flags); + rmid = event->hw.cqm_rmid; + + if (!__rmid_valid(rmid)) + goto out; + + val = __rmid_read(rmid); + + /* + * Ignore this reading on error states and do not update the value. + */ + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + goto out; + + local64_set(&event->count, val); +out: + raw_spin_unlock_irqrestore(&cache_lock, flags); +} + +static void __intel_cqm_event_count(void *info) +{ + struct rmid_read *rr = info; + u64 val; + + val = __rmid_read(rr->rmid); + + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + return; + + atomic64_add(val, &rr->value); +} + +static inline bool cqm_group_leader(struct perf_event *event) +{ + return !list_empty(&event->hw.cqm_groups_entry); +} + +static u64 intel_cqm_event_count(struct perf_event *event) +{ + unsigned long flags; + struct rmid_read rr = { + .value = ATOMIC64_INIT(0), + }; + + /* + * We only need to worry about task events. System-wide events + * are handled like usual, i.e. entirely with + * intel_cqm_event_read(). + */ + if (event->cpu != -1) + return __perf_event_count(event); + + /* + * Only the group leader gets to report values. This stops us + * reporting duplicate values to userspace, and gives us a clear + * rule for which task gets to report the values. + * + * Note that it is impossible to attribute these values to + * specific packages - we forfeit that ability when we create + * task events. + */ + if (!cqm_group_leader(event)) + return 0; + + /* + * Getting up-to-date values requires an SMP IPI which is not + * possible if we're being called in interrupt context. Return + * the cached values instead. + */ + if (unlikely(in_interrupt())) + goto out; + + /* + * Notice that we don't perform the reading of an RMID + * atomically, because we can't hold a spin lock across the + * IPIs. 
+ * + * Speculatively perform the read, since @event might be + * assigned a different (possibly invalid) RMID while we're + * busying performing the IPI calls. It's therefore necessary to + * check @event's RMID afterwards, and if it has changed, + * discard the result of the read. + */ + rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid); + + if (!__rmid_valid(rr.rmid)) + goto out; + + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); + + raw_spin_lock_irqsave(&cache_lock, flags); + if (event->hw.cqm_rmid == rr.rmid) + local64_set(&event->count, atomic64_read(&rr.value)); + raw_spin_unlock_irqrestore(&cache_lock, flags); +out: + return __perf_event_count(event); +} + +static void intel_cqm_event_start(struct perf_event *event, int mode) +{ + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + u32 rmid = event->hw.cqm_rmid; + + if (!(event->hw.cqm_state & PERF_HES_STOPPED)) + return; + + event->hw.cqm_state &= ~PERF_HES_STOPPED; + + if (state->rmid_usecnt++) { + if (!WARN_ON_ONCE(state->rmid != rmid)) + return; + } else { + WARN_ON_ONCE(state->rmid); + } + + state->rmid = rmid; + wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid); +} + +static void intel_cqm_event_stop(struct perf_event *event, int mode) +{ + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + + if (event->hw.cqm_state & PERF_HES_STOPPED) + return; + + event->hw.cqm_state |= PERF_HES_STOPPED; + + intel_cqm_event_read(event); + + if (!--state->rmid_usecnt) { + state->rmid = 0; + wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid); + } else { + WARN_ON_ONCE(!state->rmid); + } +} + +static int intel_cqm_event_add(struct perf_event *event, int mode) +{ + unsigned long flags; + u32 rmid; + + raw_spin_lock_irqsave(&cache_lock, flags); + + event->hw.cqm_state = PERF_HES_STOPPED; + rmid = event->hw.cqm_rmid; + + if (__rmid_valid(rmid) && (mode & PERF_EF_START)) + intel_cqm_event_start(event, mode); + + raw_spin_unlock_irqrestore(&cache_lock, flags); + + return 0; +} + +static void intel_cqm_event_destroy(struct perf_event *event) +{ + struct perf_event *group_other = NULL; + + mutex_lock(&cache_mutex); + + /* + * If there's another event in this group... + */ + if (!list_empty(&event->hw.cqm_group_entry)) { + group_other = list_first_entry(&event->hw.cqm_group_entry, + struct perf_event, + hw.cqm_group_entry); + list_del(&event->hw.cqm_group_entry); + } + + /* + * And we're the group leader.. + */ + if (cqm_group_leader(event)) { + /* + * If there was a group_other, make that leader, otherwise + * destroy the group and return the RMID. 
+ */ + if (group_other) { + list_replace(&event->hw.cqm_groups_entry, + &group_other->hw.cqm_groups_entry); + } else { + u32 rmid = event->hw.cqm_rmid; + + if (__rmid_valid(rmid)) + __put_rmid(rmid); + list_del(&event->hw.cqm_groups_entry); + } + } + + mutex_unlock(&cache_mutex); +} + +static int intel_cqm_event_init(struct perf_event *event) +{ + struct perf_event *group = NULL; + bool rotate = false; + + if (event->attr.type != intel_cqm_pmu.type) + return -ENOENT; + + if (event->attr.config & ~QOS_EVENT_MASK) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + INIT_LIST_HEAD(&event->hw.cqm_group_entry); + INIT_LIST_HEAD(&event->hw.cqm_groups_entry); + + event->destroy = intel_cqm_event_destroy; + + mutex_lock(&cache_mutex); + + /* Will also set rmid */ + intel_cqm_setup_event(event, &group); + + if (group) { + list_add_tail(&event->hw.cqm_group_entry, + &group->hw.cqm_group_entry); + } else { + list_add_tail(&event->hw.cqm_groups_entry, + &cache_groups); + + /* + * All RMIDs are either in use or have recently been + * used. Kick the rotation worker to clean/free some. + * + * We only do this for the group leader, rather than for + * every event in a group to save on needless work. + */ + if (!__rmid_valid(event->hw.cqm_rmid)) + rotate = true; + } + + mutex_unlock(&cache_mutex); + + if (rotate) + schedule_delayed_work(&intel_cqm_rmid_work, 0); + + return 0; +} + +EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); +EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1"); +EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); +EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); +EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); + +static struct attribute *intel_cqm_events_attr[] = { + EVENT_PTR(intel_cqm_llc), + EVENT_PTR(intel_cqm_llc_pkg), + EVENT_PTR(intel_cqm_llc_unit), + EVENT_PTR(intel_cqm_llc_scale), + EVENT_PTR(intel_cqm_llc_snapshot), + NULL, +}; + +static struct attribute_group intel_cqm_events_group = { + .name = "events", + .attrs = intel_cqm_events_attr, +}; + +PMU_FORMAT_ATTR(event, "config:0-7"); +static struct attribute *intel_cqm_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group intel_cqm_format_group = { + .name = "format", + .attrs = intel_cqm_formats_attr, +}; + +static ssize_t +max_recycle_threshold_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + ssize_t rv; + + mutex_lock(&cache_mutex); + rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold); + mutex_unlock(&cache_mutex); + + return rv; +} + +static ssize_t +max_recycle_threshold_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned int bytes, cachelines; + int ret; + + ret = kstrtouint(buf, 0, &bytes); + if (ret) + return ret; + + mutex_lock(&cache_mutex); + + __intel_cqm_max_threshold = bytes; + cachelines = bytes / cqm_l3_scale; + + /* + * The new maximum takes effect immediately. 
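The store handler above accepts a byte value and converts it to cachelines. A minimal user-space sketch of driving that knob; the sysfs path is an assumption based on the PMU being registered as "intel_cqm", and 65536 is just an example value:

#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/bus/event_source/devices/intel_cqm/max_recycle_threshold";
	unsigned int bytes = 65536;	/* allow ~64KB residual occupancy */
	FILE *f = fopen(path, "w");

	if (!f)
		return 1;

	/* The driver divides this by cqm_l3_scale to get cachelines. */
	fprintf(f, "%u\n", bytes);
	fclose(f);
	return 0;
}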
+ */ + if (__intel_cqm_threshold > cachelines) + __intel_cqm_threshold = cachelines; + + mutex_unlock(&cache_mutex); + + return count; +} + +static DEVICE_ATTR_RW(max_recycle_threshold); + +static struct attribute *intel_cqm_attrs[] = { + &dev_attr_max_recycle_threshold.attr, + NULL, +}; + +static const struct attribute_group intel_cqm_group = { + .attrs = intel_cqm_attrs, +}; + +static const struct attribute_group *intel_cqm_attr_groups[] = { + &intel_cqm_events_group, + &intel_cqm_format_group, + &intel_cqm_group, + NULL, +}; + +static struct pmu intel_cqm_pmu = { + .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME, + .attr_groups = intel_cqm_attr_groups, + .task_ctx_nr = perf_sw_context, + .event_init = intel_cqm_event_init, + .add = intel_cqm_event_add, + .del = intel_cqm_event_stop, + .start = intel_cqm_event_start, + .stop = intel_cqm_event_stop, + .read = intel_cqm_event_read, + .count = intel_cqm_event_count, +}; + +static inline void cqm_pick_event_reader(int cpu) +{ + int phys_id = topology_physical_package_id(cpu); + int i; + + for_each_cpu(i, &cqm_cpumask) { + if (phys_id == topology_physical_package_id(i)) + return; /* already got reader for this socket */ + } + + cpumask_set_cpu(cpu, &cqm_cpumask); +} + +static void intel_cqm_cpu_starting(unsigned int cpu) +{ + struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); + struct cpuinfo_x86 *c = &cpu_data(cpu); + + state->rmid = 0; + state->closid = 0; + state->rmid_usecnt = 0; + + WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); + WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); +} + +static void intel_cqm_cpu_exit(unsigned int cpu) +{ + int phys_id = topology_physical_package_id(cpu); + int i; + + /* + * Is @cpu a designated cqm reader? + */ + if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) + return; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + + if (phys_id == topology_physical_package_id(i)) { + cpumask_set_cpu(i, &cqm_cpumask); + break; + } + } +} + +static int intel_cqm_cpu_notifier(struct notifier_block *nb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + intel_cqm_cpu_exit(cpu); + break; + case CPU_STARTING: + intel_cqm_cpu_starting(cpu); + cqm_pick_event_reader(cpu); + break; + } + + return NOTIFY_OK; +} + +static const struct x86_cpu_id intel_cqm_match[] = { + { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC }, + {} +}; + +static int __init intel_cqm_init(void) +{ + char *str, scale[20]; + int i, cpu, ret; + + if (!x86_match_cpu(intel_cqm_match)) + return -ENODEV; + + cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; + + /* + * It's possible that not all resources support the same number + * of RMIDs. Instead of making scheduling much more complicated + * (where we have to match a task's RMID to a cpu that supports + * that many RMIDs) just find the minimum RMIDs supported across + * all cpus. + * + * Also, check that the scales match on all cpus. + */ + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86_cache_max_rmid < cqm_max_rmid) + cqm_max_rmid = c->x86_cache_max_rmid; + + if (c->x86_cache_occ_scale != cqm_l3_scale) { + pr_err("Multiple LLC scale values, disabling\n"); + ret = -EINVAL; + goto out; + } + } + + /* + * A reasonable upper limit on the max threshold is the number + * of lines tagged per RMID if all RMIDs have the same number of + * lines tagged in the LLC. 
+ * + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. + */ + __intel_cqm_max_threshold = + boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1); + + snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); + str = kstrdup(scale, GFP_KERNEL); + if (!str) { + ret = -ENOMEM; + goto out; + } + + event_attr_intel_cqm_llc_scale.event_str = str; + + ret = intel_cqm_setup_rmid_cache(); + if (ret) + goto out; + + for_each_online_cpu(i) { + intel_cqm_cpu_starting(i); + cqm_pick_event_reader(i); + } + + __perf_cpu_notifier(intel_cqm_cpu_notifier); + + ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); + if (ret) + pr_err("Intel CQM perf registration failed: %d\n", ret); + else + pr_info("Intel CQM monitoring enabled\n"); + +out: + cpu_notifier_register_done(); + + return ret; +} +device_initcall(intel_cqm_init); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 0a27243..64781cf 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_cstate.o diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c deleted file mode 100644 index a316ca9..0000000 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ /dev/null @@ -1,1391 +0,0 @@ -/* - * Intel Cache Quality-of-Service Monitoring (CQM) support. - * - * Based very, very heavily on work by Peter Zijlstra. - */ - -#include -#include -#include -#include "perf_event.h" - -#define MSR_IA32_PQR_ASSOC 0x0c8f -#define MSR_IA32_QM_CTR 0x0c8e -#define MSR_IA32_QM_EVTSEL 0x0c8d - -static u32 cqm_max_rmid = -1; -static unsigned int cqm_l3_scale; /* supposedly cacheline size */ - -/** - * struct intel_pqr_state - State cache for the PQR MSR - * @rmid: The cached Resource Monitoring ID - * @closid: The cached Class Of Service ID - * @rmid_usecnt: The usage counter for rmid - * - * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the - * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always - * contains both parts, so we need to cache them. - * - * The cache also helps to avoid pointless updates if the value does - * not change. - */ -struct intel_pqr_state { - u32 rmid; - u32 closid; - int rmid_usecnt; -}; - -/* - * The cached intel_pqr_state is strictly per CPU and can never be - * updated from a remote CPU. Both functions which modify the state - * (intel_cqm_event_start and intel_cqm_event_stop) are called with - * interrupts disabled, which is sufficient for the protection. - */ -static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); - -/* - * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. - * Also protects event->hw.cqm_rmid - * - * Hold either for stability, both for modification of ->hw.cqm_rmid. - */ -static DEFINE_MUTEX(cache_mutex); -static DEFINE_RAW_SPINLOCK(cache_lock); - -/* - * Groups of events that have the same target(s), one RMID per group. - */ -static LIST_HEAD(cache_groups); - -/* - * Mask of CPUs for reading CQM values. We only need one per-socket. 
- */ -static cpumask_t cqm_cpumask; - -#define RMID_VAL_ERROR (1ULL << 63) -#define RMID_VAL_UNAVAIL (1ULL << 62) - -#define QOS_L3_OCCUP_EVENT_ID (1 << 0) - -#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID - -/* - * This is central to the rotation algorithm in __intel_cqm_rmid_rotate(). - * - * This rmid is always free and is guaranteed to have an associated - * near-zero occupancy value, i.e. no cachelines are tagged with this - * RMID, once __intel_cqm_rmid_rotate() returns. - */ -static u32 intel_cqm_rotation_rmid; - -#define INVALID_RMID (-1) - -/* - * Is @rmid valid for programming the hardware? - * - * rmid 0 is reserved by the hardware for all non-monitored tasks, which - * means that we should never come across an rmid with that value. - * Likewise, an rmid value of -1 is used to indicate "no rmid currently - * assigned" and is used as part of the rotation code. - */ -static inline bool __rmid_valid(u32 rmid) -{ - if (!rmid || rmid == INVALID_RMID) - return false; - - return true; -} - -static u64 __rmid_read(u32 rmid) -{ - u64 val; - - /* - * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt, - * it just says that to increase confusion. - */ - wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid); - rdmsrl(MSR_IA32_QM_CTR, val); - - /* - * Aside from the ERROR and UNAVAIL bits, assume this thing returns - * the number of cachelines tagged with @rmid. - */ - return val; -} - -enum rmid_recycle_state { - RMID_YOUNG = 0, - RMID_AVAILABLE, - RMID_DIRTY, -}; - -struct cqm_rmid_entry { - u32 rmid; - enum rmid_recycle_state state; - struct list_head list; - unsigned long queue_time; -}; - -/* - * cqm_rmid_free_lru - A least recently used list of RMIDs. - * - * Oldest entry at the head, newest (most recently used) entry at the - * tail. This list is never traversed, it's only used to keep track of - * the lru order. That is, we only pick entries of the head or insert - * them on the tail. - * - * All entries on the list are 'free', and their RMIDs are not currently - * in use. To mark an RMID as in use, remove its entry from the lru - * list. - * - * - * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs. - * - * This list is contains RMIDs that no one is currently using but that - * may have a non-zero occupancy value associated with them. The - * rotation worker moves RMIDs from the limbo list to the free list once - * the occupancy value drops below __intel_cqm_threshold. - * - * Both lists are protected by cache_mutex. - */ -static LIST_HEAD(cqm_rmid_free_lru); -static LIST_HEAD(cqm_rmid_limbo_lru); - -/* - * We use a simple array of pointers so that we can lookup a struct - * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid() - * and __put_rmid() from having to worry about dealing with struct - * cqm_rmid_entry - they just deal with rmids, i.e. integers. - * - * Once this array is initialized it is read-only. No locks are required - * to access it. - * - * All entries for all RMIDs can be looked up in the this array at all - * times. - */ -static struct cqm_rmid_entry **cqm_rmid_ptrs; - -static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - entry = cqm_rmid_ptrs[rmid]; - WARN_ON(entry->rmid != rmid); - - return entry; -} - -/* - * Returns < 0 on fail. - * - * We expect to be called with cache_mutex held. 
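/*
 * For illustration: a hedged userspace sketch of the __rmid_read()
 * sequence above, using the msr driver's /dev/cpu/N/msr interface in
 * place of wrmsr()/rdmsrl().  It needs root and a CQM-capable CPU,
 * and poking QM_EVTSEL behind the kernel's back is only acceptable
 * for experiments; the RMID value 1 used below is arbitrary.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_IA32_QM_EVTSEL      0x0c8d
#define MSR_IA32_QM_CTR         0x0c8e
#define QOS_L3_OCCUP_EVENT_ID   1ULL
#define RMID_VAL_ERROR          (1ULL << 63)
#define RMID_VAL_UNAVAIL        (1ULL << 62)

static int read_llc_occupancy(int msr_fd, uint32_t rmid, uint64_t *raw)
{
        /* EVTSEL takes the event id in the low word and the RMID in bits 32+. */
        uint64_t evtsel = ((uint64_t)rmid << 32) | QOS_L3_OCCUP_EVENT_ID;

        if (pwrite(msr_fd, &evtsel, sizeof(evtsel), MSR_IA32_QM_EVTSEL) != sizeof(evtsel))
                return -1;
        if (pread(msr_fd, raw, sizeof(*raw), MSR_IA32_QM_CTR) != sizeof(*raw))
                return -1;
        if (*raw & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
                return -1;
        return 0;       /* caller scales *raw by x86_cache_occ_scale */
}

int main(void)
{
        uint64_t raw;
        int fd = open("/dev/cpu/0/msr", O_RDWR);

        if (fd < 0)
                return 1;
        if (!read_llc_occupancy(fd, 1, &raw))
                printf("RMID 1 occupancy (unscaled): %llu\n",
                       (unsigned long long)raw);
        close(fd);
        return 0;
}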
- */ -static u32 __get_rmid(void) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - if (list_empty(&cqm_rmid_free_lru)) - return INVALID_RMID; - - entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list); - list_del(&entry->list); - - return entry->rmid; -} - -static void __put_rmid(u32 rmid) -{ - struct cqm_rmid_entry *entry; - - lockdep_assert_held(&cache_mutex); - - WARN_ON(!__rmid_valid(rmid)); - entry = __rmid_entry(rmid); - - entry->queue_time = jiffies; - entry->state = RMID_YOUNG; - - list_add_tail(&entry->list, &cqm_rmid_limbo_lru); -} - -static int intel_cqm_setup_rmid_cache(void) -{ - struct cqm_rmid_entry *entry; - unsigned int nr_rmids; - int r = 0; - - nr_rmids = cqm_max_rmid + 1; - cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) * - nr_rmids, GFP_KERNEL); - if (!cqm_rmid_ptrs) - return -ENOMEM; - - for (; r <= cqm_max_rmid; r++) { - struct cqm_rmid_entry *entry; - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - goto fail; - - INIT_LIST_HEAD(&entry->list); - entry->rmid = r; - cqm_rmid_ptrs[r] = entry; - - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - /* - * RMID 0 is special and is always allocated. It's used for all - * tasks that are not monitored. - */ - entry = __rmid_entry(0); - list_del(&entry->list); - - mutex_lock(&cache_mutex); - intel_cqm_rotation_rmid = __get_rmid(); - mutex_unlock(&cache_mutex); - - return 0; -fail: - while (r--) - kfree(cqm_rmid_ptrs[r]); - - kfree(cqm_rmid_ptrs); - return -ENOMEM; -} - -/* - * Determine if @a and @b measure the same set of tasks. - * - * If @a and @b measure the same set of tasks then we want to share a - * single RMID. - */ -static bool __match_event(struct perf_event *a, struct perf_event *b) -{ - /* Per-cpu and task events don't mix */ - if ((a->attach_state & PERF_ATTACH_TASK) != - (b->attach_state & PERF_ATTACH_TASK)) - return false; - -#ifdef CONFIG_CGROUP_PERF - if (a->cgrp != b->cgrp) - return false; -#endif - - /* If not task event, we're machine wide */ - if (!(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Events that target same task are placed into the same cache group. - */ - if (a->hw.target == b->hw.target) - return true; - - /* - * Are we an inherited event? - */ - if (b->parent == a) - return true; - - return false; -} - -#ifdef CONFIG_CGROUP_PERF -static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) -{ - if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.target, event->ctx); - - return event->cgrp; -} -#endif - -/* - * Determine if @a's tasks intersect with @b's tasks - * - * There are combinations of events that we explicitly prohibit, - * - * PROHIBITS - * system-wide -> cgroup and task - * cgroup -> system-wide - * -> task in cgroup - * task -> system-wide - * -> task in cgroup - * - * Call this function before allocating an RMID. - */ -static bool __conflict_event(struct perf_event *a, struct perf_event *b) -{ -#ifdef CONFIG_CGROUP_PERF - /* - * We can have any number of cgroups but only one system-wide - * event at a time. - */ - if (a->cgrp && b->cgrp) { - struct perf_cgroup *ac = a->cgrp; - struct perf_cgroup *bc = b->cgrp; - - /* - * This condition should have been caught in - * __match_event() and we should be sharing an RMID. 
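/*
 * For illustration: stripped of the CONFIG_CGROUP_PERF and event
 * inheritance details, the grouping rules of __match_event() and the
 * PROHIBITS table above reduce to a small decision table.  A toy
 * sketch in which an event is just (task?, target id) and cgroups do
 * not exist:
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_event {
        bool task;      /* PERF_ATTACH_TASK set? */
        int target;     /* task id; ignored for system-wide events */
};

/* Same set of tasks => share one RMID (analogue of __match_event()). */
static bool toy_match(struct toy_event a, struct toy_event b)
{
        if (a.task != b.task)
                return false;           /* per-cpu and task events don't mix */
        if (!a.task)
                return true;            /* both machine-wide */
        return a.target == b.target;    /* both monitor the same task */
}

/* Overlapping-but-different task sets => conflict (__conflict_event()). */
static bool toy_conflict(struct toy_event a, struct toy_event b)
{
        if (!a.task || !b.task)
                return true;            /* system-wide overlaps everything */
        return false;                   /* two distinct tasks never overlap */
}

int main(void)
{
        struct toy_event sys = { .task = false };
        struct toy_event t1  = { .task = true, .target = 1 };
        struct toy_event t2  = { .task = true, .target = 2 };

        printf("sys/t1: match=%d conflict=%d\n", toy_match(sys, t1), toy_conflict(sys, t1));
        printf("t1/t1:  match=%d conflict=%d\n", toy_match(t1, t1),  toy_conflict(t1, t1));
        printf("t1/t2:  match=%d conflict=%d\n", toy_match(t1, t2),  toy_conflict(t1, t2));
        return 0;
}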
- */ - WARN_ON_ONCE(ac == bc); - - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } - - if (a->cgrp || b->cgrp) { - struct perf_cgroup *ac, *bc; - - /* - * cgroup and system-wide events are mutually exclusive - */ - if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || - (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) - return true; - - /* - * Ensure neither event is part of the other's cgroup - */ - ac = event_to_cgroup(a); - bc = event_to_cgroup(b); - if (ac == bc) - return true; - - /* - * Must have cgroup and non-intersecting task events. - */ - if (!ac || !bc) - return false; - - /* - * We have cgroup and task events, and the task belongs - * to a cgroup. Check for for overlap. - */ - if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || - cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) - return true; - - return false; - } -#endif - /* - * If one of them is not a task, same story as above with cgroups. - */ - if (!(a->attach_state & PERF_ATTACH_TASK) || - !(b->attach_state & PERF_ATTACH_TASK)) - return true; - - /* - * Must be non-overlapping. - */ - return false; -} - -struct rmid_read { - u32 rmid; - atomic64_t value; -}; - -static void __intel_cqm_event_count(void *info); - -/* - * Exchange the RMID of a group of events. - */ -static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) -{ - struct perf_event *event; - struct list_head *head = &group->hw.cqm_group_entry; - u32 old_rmid = group->hw.cqm_rmid; - - lockdep_assert_held(&cache_mutex); - - /* - * If our RMID is being deallocated, perform a read now. - */ - if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) { - struct rmid_read rr = { - .value = ATOMIC64_INIT(0), - .rmid = old_rmid, - }; - - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, - &rr, 1); - local64_set(&group->count, atomic64_read(&rr.value)); - } - - raw_spin_lock_irq(&cache_lock); - - group->hw.cqm_rmid = rmid; - list_for_each_entry(event, head, hw.cqm_group_entry) - event->hw.cqm_rmid = rmid; - - raw_spin_unlock_irq(&cache_lock); - - return old_rmid; -} - -/* - * If we fail to assign a new RMID for intel_cqm_rotation_rmid because - * cachelines are still tagged with RMIDs in limbo, we progressively - * increment the threshold until we find an RMID in limbo with <= - * __intel_cqm_threshold lines tagged. This is designed to mitigate the - * problem where cachelines tagged with an RMID are not steadily being - * evicted. - * - * On successful rotations we decrease the threshold back towards zero. - * - * __intel_cqm_max_threshold provides an upper bound on the threshold, - * and is measured in bytes because it's exposed to userland. - */ -static unsigned int __intel_cqm_threshold; -static unsigned int __intel_cqm_max_threshold; - -/* - * Test whether an RMID has a zero occupancy value on this cpu. - */ -static void intel_cqm_stable(void *arg) -{ - struct cqm_rmid_entry *entry; - - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - if (entry->state != RMID_AVAILABLE) - break; - - if (__rmid_read(entry->rmid) > __intel_cqm_threshold) - entry->state = RMID_DIRTY; - } -} - -/* - * If we have group events waiting for an RMID that don't conflict with - * events already running, assign @rmid. 
- */ -static bool intel_cqm_sched_in_event(u32 rmid) -{ - struct perf_event *leader, *event; - - lockdep_assert_held(&cache_mutex); - - leader = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - event = leader; - - list_for_each_entry_continue(event, &cache_groups, - hw.cqm_groups_entry) { - if (__rmid_valid(event->hw.cqm_rmid)) - continue; - - if (__conflict_event(event, leader)) - continue; - - intel_cqm_xchg_rmid(event, rmid); - return true; - } - - return false; -} - -/* - * Initially use this constant for both the limbo queue time and the - * rotation timer interval, pmu::hrtimer_interval_ms. - * - * They don't need to be the same, but the two are related since if you - * rotate faster than you recycle RMIDs, you may run out of available - * RMIDs. - */ -#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */ - -static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME; - -/* - * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list - * @nr_available: number of freeable RMIDs on the limbo list - * - * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no - * cachelines are tagged with those RMIDs. After this we can reuse them - * and know that the current set of active RMIDs is stable. - * - * Return %true or %false depending on whether stabilization needs to be - * reattempted. - * - * If we return %true then @nr_available is updated to indicate the - * number of RMIDs on the limbo list that have been queued for the - * minimum queue time (RMID_AVAILABLE), but whose data occupancy values - * are above __intel_cqm_threshold. - */ -static bool intel_cqm_rmid_stabilize(unsigned int *available) -{ - struct cqm_rmid_entry *entry, *tmp; - - lockdep_assert_held(&cache_mutex); - - *available = 0; - list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) { - unsigned long min_queue_time; - unsigned long now = jiffies; - - /* - * We hold RMIDs placed into limbo for a minimum queue - * time. Before the minimum queue time has elapsed we do - * not recycle RMIDs. - * - * The reasoning is that until a sufficient time has - * passed since we stopped using an RMID, any RMID - * placed onto the limbo list will likely still have - * data tagged in the cache, which means we'll probably - * fail to recycle it anyway. - * - * We can save ourselves an expensive IPI by skipping - * any RMIDs that have not been queued for the minimum - * time. - */ - min_queue_time = entry->queue_time + - msecs_to_jiffies(__rmid_queue_time_ms); - - if (time_after(min_queue_time, now)) - break; - - entry->state = RMID_AVAILABLE; - (*available)++; - } - - /* - * Fast return if none of the RMIDs on the limbo list have been - * sitting on the queue for the minimum queue time. - */ - if (!*available) - return false; - - /* - * Test whether an RMID is free for each package. - */ - on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true); - - list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) { - /* - * Exhausted all RMIDs that have waited min queue time. - */ - if (entry->state == RMID_YOUNG) - break; - - if (entry->state == RMID_DIRTY) - continue; - - list_del(&entry->list); /* remove from limbo */ - - /* - * The rotation RMID gets priority if it's - * currently invalid. In which case, skip adding - * the RMID to the the free lru. - */ - if (!__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_rotation_rmid = entry->rmid; - continue; - } - - /* - * If we have groups waiting for RMIDs, hand - * them one now provided they don't conflict. 
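/*
 * For illustration: a compact userspace model of the recycling life
 * cycle driven by __put_rmid() and intel_cqm_rmid_stabilize() above:
 * retire an RMID onto a limbo list with a timestamp, and only return
 * it to the free pool once it has aged past the minimum queue time
 * and its (here simulated) occupancy is at or below the threshold.
 */
#include <stdio.h>

#define NR_TOY_RMIDS            8
#define TOY_MIN_QUEUE_TIME      250     /* stand-in for the jiffies math */

enum toy_state { TOY_FREE, TOY_IN_USE, TOY_LIMBO };

struct toy_rmid {
        enum toy_state state;
        unsigned long queue_time;
        unsigned long occupancy;        /* stand-in for __rmid_read() */
};

static struct toy_rmid toy[NR_TOY_RMIDS];

static void toy_put_rmid(int r, unsigned long now)
{
        toy[r].state = TOY_LIMBO;
        toy[r].queue_time = now;
}

/* Analogue of intel_cqm_rmid_stabilize(): limbo -> free when aged and clean. */
static unsigned int toy_stabilize(unsigned long now, unsigned long threshold)
{
        unsigned int freed = 0;

        for (int r = 1; r < NR_TOY_RMIDS; r++) {        /* RMID 0 is reserved */
                if (toy[r].state != TOY_LIMBO)
                        continue;
                if (now - toy[r].queue_time < TOY_MIN_QUEUE_TIME)
                        continue;                       /* not aged yet */
                if (toy[r].occupancy > threshold)
                        continue;                       /* still dirty */
                toy[r].state = TOY_FREE;
                freed++;
        }
        return freed;
}

int main(void)
{
        toy[3].state = TOY_IN_USE;
        toy_put_rmid(3, 0);                     /* event goes away at t=0 */
        toy[3].occupancy = 12;

        printf("freed at t=100: %u\n", toy_stabilize(100, 16)); /* too young */
        printf("freed at t=300: %u\n", toy_stabilize(300, 16)); /* recycled  */
        return 0;
}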
- */ - if (intel_cqm_sched_in_event(entry->rmid)) - continue; - - /* - * Otherwise place it onto the free list. - */ - list_add_tail(&entry->list, &cqm_rmid_free_lru); - } - - - return __rmid_valid(intel_cqm_rotation_rmid); -} - -/* - * Pick a victim group and move it to the tail of the group list. - * @next: The first group without an RMID - */ -static void __intel_cqm_pick_and_rotate(struct perf_event *next) -{ - struct perf_event *rotor; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - rotor = list_first_entry(&cache_groups, struct perf_event, - hw.cqm_groups_entry); - - /* - * The group at the front of the list should always have a valid - * RMID. If it doesn't then no groups have RMIDs assigned and we - * don't need to rotate the list. - */ - if (next == rotor) - return; - - rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID); - __put_rmid(rmid); - - list_rotate_left(&cache_groups); -} - -/* - * Deallocate the RMIDs from any events that conflict with @event, and - * place them on the back of the group list. - */ -static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) -{ - struct perf_event *group, *g; - u32 rmid; - - lockdep_assert_held(&cache_mutex); - - list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) { - if (group == event) - continue; - - rmid = group->hw.cqm_rmid; - - /* - * Skip events that don't have a valid RMID. - */ - if (!__rmid_valid(rmid)) - continue; - - /* - * No conflict? No problem! Leave the event alone. - */ - if (!__conflict_event(group, event)) - continue; - - intel_cqm_xchg_rmid(group, INVALID_RMID); - __put_rmid(rmid); - } -} - -/* - * Attempt to rotate the groups and assign new RMIDs. - * - * We rotate for two reasons, - * 1. To handle the scheduling of conflicting events - * 2. To recycle RMIDs - * - * Rotating RMIDs is complicated because the hardware doesn't give us - * any clues. - * - * There's problems with the hardware interface; when you change the - * task:RMID map cachelines retain their 'old' tags, giving a skewed - * picture. In order to work around this, we must always keep one free - * RMID - intel_cqm_rotation_rmid. - * - * Rotation works by taking away an RMID from a group (the old RMID), - * and assigning the free RMID to another group (the new RMID). We must - * then wait for the old RMID to not be used (no cachelines tagged). - * This ensure that all cachelines are tagged with 'active' RMIDs. At - * this point we can start reading values for the new RMID and treat the - * old RMID as the free RMID for the next rotation. - * - * Return %true or %false depending on whether we did any rotating. - */ -static bool __intel_cqm_rmid_rotate(void) -{ - struct perf_event *group, *start = NULL; - unsigned int threshold_limit; - unsigned int nr_needed = 0; - unsigned int nr_available; - bool rotated = false; - - mutex_lock(&cache_mutex); - -again: - /* - * Fast path through this function if there are no groups and no - * RMIDs that need cleaning. - */ - if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) { - if (!__rmid_valid(group->hw.cqm_rmid)) { - if (!start) - start = group; - nr_needed++; - } - } - - /* - * We have some event groups, but they all have RMIDs assigned - * and no RMIDs need cleaning. 
- */ - if (!nr_needed && list_empty(&cqm_rmid_limbo_lru)) - goto out; - - if (!nr_needed) - goto stabilize; - - /* - * We have more event groups without RMIDs than available RMIDs, - * or we have event groups that conflict with the ones currently - * scheduled. - * - * We force deallocate the rmid of the group at the head of - * cache_groups. The first event group without an RMID then gets - * assigned intel_cqm_rotation_rmid. This ensures we always make - * forward progress. - * - * Rotate the cache_groups list so the previous head is now the - * tail. - */ - __intel_cqm_pick_and_rotate(start); - - /* - * If the rotation is going to succeed, reduce the threshold so - * that we don't needlessly reuse dirty RMIDs. - */ - if (__rmid_valid(intel_cqm_rotation_rmid)) { - intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid); - intel_cqm_rotation_rmid = __get_rmid(); - - intel_cqm_sched_out_conflicting_events(start); - - if (__intel_cqm_threshold) - __intel_cqm_threshold--; - } - - rotated = true; - -stabilize: - /* - * We now need to stablize the RMID we freed above (if any) to - * ensure that the next time we rotate we have an RMID with zero - * occupancy value. - * - * Alternatively, if we didn't need to perform any rotation, - * we'll have a bunch of RMIDs in limbo that need stabilizing. - */ - threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale; - - while (intel_cqm_rmid_stabilize(&nr_available) && - __intel_cqm_threshold < threshold_limit) { - unsigned int steal_limit; - - /* - * Don't spin if nobody is actively waiting for an RMID, - * the rotation worker will be kicked as soon as an - * event needs an RMID anyway. - */ - if (!nr_needed) - break; - - /* Allow max 25% of RMIDs to be in limbo. */ - steal_limit = (cqm_max_rmid + 1) / 4; - - /* - * We failed to stabilize any RMIDs so our rotation - * logic is now stuck. In order to make forward progress - * we have a few options: - * - * 1. rotate ("steal") another RMID - * 2. increase the threshold - * 3. do nothing - * - * We do both of 1. and 2. until we hit the steal limit. - * - * The steal limit prevents all RMIDs ending up on the - * limbo list. This can happen if every RMID has a - * non-zero occupancy above threshold_limit, and the - * occupancy values aren't dropping fast enough. - * - * Note that there is prioritisation at work here - we'd - * rather increase the number of RMIDs on the limbo list - * than increase the threshold, because increasing the - * threshold skews the event data (because we reuse - * dirty RMIDs) - threshold bumps are a last resort. - */ - if (nr_available < steal_limit) - goto again; - - __intel_cqm_threshold++; - } - -out: - mutex_unlock(&cache_mutex); - return rotated; -} - -static void intel_cqm_rmid_rotate(struct work_struct *work); - -static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate); - -static struct pmu intel_cqm_pmu; - -static void intel_cqm_rmid_rotate(struct work_struct *work) -{ - unsigned long delay; - - __intel_cqm_rmid_rotate(); - - delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms); - schedule_delayed_work(&intel_cqm_rmid_work, delay); -} - -/* - * Find a group and setup RMID. - * - * If we're part of a group, we use the group's RMID. 
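/*
 * For illustration: the escalation policy of the loop above, in
 * numbers.  Using the 56-RMID / 35MB-LLC example quoted earlier in
 * this file and assuming a 64-byte occupancy scale (illustrative
 * values, not read from CPUID), at most 25% of the RMIDs may sit in
 * limbo before the threshold is bumped, and the threshold is never
 * raised past the per-RMID share of the cache.
 */
#include <stdio.h>

int main(void)
{
        unsigned int max_rmid = 55;                             /* 56 RMIDs */
        unsigned int l3_scale = 64;                             /* bytes/unit */
        unsigned int max_threshold = 35 * 1024 * 1024 / (max_rmid + 1);

        unsigned int steal_limit = (max_rmid + 1) / 4;          /* limbo cap */
        unsigned int threshold_limit = max_threshold / l3_scale;

        printf("steal_limit     = %u RMIDs\n", steal_limit);
        printf("threshold_limit = %u cachelines (%u bytes)\n",
               threshold_limit, threshold_limit * l3_scale);
        return 0;
}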
- */ -static void intel_cqm_setup_event(struct perf_event *event, - struct perf_event **group) -{ - struct perf_event *iter; - bool conflict = false; - u32 rmid; - - list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { - rmid = iter->hw.cqm_rmid; - - if (__match_event(iter, event)) { - /* All tasks in a group share an RMID */ - event->hw.cqm_rmid = rmid; - *group = iter; - return; - } - - /* - * We only care about conflicts for events that are - * actually scheduled in (and hence have a valid RMID). - */ - if (__conflict_event(iter, event) && __rmid_valid(rmid)) - conflict = true; - } - - if (conflict) - rmid = INVALID_RMID; - else - rmid = __get_rmid(); - - event->hw.cqm_rmid = rmid; -} - -static void intel_cqm_event_read(struct perf_event *event) -{ - unsigned long flags; - u32 rmid; - u64 val; - - /* - * Task events are handled by intel_cqm_event_count(). - */ - if (event->cpu == -1) - return; - - raw_spin_lock_irqsave(&cache_lock, flags); - rmid = event->hw.cqm_rmid; - - if (!__rmid_valid(rmid)) - goto out; - - val = __rmid_read(rmid); - - /* - * Ignore this reading on error states and do not update the value. - */ - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - goto out; - - local64_set(&event->count, val); -out: - raw_spin_unlock_irqrestore(&cache_lock, flags); -} - -static void __intel_cqm_event_count(void *info) -{ - struct rmid_read *rr = info; - u64 val; - - val = __rmid_read(rr->rmid); - - if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) - return; - - atomic64_add(val, &rr->value); -} - -static inline bool cqm_group_leader(struct perf_event *event) -{ - return !list_empty(&event->hw.cqm_groups_entry); -} - -static u64 intel_cqm_event_count(struct perf_event *event) -{ - unsigned long flags; - struct rmid_read rr = { - .value = ATOMIC64_INIT(0), - }; - - /* - * We only need to worry about task events. System-wide events - * are handled like usual, i.e. entirely with - * intel_cqm_event_read(). - */ - if (event->cpu != -1) - return __perf_event_count(event); - - /* - * Only the group leader gets to report values. This stops us - * reporting duplicate values to userspace, and gives us a clear - * rule for which task gets to report the values. - * - * Note that it is impossible to attribute these values to - * specific packages - we forfeit that ability when we create - * task events. - */ - if (!cqm_group_leader(event)) - return 0; - - /* - * Getting up-to-date values requires an SMP IPI which is not - * possible if we're being called in interrupt context. Return - * the cached values instead. - */ - if (unlikely(in_interrupt())) - goto out; - - /* - * Notice that we don't perform the reading of an RMID - * atomically, because we can't hold a spin lock across the - * IPIs. - * - * Speculatively perform the read, since @event might be - * assigned a different (possibly invalid) RMID while we're - * busying performing the IPI calls. It's therefore necessary to - * check @event's RMID afterwards, and if it has changed, - * discard the result of the read. 
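/*
 * For illustration: intel_cqm_event_count() above uses a common
 * "read speculatively, validate, then commit" shape, because the
 * cross-CPU reads cannot be issued under the spinlock.  A minimal
 * single-threaded sketch of that shape; the pthread spinlock stands
 * in for cache_lock, do_expensive_read() for the IPI broadcast, and
 * __atomic_load_n() plays the role of ACCESS_ONCE().
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_spinlock_t toy_lock;
static uint32_t event_rmid = 7;         /* may be changed concurrently */
static uint64_t event_count;

static uint64_t do_expensive_read(uint32_t rmid)
{
        return rmid * 1000ULL;          /* pretend per-package occupancy sum */
}

static void update_count(void)
{
        uint32_t rmid = __atomic_load_n(&event_rmid, __ATOMIC_RELAXED);
        uint64_t val = do_expensive_read(rmid);         /* no lock held */

        pthread_spin_lock(&toy_lock);
        if (event_rmid == rmid)         /* still the same RMID? then commit */
                event_count = val;
        pthread_spin_unlock(&toy_lock); /* otherwise the read is discarded */
}

int main(void)
{
        pthread_spin_init(&toy_lock, PTHREAD_PROCESS_PRIVATE);
        update_count();
        printf("count = %llu\n", (unsigned long long)event_count);
        return 0;
}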
- */ - rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid); - - if (!__rmid_valid(rr.rmid)) - goto out; - - on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); - - raw_spin_lock_irqsave(&cache_lock, flags); - if (event->hw.cqm_rmid == rr.rmid) - local64_set(&event->count, atomic64_read(&rr.value)); - raw_spin_unlock_irqrestore(&cache_lock, flags); -out: - return __perf_event_count(event); -} - -static void intel_cqm_event_start(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - u32 rmid = event->hw.cqm_rmid; - - if (!(event->hw.cqm_state & PERF_HES_STOPPED)) - return; - - event->hw.cqm_state &= ~PERF_HES_STOPPED; - - if (state->rmid_usecnt++) { - if (!WARN_ON_ONCE(state->rmid != rmid)) - return; - } else { - WARN_ON_ONCE(state->rmid); - } - - state->rmid = rmid; - wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid); -} - -static void intel_cqm_event_stop(struct perf_event *event, int mode) -{ - struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - - if (event->hw.cqm_state & PERF_HES_STOPPED) - return; - - event->hw.cqm_state |= PERF_HES_STOPPED; - - intel_cqm_event_read(event); - - if (!--state->rmid_usecnt) { - state->rmid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid); - } else { - WARN_ON_ONCE(!state->rmid); - } -} - -static int intel_cqm_event_add(struct perf_event *event, int mode) -{ - unsigned long flags; - u32 rmid; - - raw_spin_lock_irqsave(&cache_lock, flags); - - event->hw.cqm_state = PERF_HES_STOPPED; - rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid) && (mode & PERF_EF_START)) - intel_cqm_event_start(event, mode); - - raw_spin_unlock_irqrestore(&cache_lock, flags); - - return 0; -} - -static void intel_cqm_event_destroy(struct perf_event *event) -{ - struct perf_event *group_other = NULL; - - mutex_lock(&cache_mutex); - - /* - * If there's another event in this group... - */ - if (!list_empty(&event->hw.cqm_group_entry)) { - group_other = list_first_entry(&event->hw.cqm_group_entry, - struct perf_event, - hw.cqm_group_entry); - list_del(&event->hw.cqm_group_entry); - } - - /* - * And we're the group leader.. - */ - if (cqm_group_leader(event)) { - /* - * If there was a group_other, make that leader, otherwise - * destroy the group and return the RMID. 
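/*
 * For illustration: intel_cqm_event_start()/_stop() above keep a
 * per-CPU use count so that MSR_IA32_PQR_ASSOC is only rewritten on
 * the 0 -> 1 and 1 -> 0 transitions.  A toy version with the sanity
 * warnings dropped and the MSR write replaced by printf():
 */
#include <stdio.h>

struct toy_pqr_state {
        unsigned int rmid;
        unsigned int closid;
        int rmid_usecnt;
};

static struct toy_pqr_state state;      /* per CPU in the real code */

static void toy_wrmsr_pqr_assoc(unsigned int rmid, unsigned int closid)
{
        printf("wrmsr(PQR_ASSOC, rmid=%u, closid=%u)\n", rmid, closid);
}

static void toy_start(unsigned int rmid)
{
        if (state.rmid_usecnt++)
                return;                 /* already programmed for this RMID */
        state.rmid = rmid;
        toy_wrmsr_pqr_assoc(rmid, state.closid);
}

static void toy_stop(void)
{
        if (--state.rmid_usecnt)
                return;                 /* another event still needs the RMID */
        state.rmid = 0;
        toy_wrmsr_pqr_assoc(0, state.closid);
}

int main(void)
{
        toy_start(7);   /* writes the MSR */
        toy_start(7);   /* refcount only */
        toy_stop();     /* refcount only */
        toy_stop();     /* writes RMID 0 back */
        return 0;
}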
- */ - if (group_other) { - list_replace(&event->hw.cqm_groups_entry, - &group_other->hw.cqm_groups_entry); - } else { - u32 rmid = event->hw.cqm_rmid; - - if (__rmid_valid(rmid)) - __put_rmid(rmid); - list_del(&event->hw.cqm_groups_entry); - } - } - - mutex_unlock(&cache_mutex); -} - -static int intel_cqm_event_init(struct perf_event *event) -{ - struct perf_event *group = NULL; - bool rotate = false; - - if (event->attr.type != intel_cqm_pmu.type) - return -ENOENT; - - if (event->attr.config & ~QOS_EVENT_MASK) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - INIT_LIST_HEAD(&event->hw.cqm_group_entry); - INIT_LIST_HEAD(&event->hw.cqm_groups_entry); - - event->destroy = intel_cqm_event_destroy; - - mutex_lock(&cache_mutex); - - /* Will also set rmid */ - intel_cqm_setup_event(event, &group); - - if (group) { - list_add_tail(&event->hw.cqm_group_entry, - &group->hw.cqm_group_entry); - } else { - list_add_tail(&event->hw.cqm_groups_entry, - &cache_groups); - - /* - * All RMIDs are either in use or have recently been - * used. Kick the rotation worker to clean/free some. - * - * We only do this for the group leader, rather than for - * every event in a group to save on needless work. - */ - if (!__rmid_valid(event->hw.cqm_rmid)) - rotate = true; - } - - mutex_unlock(&cache_mutex); - - if (rotate) - schedule_delayed_work(&intel_cqm_rmid_work, 0); - - return 0; -} - -EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01"); -EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1"); -EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes"); -EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL); -EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1"); - -static struct attribute *intel_cqm_events_attr[] = { - EVENT_PTR(intel_cqm_llc), - EVENT_PTR(intel_cqm_llc_pkg), - EVENT_PTR(intel_cqm_llc_unit), - EVENT_PTR(intel_cqm_llc_scale), - EVENT_PTR(intel_cqm_llc_snapshot), - NULL, -}; - -static struct attribute_group intel_cqm_events_group = { - .name = "events", - .attrs = intel_cqm_events_attr, -}; - -PMU_FORMAT_ATTR(event, "config:0-7"); -static struct attribute *intel_cqm_formats_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group intel_cqm_format_group = { - .name = "format", - .attrs = intel_cqm_formats_attr, -}; - -static ssize_t -max_recycle_threshold_show(struct device *dev, struct device_attribute *attr, - char *page) -{ - ssize_t rv; - - mutex_lock(&cache_mutex); - rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold); - mutex_unlock(&cache_mutex); - - return rv; -} - -static ssize_t -max_recycle_threshold_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - unsigned int bytes, cachelines; - int ret; - - ret = kstrtouint(buf, 0, &bytes); - if (ret) - return ret; - - mutex_lock(&cache_mutex); - - __intel_cqm_max_threshold = bytes; - cachelines = bytes / cqm_l3_scale; - - /* - * The new maximum takes effect immediately. 
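/*
 * For illustration: the sysfs store handler above accepts a value in
 * bytes, but the internal threshold is kept in cachelines (units of
 * cqm_l3_scale).  A quick standalone walk through the conversion and
 * clamp, with a made-up 64-byte scale and current threshold:
 */
#include <stdio.h>

int main(void)
{
        unsigned int l3_scale = 64;             /* x86_cache_occ_scale */
        unsigned int threshold = 4096;          /* current, in cachelines */

        unsigned int bytes = 131072;            /* user writes 128KB */
        unsigned int cachelines = bytes / l3_scale;

        if (threshold > cachelines)
                threshold = cachelines;         /* new maximum applies at once */

        printf("max = %u cachelines, current threshold = %u\n",
               cachelines, threshold);
        return 0;
}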
- */ - if (__intel_cqm_threshold > cachelines) - __intel_cqm_threshold = cachelines; - - mutex_unlock(&cache_mutex); - - return count; -} - -static DEVICE_ATTR_RW(max_recycle_threshold); - -static struct attribute *intel_cqm_attrs[] = { - &dev_attr_max_recycle_threshold.attr, - NULL, -}; - -static const struct attribute_group intel_cqm_group = { - .attrs = intel_cqm_attrs, -}; - -static const struct attribute_group *intel_cqm_attr_groups[] = { - &intel_cqm_events_group, - &intel_cqm_format_group, - &intel_cqm_group, - NULL, -}; - -static struct pmu intel_cqm_pmu = { - .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME, - .attr_groups = intel_cqm_attr_groups, - .task_ctx_nr = perf_sw_context, - .event_init = intel_cqm_event_init, - .add = intel_cqm_event_add, - .del = intel_cqm_event_stop, - .start = intel_cqm_event_start, - .stop = intel_cqm_event_stop, - .read = intel_cqm_event_read, - .count = intel_cqm_event_count, -}; - -static inline void cqm_pick_event_reader(int cpu) -{ - int phys_id = topology_physical_package_id(cpu); - int i; - - for_each_cpu(i, &cqm_cpumask) { - if (phys_id == topology_physical_package_id(i)) - return; /* already got reader for this socket */ - } - - cpumask_set_cpu(cpu, &cqm_cpumask); -} - -static void intel_cqm_cpu_starting(unsigned int cpu) -{ - struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); - struct cpuinfo_x86 *c = &cpu_data(cpu); - - state->rmid = 0; - state->closid = 0; - state->rmid_usecnt = 0; - - WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); - WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); -} - -static void intel_cqm_cpu_exit(unsigned int cpu) -{ - int phys_id = topology_physical_package_id(cpu); - int i; - - /* - * Is @cpu a designated cqm reader? - */ - if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) - return; - - for_each_online_cpu(i) { - if (i == cpu) - continue; - - if (phys_id == topology_physical_package_id(i)) { - cpumask_set_cpu(i, &cqm_cpumask); - break; - } - } -} - -static int intel_cqm_cpu_notifier(struct notifier_block *nb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - intel_cqm_cpu_exit(cpu); - break; - case CPU_STARTING: - intel_cqm_cpu_starting(cpu); - cqm_pick_event_reader(cpu); - break; - } - - return NOTIFY_OK; -} - -static const struct x86_cpu_id intel_cqm_match[] = { - { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC }, - {} -}; - -static int __init intel_cqm_init(void) -{ - char *str, scale[20]; - int i, cpu, ret; - - if (!x86_match_cpu(intel_cqm_match)) - return -ENODEV; - - cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale; - - /* - * It's possible that not all resources support the same number - * of RMIDs. Instead of making scheduling much more complicated - * (where we have to match a task's RMID to a cpu that supports - * that many RMIDs) just find the minimum RMIDs supported across - * all cpus. - * - * Also, check that the scales match on all cpus. - */ - cpu_notifier_register_begin(); - - for_each_online_cpu(cpu) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - - if (c->x86_cache_max_rmid < cqm_max_rmid) - cqm_max_rmid = c->x86_cache_max_rmid; - - if (c->x86_cache_occ_scale != cqm_l3_scale) { - pr_err("Multiple LLC scale values, disabling\n"); - ret = -EINVAL; - goto out; - } - } - - /* - * A reasonable upper limit on the max threshold is the number - * of lines tagged per RMID if all RMIDs have the same number of - * lines tagged in the LLC. 
- * - * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. - */ - __intel_cqm_max_threshold = - boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1); - - snprintf(scale, sizeof(scale), "%u", cqm_l3_scale); - str = kstrdup(scale, GFP_KERNEL); - if (!str) { - ret = -ENOMEM; - goto out; - } - - event_attr_intel_cqm_llc_scale.event_str = str; - - ret = intel_cqm_setup_rmid_cache(); - if (ret) - goto out; - - for_each_online_cpu(i) { - intel_cqm_cpu_starting(i); - cqm_pick_event_reader(i); - } - - __perf_cpu_notifier(intel_cqm_cpu_notifier); - - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); - if (ret) - pr_err("Intel CQM perf registration failed: %d\n", ret); - else - pr_info("Intel CQM monitoring enabled\n"); - -out: - cpu_notifier_register_done(); - - return ret; -} -device_initcall(intel_cqm_init); -- cgit v0.10.2 From 6aec1ad7365661315e9ab13b17eeb97ab3c38176 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:10 +0100 Subject: perf/x86: Move perf_event_intel_cstate.c ..... => x86/events/intel/cstate.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index f97c283..e0a70fe 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -6,3 +6,4 @@ ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c new file mode 100644 index 0000000..1bbf37e --- /dev/null +++ b/arch/x86/events/intel/cstate.c @@ -0,0 +1,694 @@ +/* + * perf_event_intel_cstate.c: support cstate residency counters + * + * Copyright (C) 2015, Intel Corp. + * Author: Kan Liang (kan.liang@intel.com) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + */ + +/* + * This file export cstate related free running (read-only) counters + * for perf. These counters may be use simultaneously by other tools, + * such as turbostat. However, it still make sense to implement them + * in perf. Because we can conveniently collect them together with + * other events, and allow to use them from tools without special MSR + * access code. + * + * The events only support system-wide mode counting. There is no + * sampling support because it is not supported by the hardware. + * + * According to counters' scope and category, two PMUs are registered + * with the perf_event core subsystem. + * - 'cstate_core': The counter is available for each physical core. + * The counters include CORE_C*_RESIDENCY. + * - 'cstate_pkg': The counter is available for each physical package. + * The counters include PKG_C*_RESIDENCY. 
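/*
 * For illustration: once this driver is registered, the residency
 * counters are ordinary perf events.  A hedged userspace sketch that
 * reads the core C6 residency on CPU 0 via perf_event_open(); the
 * dynamic PMU type comes from sysfs and config 0x02 matches the
 * "c6-residency" event string defined below.  Error handling is
 * minimal and the event is assumed to exist on the running model.
 */
#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct perf_event_attr attr;
        unsigned int type;
        uint64_t count;
        FILE *f;
        int fd;

        f = fopen("/sys/bus/event_source/devices/cstate_core/type", "r");
        if (!f)
                return 1;
        if (fscanf(f, "%u", &type) != 1) {
                fclose(f);
                return 1;
        }
        fclose(f);

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = type;
        attr.config = 0x02;             /* c6-residency */

        /* System-wide counting pinned to CPU 0; these counters cannot sample. */
        fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
        if (fd < 0)
                return 1;

        sleep(1);
        if (read(fd, &count, sizeof(count)) == sizeof(count))
                printf("core C6 residency raw count since open: %llu\n",
                       (unsigned long long)count);
        close(fd);
        return 0;
}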
+ * + * All of these counters are specified in the Intel® 64 and IA-32 + * Architectures Software Developer.s Manual Vol3b. + * + * Model specific counters: + * MSR_CORE_C1_RES: CORE C1 Residency Counter + * perf code: 0x00 + * Available model: SLM,AMT + * Scope: Core (each processor core has a MSR) + * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter + * perf code: 0x01 + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Core + * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter + * perf code: 0x02 + * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Core + * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter + * perf code: 0x03 + * Available model: SNB,IVB,HSW,BDW,SKL + * Scope: Core + * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. + * perf code: 0x00 + * Available model: SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. + * perf code: 0x01 + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. + * perf code: 0x02 + * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. + * perf code: 0x03 + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL + * Scope: Package (physical package) + * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. + * perf code: 0x04 + * Available model: HSW ULT only + * Scope: Package (physical package) + * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. + * perf code: 0x05 + * Available model: HSW ULT only + * Scope: Package (physical package) + * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. + * perf code: 0x06 + * Available model: HSW ULT only + * Scope: Package (physical package) + * + */ + +#include +#include +#include +#include +#include "../../kernel/cpu/perf_event.h" + +#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __cstate_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __cstate_##_var##_show, NULL) + +static ssize_t cstate_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, + char *buf); + +struct perf_cstate_msr { + u64 msr; + struct perf_pmu_events_attr *attr; + bool (*test)(int idx); +}; + + +/* cstate_core PMU */ + +static struct pmu cstate_core_pmu; +static bool has_cstate_core; + +enum perf_cstate_core_id { + /* + * cstate_core events + */ + PERF_CSTATE_CORE_C1_RES = 0, + PERF_CSTATE_CORE_C3_RES, + PERF_CSTATE_CORE_C6_RES, + PERF_CSTATE_CORE_C7_RES, + + PERF_CSTATE_CORE_EVENT_MAX, +}; + +bool test_core(int idx) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return false; + + switch (boot_cpu_data.x86_model) { + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + if (idx == PERF_CSTATE_CORE_C3_RES || + idx == PERF_CSTATE_CORE_C6_RES) + return true; + break; + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 69: /* 22nm Haswell ULT */ + case 70: /* 22nm Haswell + GT3e 
(Intel Iris Pro graphics) */ + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server */ + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + if (idx == PERF_CSTATE_CORE_C3_RES || + idx == PERF_CSTATE_CORE_C6_RES || + idx == PERF_CSTATE_CORE_C7_RES) + return true; + break; + case 55: /* 22nm Atom "Silvermont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + case 76: /* 14nm Atom "Airmont" */ + if (idx == PERF_CSTATE_CORE_C1_RES || + idx == PERF_CSTATE_CORE_C6_RES) + return true; + break; + } + + return false; +} + +PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00"); +PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01"); +PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02"); +PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03"); + +static struct perf_cstate_msr core_msr[] = { + [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1, test_core, }, + [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3, test_core, }, + [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6, test_core, }, + [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7, test_core, }, +}; + +static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = { + NULL, +}; + +static struct attribute_group core_events_attr_group = { + .name = "events", + .attrs = core_events_attrs, +}; + +DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63"); +static struct attribute *core_format_attrs[] = { + &format_attr_core_event.attr, + NULL, +}; + +static struct attribute_group core_format_attr_group = { + .name = "format", + .attrs = core_format_attrs, +}; + +static cpumask_t cstate_core_cpu_mask; +static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL); + +static struct attribute *cstate_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group cpumask_attr_group = { + .attrs = cstate_cpumask_attrs, +}; + +static const struct attribute_group *core_attr_groups[] = { + &core_events_attr_group, + &core_format_attr_group, + &cpumask_attr_group, + NULL, +}; + +/* cstate_core PMU end */ + + +/* cstate_pkg PMU */ + +static struct pmu cstate_pkg_pmu; +static bool has_cstate_pkg; + +enum perf_cstate_pkg_id { + /* + * cstate_pkg events + */ + PERF_CSTATE_PKG_C2_RES = 0, + PERF_CSTATE_PKG_C3_RES, + PERF_CSTATE_PKG_C6_RES, + PERF_CSTATE_PKG_C7_RES, + PERF_CSTATE_PKG_C8_RES, + PERF_CSTATE_PKG_C9_RES, + PERF_CSTATE_PKG_C10_RES, + + PERF_CSTATE_PKG_EVENT_MAX, +}; + +bool test_pkg(int idx) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return false; + + switch (boot_cpu_data.x86_model) { + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + if (idx == PERF_CSTATE_CORE_C3_RES || + idx == PERF_CSTATE_CORE_C6_RES || + idx == PERF_CSTATE_CORE_C7_RES) + return true; + break; + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + + case 61: /* 14nm Broadwell Core-M */ 
+ case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 14nm Broadwell Server */ + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + if (idx == PERF_CSTATE_PKG_C2_RES || + idx == PERF_CSTATE_PKG_C3_RES || + idx == PERF_CSTATE_PKG_C6_RES || + idx == PERF_CSTATE_PKG_C7_RES) + return true; + break; + case 55: /* 22nm Atom "Silvermont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + case 76: /* 14nm Atom "Airmont" */ + if (idx == PERF_CSTATE_CORE_C6_RES) + return true; + break; + case 69: /* 22nm Haswell ULT */ + if (idx == PERF_CSTATE_PKG_C2_RES || + idx == PERF_CSTATE_PKG_C3_RES || + idx == PERF_CSTATE_PKG_C6_RES || + idx == PERF_CSTATE_PKG_C7_RES || + idx == PERF_CSTATE_PKG_C8_RES || + idx == PERF_CSTATE_PKG_C9_RES || + idx == PERF_CSTATE_PKG_C10_RES) + return true; + break; + } + + return false; +} + +PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00"); +PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01"); +PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02"); +PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03"); +PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04"); +PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05"); +PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06"); + +static struct perf_cstate_msr pkg_msr[] = { + [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2, test_pkg, }, + [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3, test_pkg, }, + [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6, test_pkg, }, + [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7, test_pkg, }, + [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8, test_pkg, }, + [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9, test_pkg, }, + [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10, test_pkg, }, +}; + +static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = { + NULL, +}; + +static struct attribute_group pkg_events_attr_group = { + .name = "events", + .attrs = pkg_events_attrs, +}; + +DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63"); +static struct attribute *pkg_format_attrs[] = { + &format_attr_pkg_event.attr, + NULL, +}; +static struct attribute_group pkg_format_attr_group = { + .name = "format", + .attrs = pkg_format_attrs, +}; + +static cpumask_t cstate_pkg_cpu_mask; + +static const struct attribute_group *pkg_attr_groups[] = { + &pkg_events_attr_group, + &pkg_format_attr_group, + &cpumask_attr_group, + NULL, +}; + +/* cstate_pkg PMU end*/ + +static ssize_t cstate_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + if (pmu == &cstate_core_pmu) + return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask); + else if (pmu == &cstate_pkg_pmu) + return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask); + else + return 0; +} + +static int cstate_pmu_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config; + int ret = 0; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + 
event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + if (event->pmu == &cstate_core_pmu) { + if (cfg >= PERF_CSTATE_CORE_EVENT_MAX) + return -EINVAL; + if (!core_msr[cfg].attr) + return -EINVAL; + event->hw.event_base = core_msr[cfg].msr; + } else if (event->pmu == &cstate_pkg_pmu) { + if (cfg >= PERF_CSTATE_PKG_EVENT_MAX) + return -EINVAL; + if (!pkg_msr[cfg].attr) + return -EINVAL; + event->hw.event_base = pkg_msr[cfg].msr; + } else + return -ENOENT; + + /* must be done before validate_group */ + event->hw.config = cfg; + event->hw.idx = -1; + + return ret; +} + +static inline u64 cstate_pmu_read_counter(struct perf_event *event) +{ + u64 val; + + rdmsrl(event->hw.event_base, val); + return val; +} + +static void cstate_pmu_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev_raw_count, new_raw_count; + +again: + prev_raw_count = local64_read(&hwc->prev_count); + new_raw_count = cstate_pmu_read_counter(event); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + local64_add(new_raw_count - prev_raw_count, &event->count); +} + +static void cstate_pmu_event_start(struct perf_event *event, int mode) +{ + local64_set(&event->hw.prev_count, cstate_pmu_read_counter(event)); +} + +static void cstate_pmu_event_stop(struct perf_event *event, int mode) +{ + cstate_pmu_event_update(event); +} + +static void cstate_pmu_event_del(struct perf_event *event, int mode) +{ + cstate_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int cstate_pmu_event_add(struct perf_event *event, int mode) +{ + if (mode & PERF_EF_START) + cstate_pmu_event_start(event, mode); + + return 0; +} + +static void cstate_cpu_exit(int cpu) +{ + int i, id, target; + + /* cpu exit for cstate core */ + if (has_cstate_core) { + id = topology_core_id(cpu); + target = -1; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (id == topology_core_id(i)) { + target = i; + break; + } + } + if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &cstate_core_cpu_mask); + WARN_ON(cpumask_empty(&cstate_core_cpu_mask)); + if (target >= 0) + perf_pmu_migrate_context(&cstate_core_pmu, cpu, target); + } + + /* cpu exit for cstate pkg */ + if (has_cstate_pkg) { + id = topology_physical_package_id(cpu); + target = -1; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (id == topology_physical_package_id(i)) { + target = i; + break; + } + } + if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &cstate_pkg_cpu_mask); + WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask)); + if (target >= 0) + perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target); + } +} + +static void cstate_cpu_init(int cpu) +{ + int i, id; + + /* cpu init for cstate core */ + if (has_cstate_core) { + id = topology_core_id(cpu); + for_each_cpu(i, &cstate_core_cpu_mask) { + if (id == topology_core_id(i)) + break; + } + if (i >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_core_cpu_mask); + } + + /* cpu init for cstate pkg */ + if (has_cstate_pkg) { + id = topology_physical_package_id(cpu); + for_each_cpu(i, &cstate_pkg_cpu_mask) { + if (id == topology_physical_package_id(i)) + break; + } + if (i >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); + } +} + +static int cstate_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { 
+ case CPU_UP_PREPARE: + break; + case CPU_STARTING: + cstate_cpu_init(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + break; + case CPU_ONLINE: + case CPU_DEAD: + break; + case CPU_DOWN_PREPARE: + cstate_cpu_exit(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +/* + * Probe the cstate events and insert the available one into sysfs attrs + * Return false if there is no available events. + */ +static bool cstate_probe_msr(struct perf_cstate_msr *msr, + struct attribute **events_attrs, + int max_event_nr) +{ + int i, j = 0; + u64 val; + + /* Probe the cstate events. */ + for (i = 0; i < max_event_nr; i++) { + if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) + msr[i].attr = NULL; + } + + /* List remaining events in the sysfs attrs. */ + for (i = 0; i < max_event_nr; i++) { + if (msr[i].attr) + events_attrs[j++] = &msr[i].attr->attr.attr; + } + events_attrs[j] = NULL; + + return (j > 0) ? true : false; +} + +static int __init cstate_init(void) +{ + /* SLM has different MSR for PKG C6 */ + switch (boot_cpu_data.x86_model) { + case 55: + case 76: + case 77: + pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY; + } + + if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX)) + has_cstate_core = true; + + if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX)) + has_cstate_pkg = true; + + return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV; +} + +static void __init cstate_cpumask_init(void) +{ + int cpu; + + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) + cstate_cpu_init(cpu); + + __perf_cpu_notifier(cstate_cpu_notifier); + + cpu_notifier_register_done(); +} + +static struct pmu cstate_core_pmu = { + .attr_groups = core_attr_groups, + .name = "cstate_core", + .task_ctx_nr = perf_invalid_context, + .event_init = cstate_pmu_event_init, + .add = cstate_pmu_event_add, /* must have */ + .del = cstate_pmu_event_del, /* must have */ + .start = cstate_pmu_event_start, + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static struct pmu cstate_pkg_pmu = { + .attr_groups = pkg_attr_groups, + .name = "cstate_pkg", + .task_ctx_nr = perf_invalid_context, + .event_init = cstate_pmu_event_init, + .add = cstate_pmu_event_add, /* must have */ + .del = cstate_pmu_event_del, /* must have */ + .start = cstate_pmu_event_start, + .stop = cstate_pmu_event_stop, + .read = cstate_pmu_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static void __init cstate_pmus_register(void) +{ + int err; + + if (has_cstate_core) { + err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1); + if (WARN_ON(err)) + pr_info("Failed to register PMU %s error %d\n", + cstate_core_pmu.name, err); + } + + if (has_cstate_pkg) { + err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1); + if (WARN_ON(err)) + pr_info("Failed to register PMU %s error %d\n", + cstate_pkg_pmu.name, err); + } +} + +static int __init cstate_pmu_init(void) +{ + int err; + + if (cpu_has_hypervisor) + return -ENODEV; + + err = cstate_init(); + if (err) + return err; + + cstate_cpumask_init(); + + cstate_pmus_register(); + + return 0; +} + +device_initcall(cstate_pmu_init); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 64781cf..1bc881b 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -35,7 +35,6 @@ obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) 
+= perf_event_intel_lbr.o perf_event_intel_ds.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_cstate.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ perf_event_intel_uncore_snb.o \ diff --git a/arch/x86/kernel/cpu/perf_event_intel_cstate.c b/arch/x86/kernel/cpu/perf_event_intel_cstate.c deleted file mode 100644 index 75a38b5..0000000 --- a/arch/x86/kernel/cpu/perf_event_intel_cstate.c +++ /dev/null @@ -1,694 +0,0 @@ -/* - * perf_event_intel_cstate.c: support cstate residency counters - * - * Copyright (C) 2015, Intel Corp. - * Author: Kan Liang (kan.liang@intel.com) - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - */ - -/* - * This file export cstate related free running (read-only) counters - * for perf. These counters may be use simultaneously by other tools, - * such as turbostat. However, it still make sense to implement them - * in perf. Because we can conveniently collect them together with - * other events, and allow to use them from tools without special MSR - * access code. - * - * The events only support system-wide mode counting. There is no - * sampling support because it is not supported by the hardware. - * - * According to counters' scope and category, two PMUs are registered - * with the perf_event core subsystem. - * - 'cstate_core': The counter is available for each physical core. - * The counters include CORE_C*_RESIDENCY. - * - 'cstate_pkg': The counter is available for each physical package. - * The counters include PKG_C*_RESIDENCY. - * - * All of these counters are specified in the Intel® 64 and IA-32 - * Architectures Software Developer.s Manual Vol3b. - * - * Model specific counters: - * MSR_CORE_C1_RES: CORE C1 Residency Counter - * perf code: 0x00 - * Available model: SLM,AMT - * Scope: Core (each processor core has a MSR) - * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter - * perf code: 0x01 - * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL - * Scope: Core - * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter - * perf code: 0x02 - * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL - * Scope: Core - * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter - * perf code: 0x03 - * Available model: SNB,IVB,HSW,BDW,SKL - * Scope: Core - * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. - * perf code: 0x00 - * Available model: SNB,IVB,HSW,BDW,SKL - * Scope: Package (physical package) - * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. - * perf code: 0x01 - * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL - * Scope: Package (physical package) - * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. - * perf code: 0x02 - * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,SKL - * Scope: Package (physical package) - * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. - * perf code: 0x03 - * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL - * Scope: Package (physical package) - * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. 
- * perf code: 0x04 - * Available model: HSW ULT only - * Scope: Package (physical package) - * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. - * perf code: 0x05 - * Available model: HSW ULT only - * Scope: Package (physical package) - * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. - * perf code: 0x06 - * Available model: HSW ULT only - * Scope: Package (physical package) - * - */ - -#include -#include -#include -#include -#include "perf_event.h" - -#define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ -static ssize_t __cstate_##_var##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, \ - char *page) \ -{ \ - BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ - return sprintf(page, _format "\n"); \ -} \ -static struct kobj_attribute format_attr_##_var = \ - __ATTR(_name, 0444, __cstate_##_var##_show, NULL) - -static ssize_t cstate_get_attr_cpumask(struct device *dev, - struct device_attribute *attr, - char *buf); - -struct perf_cstate_msr { - u64 msr; - struct perf_pmu_events_attr *attr; - bool (*test)(int idx); -}; - - -/* cstate_core PMU */ - -static struct pmu cstate_core_pmu; -static bool has_cstate_core; - -enum perf_cstate_core_id { - /* - * cstate_core events - */ - PERF_CSTATE_CORE_C1_RES = 0, - PERF_CSTATE_CORE_C3_RES, - PERF_CSTATE_CORE_C6_RES, - PERF_CSTATE_CORE_C7_RES, - - PERF_CSTATE_CORE_EVENT_MAX, -}; - -bool test_core(int idx) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 != 6) - return false; - - switch (boot_cpu_data.x86_model) { - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 69: /* 22nm Haswell ULT */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES || - idx == PERF_CSTATE_CORE_C7_RES) - return true; - break; - case 55: /* 22nm Atom "Silvermont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - case 76: /* 14nm Atom "Airmont" */ - if (idx == PERF_CSTATE_CORE_C1_RES || - idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - } - - return false; -} - -PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00"); -PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01"); -PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02"); -PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03"); - -static struct perf_cstate_msr core_msr[] = { - [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1, test_core, }, - [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3, test_core, }, - [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6, test_core, }, - [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7, test_core, }, -}; - -static struct attribute 
*core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = { - NULL, -}; - -static struct attribute_group core_events_attr_group = { - .name = "events", - .attrs = core_events_attrs, -}; - -DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63"); -static struct attribute *core_format_attrs[] = { - &format_attr_core_event.attr, - NULL, -}; - -static struct attribute_group core_format_attr_group = { - .name = "format", - .attrs = core_format_attrs, -}; - -static cpumask_t cstate_core_cpu_mask; -static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL); - -static struct attribute *cstate_cpumask_attrs[] = { - &dev_attr_cpumask.attr, - NULL, -}; - -static struct attribute_group cpumask_attr_group = { - .attrs = cstate_cpumask_attrs, -}; - -static const struct attribute_group *core_attr_groups[] = { - &core_events_attr_group, - &core_format_attr_group, - &cpumask_attr_group, - NULL, -}; - -/* cstate_core PMU end */ - - -/* cstate_pkg PMU */ - -static struct pmu cstate_pkg_pmu; -static bool has_cstate_pkg; - -enum perf_cstate_pkg_id { - /* - * cstate_pkg events - */ - PERF_CSTATE_PKG_C2_RES = 0, - PERF_CSTATE_PKG_C3_RES, - PERF_CSTATE_PKG_C6_RES, - PERF_CSTATE_PKG_C7_RES, - PERF_CSTATE_PKG_C8_RES, - PERF_CSTATE_PKG_C9_RES, - PERF_CSTATE_PKG_C10_RES, - - PERF_CSTATE_PKG_EVENT_MAX, -}; - -bool test_pkg(int idx) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 != 6) - return false; - - switch (boot_cpu_data.x86_model) { - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - if (idx == PERF_CSTATE_CORE_C3_RES || - idx == PERF_CSTATE_CORE_C6_RES || - idx == PERF_CSTATE_CORE_C7_RES) - return true; - break; - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - if (idx == PERF_CSTATE_PKG_C2_RES || - idx == PERF_CSTATE_PKG_C3_RES || - idx == PERF_CSTATE_PKG_C6_RES || - idx == PERF_CSTATE_PKG_C7_RES) - return true; - break; - case 55: /* 22nm Atom "Silvermont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - case 76: /* 14nm Atom "Airmont" */ - if (idx == PERF_CSTATE_CORE_C6_RES) - return true; - break; - case 69: /* 22nm Haswell ULT */ - if (idx == PERF_CSTATE_PKG_C2_RES || - idx == PERF_CSTATE_PKG_C3_RES || - idx == PERF_CSTATE_PKG_C6_RES || - idx == PERF_CSTATE_PKG_C7_RES || - idx == PERF_CSTATE_PKG_C8_RES || - idx == PERF_CSTATE_PKG_C9_RES || - idx == PERF_CSTATE_PKG_C10_RES) - return true; - break; - } - - return false; -} - -PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00"); -PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01"); -PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02"); -PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03"); -PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04"); -PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05"); -PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, 
"event=0x06"); - -static struct perf_cstate_msr pkg_msr[] = { - [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2, test_pkg, }, - [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3, test_pkg, }, - [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6, test_pkg, }, - [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7, test_pkg, }, - [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8, test_pkg, }, - [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9, test_pkg, }, - [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10, test_pkg, }, -}; - -static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = { - NULL, -}; - -static struct attribute_group pkg_events_attr_group = { - .name = "events", - .attrs = pkg_events_attrs, -}; - -DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63"); -static struct attribute *pkg_format_attrs[] = { - &format_attr_pkg_event.attr, - NULL, -}; -static struct attribute_group pkg_format_attr_group = { - .name = "format", - .attrs = pkg_format_attrs, -}; - -static cpumask_t cstate_pkg_cpu_mask; - -static const struct attribute_group *pkg_attr_groups[] = { - &pkg_events_attr_group, - &pkg_format_attr_group, - &cpumask_attr_group, - NULL, -}; - -/* cstate_pkg PMU end*/ - -static ssize_t cstate_get_attr_cpumask(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct pmu *pmu = dev_get_drvdata(dev); - - if (pmu == &cstate_core_pmu) - return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask); - else if (pmu == &cstate_pkg_pmu) - return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask); - else - return 0; -} - -static int cstate_pmu_event_init(struct perf_event *event) -{ - u64 cfg = event->attr.config; - int ret = 0; - - if (event->attr.type != event->pmu->type) - return -ENOENT; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - if (event->pmu == &cstate_core_pmu) { - if (cfg >= PERF_CSTATE_CORE_EVENT_MAX) - return -EINVAL; - if (!core_msr[cfg].attr) - return -EINVAL; - event->hw.event_base = core_msr[cfg].msr; - } else if (event->pmu == &cstate_pkg_pmu) { - if (cfg >= PERF_CSTATE_PKG_EVENT_MAX) - return -EINVAL; - if (!pkg_msr[cfg].attr) - return -EINVAL; - event->hw.event_base = pkg_msr[cfg].msr; - } else - return -ENOENT; - - /* must be done before validate_group */ - event->hw.config = cfg; - event->hw.idx = -1; - - return ret; -} - -static inline u64 cstate_pmu_read_counter(struct perf_event *event) -{ - u64 val; - - rdmsrl(event->hw.event_base, val); - return val; -} - -static void cstate_pmu_event_update(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 prev_raw_count, new_raw_count; - -again: - prev_raw_count = local64_read(&hwc->prev_count); - new_raw_count = cstate_pmu_read_counter(event); - - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) - goto again; - - local64_add(new_raw_count - prev_raw_count, &event->count); -} - -static void cstate_pmu_event_start(struct perf_event *event, int mode) -{ - local64_set(&event->hw.prev_count, cstate_pmu_read_counter(event)); -} - -static void cstate_pmu_event_stop(struct perf_event *event, int mode) -{ - 
cstate_pmu_event_update(event); -} - -static void cstate_pmu_event_del(struct perf_event *event, int mode) -{ - cstate_pmu_event_stop(event, PERF_EF_UPDATE); -} - -static int cstate_pmu_event_add(struct perf_event *event, int mode) -{ - if (mode & PERF_EF_START) - cstate_pmu_event_start(event, mode); - - return 0; -} - -static void cstate_cpu_exit(int cpu) -{ - int i, id, target; - - /* cpu exit for cstate core */ - if (has_cstate_core) { - id = topology_core_id(cpu); - target = -1; - - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (id == topology_core_id(i)) { - target = i; - break; - } - } - if (cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask) && target >= 0) - cpumask_set_cpu(target, &cstate_core_cpu_mask); - WARN_ON(cpumask_empty(&cstate_core_cpu_mask)); - if (target >= 0) - perf_pmu_migrate_context(&cstate_core_pmu, cpu, target); - } - - /* cpu exit for cstate pkg */ - if (has_cstate_pkg) { - id = topology_physical_package_id(cpu); - target = -1; - - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (id == topology_physical_package_id(i)) { - target = i; - break; - } - } - if (cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask) && target >= 0) - cpumask_set_cpu(target, &cstate_pkg_cpu_mask); - WARN_ON(cpumask_empty(&cstate_pkg_cpu_mask)); - if (target >= 0) - perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target); - } -} - -static void cstate_cpu_init(int cpu) -{ - int i, id; - - /* cpu init for cstate core */ - if (has_cstate_core) { - id = topology_core_id(cpu); - for_each_cpu(i, &cstate_core_cpu_mask) { - if (id == topology_core_id(i)) - break; - } - if (i >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cstate_core_cpu_mask); - } - - /* cpu init for cstate pkg */ - if (has_cstate_pkg) { - id = topology_physical_package_id(cpu); - for_each_cpu(i, &cstate_pkg_cpu_mask) { - if (id == topology_physical_package_id(i)) - break; - } - if (i >= nr_cpu_ids) - cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); - } -} - -static int cstate_cpu_notifier(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - break; - case CPU_STARTING: - cstate_cpu_init(cpu); - break; - case CPU_UP_CANCELED: - case CPU_DYING: - break; - case CPU_ONLINE: - case CPU_DEAD: - break; - case CPU_DOWN_PREPARE: - cstate_cpu_exit(cpu); - break; - default: - break; - } - - return NOTIFY_OK; -} - -/* - * Probe the cstate events and insert the available one into sysfs attrs - * Return false if there is no available events. - */ -static bool cstate_probe_msr(struct perf_cstate_msr *msr, - struct attribute **events_attrs, - int max_event_nr) -{ - int i, j = 0; - u64 val; - - /* Probe the cstate events. */ - for (i = 0; i < max_event_nr; i++) { - if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) - msr[i].attr = NULL; - } - - /* List remaining events in the sysfs attrs. */ - for (i = 0; i < max_event_nr; i++) { - if (msr[i].attr) - events_attrs[j++] = &msr[i].attr->attr.attr; - } - events_attrs[j] = NULL; - - return (j > 0) ? 
true : false; -} - -static int __init cstate_init(void) -{ - /* SLM has different MSR for PKG C6 */ - switch (boot_cpu_data.x86_model) { - case 55: - case 76: - case 77: - pkg_msr[PERF_CSTATE_PKG_C6_RES].msr = MSR_PKG_C7_RESIDENCY; - } - - if (cstate_probe_msr(core_msr, core_events_attrs, PERF_CSTATE_CORE_EVENT_MAX)) - has_cstate_core = true; - - if (cstate_probe_msr(pkg_msr, pkg_events_attrs, PERF_CSTATE_PKG_EVENT_MAX)) - has_cstate_pkg = true; - - return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV; -} - -static void __init cstate_cpumask_init(void) -{ - int cpu; - - cpu_notifier_register_begin(); - - for_each_online_cpu(cpu) - cstate_cpu_init(cpu); - - __perf_cpu_notifier(cstate_cpu_notifier); - - cpu_notifier_register_done(); -} - -static struct pmu cstate_core_pmu = { - .attr_groups = core_attr_groups, - .name = "cstate_core", - .task_ctx_nr = perf_invalid_context, - .event_init = cstate_pmu_event_init, - .add = cstate_pmu_event_add, /* must have */ - .del = cstate_pmu_event_del, /* must have */ - .start = cstate_pmu_event_start, - .stop = cstate_pmu_event_stop, - .read = cstate_pmu_event_update, - .capabilities = PERF_PMU_CAP_NO_INTERRUPT, -}; - -static struct pmu cstate_pkg_pmu = { - .attr_groups = pkg_attr_groups, - .name = "cstate_pkg", - .task_ctx_nr = perf_invalid_context, - .event_init = cstate_pmu_event_init, - .add = cstate_pmu_event_add, /* must have */ - .del = cstate_pmu_event_del, /* must have */ - .start = cstate_pmu_event_start, - .stop = cstate_pmu_event_stop, - .read = cstate_pmu_event_update, - .capabilities = PERF_PMU_CAP_NO_INTERRUPT, -}; - -static void __init cstate_pmus_register(void) -{ - int err; - - if (has_cstate_core) { - err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1); - if (WARN_ON(err)) - pr_info("Failed to register PMU %s error %d\n", - cstate_core_pmu.name, err); - } - - if (has_cstate_pkg) { - err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1); - if (WARN_ON(err)) - pr_info("Failed to register PMU %s error %d\n", - cstate_pkg_pmu.name, err); - } -} - -static int __init cstate_pmu_init(void) -{ - int err; - - if (cpu_has_hypervisor) - return -ENODEV; - - err = cstate_init(); - if (err) - return err; - - cstate_cpumask_init(); - - cstate_pmus_register(); - - return 0; -} - -device_initcall(cstate_pmu_init); -- cgit v0.10.2 From 7010d129137aa44a79a6a1911881e07e1cd5da60 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:11 +0100 Subject: perf/x86: Move perf_event_intel_ds.c ......... 
=> x86/events/intel/ds.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index e0a70fe..c4d41b6 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -6,4 +6,4 @@ ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c new file mode 100644 index 0000000..9677207 --- /dev/null +++ b/arch/x86/events/intel/ds.c @@ -0,0 +1,1368 @@ +#include +#include +#include + +#include +#include + +#include "../../kernel/cpu/perf_event.h" + +/* The size of a BTS record in bytes: */ +#define BTS_RECORD_SIZE 24 + +#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) +#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) +#define PEBS_FIXUP_SIZE PAGE_SIZE + +/* + * pebs_record_32 for p4 and core not supported + +struct pebs_record_32 { + u32 flags, ip; + u32 ax, bc, cx, dx; + u32 si, di, bp, sp; +}; + + */ + +union intel_x86_pebs_dse { + u64 val; + struct { + unsigned int ld_dse:4; + unsigned int ld_stlb_miss:1; + unsigned int ld_locked:1; + unsigned int ld_reserved:26; + }; + struct { + unsigned int st_l1d_hit:1; + unsigned int st_reserved1:3; + unsigned int st_stlb_miss:1; + unsigned int st_locked:1; + unsigned int st_reserved2:26; + }; +}; + + +/* + * Map PEBS Load Latency Data Source encodings to generic + * memory data source information + */ +#define P(a, b) PERF_MEM_S(a, b) +#define OP_LH (P(OP, LOAD) | P(LVL, HIT)) +#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) + +static const u64 pebs_data_source[] = { + P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ + OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ + OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ + OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ + OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ + OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ + OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ + OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ + OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ + OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ + OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ + OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ + OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */ + OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */ + OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */ + OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ +}; + +static u64 precise_store_data(u64 status) +{ + union intel_x86_pebs_dse dse; + u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2); + + dse.val = status; + + /* + * bit 4: TLB access + * 1 = stored missed 2nd level TLB + * + * so it either hit the walker or the OS + * otherwise hit 2nd level TLB + */ + if (dse.st_stlb_miss) + val |= P(TLB, MISS); + else + val |= P(TLB, HIT); + + /* + * bit 0: hit L1 data cache + * if not set, then all we know is that + * it missed L1D + */ + 
if (dse.st_l1d_hit) + val |= P(LVL, HIT); + else + val |= P(LVL, MISS); + + /* + * bit 5: Locked prefix + */ + if (dse.st_locked) + val |= P(LOCK, LOCKED); + + return val; +} + +static u64 precise_datala_hsw(struct perf_event *event, u64 status) +{ + union perf_mem_data_src dse; + + dse.val = PERF_MEM_NA; + + if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) + dse.mem_op = PERF_MEM_OP_STORE; + else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW) + dse.mem_op = PERF_MEM_OP_LOAD; + + /* + * L1 info only valid for following events: + * + * MEM_UOPS_RETIRED.STLB_MISS_STORES + * MEM_UOPS_RETIRED.LOCK_STORES + * MEM_UOPS_RETIRED.SPLIT_STORES + * MEM_UOPS_RETIRED.ALL_STORES + */ + if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) { + if (status & 1) + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + else + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; + } + return dse.val; +} + +static u64 load_latency_data(u64 status) +{ + union intel_x86_pebs_dse dse; + u64 val; + int model = boot_cpu_data.x86_model; + int fam = boot_cpu_data.x86; + + dse.val = status; + + /* + * use the mapping table for bit 0-3 + */ + val = pebs_data_source[dse.ld_dse]; + + /* + * Nehalem models do not support TLB, Lock infos + */ + if (fam == 0x6 && (model == 26 || model == 30 + || model == 31 || model == 46)) { + val |= P(TLB, NA) | P(LOCK, NA); + return val; + } + /* + * bit 4: TLB access + * 0 = did not miss 2nd level TLB + * 1 = missed 2nd level TLB + */ + if (dse.ld_stlb_miss) + val |= P(TLB, MISS) | P(TLB, L2); + else + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + + /* + * bit 5: locked prefix + */ + if (dse.ld_locked) + val |= P(LOCK, LOCKED); + + return val; +} + +struct pebs_record_core { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; +}; + +struct pebs_record_nhm { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; + u64 status, dla, dse, lat; +}; + +/* + * Same as pebs_record_nhm, with two additional fields. 
+ */ +struct pebs_record_hsw { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; + u64 status, dla, dse, lat; + u64 real_ip, tsx_tuning; +}; + +union hsw_tsx_tuning { + struct { + u32 cycles_last_block : 32, + hle_abort : 1, + rtm_abort : 1, + instruction_abort : 1, + non_instruction_abort : 1, + retry : 1, + data_conflict : 1, + capacity_writes : 1, + capacity_reads : 1; + }; + u64 value; +}; + +#define PEBS_HSW_TSX_FLAGS 0xff00000000ULL + +/* Same as HSW, plus TSC */ + +struct pebs_record_skl { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; + u64 status, dla, dse, lat; + u64 real_ip, tsx_tuning; + u64 tsc; +}; + +void init_debug_store_on_cpu(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, + (u32)((u64)(unsigned long)ds), + (u32)((u64)(unsigned long)ds >> 32)); +} + +void fini_debug_store_on_cpu(int cpu) +{ + if (!per_cpu(cpu_hw_events, cpu).ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); +} + +static DEFINE_PER_CPU(void *, insn_buffer); + +static int alloc_pebs_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + int node = cpu_to_node(cpu); + int max; + void *buffer, *ibuffer; + + if (!x86_pmu.pebs) + return 0; + + buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node); + if (unlikely(!buffer)) + return -ENOMEM; + + /* + * HSW+ already provides us the eventing ip; no need to allocate this + * buffer then. + */ + if (x86_pmu.intel_cap.pebs_format < 2) { + ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); + if (!ibuffer) { + kfree(buffer); + return -ENOMEM; + } + per_cpu(insn_buffer, cpu) = ibuffer; + } + + max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; + + ds->pebs_buffer_base = (u64)(unsigned long)buffer; + ds->pebs_index = ds->pebs_buffer_base; + ds->pebs_absolute_maximum = ds->pebs_buffer_base + + max * x86_pmu.pebs_record_size; + + return 0; +} + +static void release_pebs_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds || !x86_pmu.pebs) + return; + + kfree(per_cpu(insn_buffer, cpu)); + per_cpu(insn_buffer, cpu) = NULL; + + kfree((void *)(unsigned long)ds->pebs_buffer_base); + ds->pebs_buffer_base = 0; +} + +static int alloc_bts_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + int node = cpu_to_node(cpu); + int max, thresh; + void *buffer; + + if (!x86_pmu.bts) + return 0; + + buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); + if (unlikely(!buffer)) { + WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); + return -ENOMEM; + } + + max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; + thresh = max / 16; + + ds->bts_buffer_base = (u64)(unsigned long)buffer; + ds->bts_index = ds->bts_buffer_base; + ds->bts_absolute_maximum = ds->bts_buffer_base + + max * BTS_RECORD_SIZE; + ds->bts_interrupt_threshold = ds->bts_absolute_maximum - + thresh * BTS_RECORD_SIZE; + + return 0; +} + +static void release_bts_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds || !x86_pmu.bts) + return; + + kfree((void *)(unsigned long)ds->bts_buffer_base); + ds->bts_buffer_base = 0; +} + +static int alloc_ds_buffer(int cpu) +{ + int node = cpu_to_node(cpu); + struct debug_store *ds; + + ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); + if (unlikely(!ds)) + return -ENOMEM; + + per_cpu(cpu_hw_events, cpu).ds = ds; + + 
return 0; +} + +static void release_ds_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + return; + + per_cpu(cpu_hw_events, cpu).ds = NULL; + kfree(ds); +} + +void release_ds_buffers(void) +{ + int cpu; + + if (!x86_pmu.bts && !x86_pmu.pebs) + return; + + get_online_cpus(); + for_each_online_cpu(cpu) + fini_debug_store_on_cpu(cpu); + + for_each_possible_cpu(cpu) { + release_pebs_buffer(cpu); + release_bts_buffer(cpu); + release_ds_buffer(cpu); + } + put_online_cpus(); +} + +void reserve_ds_buffers(void) +{ + int bts_err = 0, pebs_err = 0; + int cpu; + + x86_pmu.bts_active = 0; + x86_pmu.pebs_active = 0; + + if (!x86_pmu.bts && !x86_pmu.pebs) + return; + + if (!x86_pmu.bts) + bts_err = 1; + + if (!x86_pmu.pebs) + pebs_err = 1; + + get_online_cpus(); + + for_each_possible_cpu(cpu) { + if (alloc_ds_buffer(cpu)) { + bts_err = 1; + pebs_err = 1; + } + + if (!bts_err && alloc_bts_buffer(cpu)) + bts_err = 1; + + if (!pebs_err && alloc_pebs_buffer(cpu)) + pebs_err = 1; + + if (bts_err && pebs_err) + break; + } + + if (bts_err) { + for_each_possible_cpu(cpu) + release_bts_buffer(cpu); + } + + if (pebs_err) { + for_each_possible_cpu(cpu) + release_pebs_buffer(cpu); + } + + if (bts_err && pebs_err) { + for_each_possible_cpu(cpu) + release_ds_buffer(cpu); + } else { + if (x86_pmu.bts && !bts_err) + x86_pmu.bts_active = 1; + + if (x86_pmu.pebs && !pebs_err) + x86_pmu.pebs_active = 1; + + for_each_online_cpu(cpu) + init_debug_store_on_cpu(cpu); + } + + put_online_cpus(); +} + +/* + * BTS + */ + +struct event_constraint bts_constraint = + EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0); + +void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr |= DEBUGCTLMSR_TR; + debugctlmsr |= DEBUGCTLMSR_BTS; + if (config & ARCH_PERFMON_EVENTSEL_INT) + debugctlmsr |= DEBUGCTLMSR_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +void intel_pmu_disable_bts(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT | + DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + +int intel_pmu_drain_bts_buffer(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; + struct bts_record *at, *base, *top; + struct perf_output_handle handle; + struct perf_event_header header; + struct perf_sample_data data; + unsigned long skip = 0; + struct pt_regs regs; + + if (!event) + return 0; + + if (!x86_pmu.bts_active) + return 0; + + base = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; + + if (top <= base) + return 0; + + memset(®s, 0, sizeof(regs)); + + ds->bts_index = ds->bts_buffer_base; + + perf_sample_data_init(&data, 0, event->hw.last_period); + + /* + * BTS leaks kernel addresses in branches across the cpl boundary, + * such as traps or system calls, so unless the user is asking for + * kernel tracing (and right now it's not possible), we'd need to + * filter them out. 
But first we need to count how many of those we + * have in the current batch. This is an extra O(n) pass, however, + * it's much faster than the other one especially considering that + * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see the + * alloc_bts_buffer()). + */ + for (at = base; at < top; at++) { + /* + * Note that right now *this* BTS code only works if + * attr::exclude_kernel is set, but let's keep this extra + * check here in case that changes. + */ + if (event->attr.exclude_kernel && + (kernel_ip(at->from) || kernel_ip(at->to))) + skip++; + } + + /* + * Prepare a generic sample, i.e. fill in the invariant fields. + * We will overwrite the from and to address before we output + * the sample. + */ + perf_prepare_sample(&header, &data, event, ®s); + + if (perf_output_begin(&handle, event, header.size * + (top - base - skip))) + return 1; + + for (at = base; at < top; at++) { + /* Filter out any records that contain kernel addresses. */ + if (event->attr.exclude_kernel && + (kernel_ip(at->from) || kernel_ip(at->to))) + continue; + + data.ip = at->from; + data.addr = at->to; + + perf_output_sample(&handle, &header, &data, event); + } + + perf_output_end(&handle); + + /* There's new data available. */ + event->hw.interrupts++; + event->pending_kill = POLL_IN; + return 1; +} + +static inline void intel_pmu_drain_pebs_buffer(void) +{ + struct pt_regs regs; + + x86_pmu.drain_pebs(®s); +} + +void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + if (!sched_in) + intel_pmu_drain_pebs_buffer(); +} + +/* + * PEBS + */ +struct event_constraint intel_core2_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_atom_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_slm_pebs_event_constraints[] = { + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). 
*/ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x1), + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_nehalem_pebs_event_constraints[] = { + INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ + INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_westmere_pebs_event_constraints[] = { + INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_snb_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ + INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ + /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_ivb_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ + INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ + /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). 
*/ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), + INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_hsw_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */ + /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */ + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_skl_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ + /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), + /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). 
*/ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), + INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_L3_MISS_RETIRED.* */ + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), + EVENT_CONSTRAINT_END +}; + +struct event_constraint *intel_pebs_constraints(struct perf_event *event) +{ + struct event_constraint *c; + + if (!event->attr.precise_ip) + return NULL; + + if (x86_pmu.pebs_constraints) { + for_each_event_constraint(c, x86_pmu.pebs_constraints) { + if ((event->hw.config & c->cmask) == c->code) { + event->hw.flags |= c->flags; + return c; + } + } + } + + return &emptyconstraint; +} + +static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc) +{ + return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1)); +} + +void intel_pmu_pebs_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + struct debug_store *ds = cpuc->ds; + bool first_pebs; + u64 threshold; + + hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; + + first_pebs = !pebs_is_enabled(cpuc); + cpuc->pebs_enabled |= 1ULL << hwc->idx; + + if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) + cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); + else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) + cpuc->pebs_enabled |= 1ULL << 63; + + /* + * When the event is constrained enough we can use a larger + * threshold and run the event with less frequent PMI. 
+ */ + if (hwc->flags & PERF_X86_EVENT_FREERUNNING) { + threshold = ds->pebs_absolute_maximum - + x86_pmu.max_pebs_events * x86_pmu.pebs_record_size; + + if (first_pebs) + perf_sched_cb_inc(event->ctx->pmu); + } else { + threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size; + + /* + * If not all events can use larger buffer, + * roll back to threshold = 1 + */ + if (!first_pebs && + (ds->pebs_interrupt_threshold > threshold)) + perf_sched_cb_dec(event->ctx->pmu); + } + + /* Use auto-reload if possible to save a MSR write in the PMI */ + if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { + ds->pebs_event_reset[hwc->idx] = + (u64)(-hwc->sample_period) & x86_pmu.cntval_mask; + } + + if (first_pebs || ds->pebs_interrupt_threshold > threshold) + ds->pebs_interrupt_threshold = threshold; +} + +void intel_pmu_pebs_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + struct debug_store *ds = cpuc->ds; + bool large_pebs = ds->pebs_interrupt_threshold > + ds->pebs_buffer_base + x86_pmu.pebs_record_size; + + if (large_pebs) + intel_pmu_drain_pebs_buffer(); + + cpuc->pebs_enabled &= ~(1ULL << hwc->idx); + + if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) + cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32)); + else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) + cpuc->pebs_enabled &= ~(1ULL << 63); + + if (large_pebs && !pebs_is_enabled(cpuc)) + perf_sched_cb_dec(event->ctx->pmu); + + if (cpuc->enabled) + wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); + + hwc->config |= ARCH_PERFMON_EVENTSEL_INT; +} + +void intel_pmu_pebs_enable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (cpuc->pebs_enabled) + wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); +} + +void intel_pmu_pebs_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (cpuc->pebs_enabled) + wrmsrl(MSR_IA32_PEBS_ENABLE, 0); +} + +static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + unsigned long from = cpuc->lbr_entries[0].from; + unsigned long old_to, to = cpuc->lbr_entries[0].to; + unsigned long ip = regs->ip; + int is_64bit = 0; + void *kaddr; + int size; + + /* + * We don't need to fixup if the PEBS assist is fault like + */ + if (!x86_pmu.intel_cap.pebs_trap) + return 1; + + /* + * No LBR entry, no basic block, no rewinding + */ + if (!cpuc->lbr_stack.nr || !from || !to) + return 0; + + /* + * Basic blocks should never cross user/kernel boundaries + */ + if (kernel_ip(ip) != kernel_ip(to)) + return 0; + + /* + * unsigned math, either ip is before the start (impossible) or + * the basic block is larger than 1 page (sanity) + */ + if ((ip - to) > PEBS_FIXUP_SIZE) + return 0; + + /* + * We sampled a branch insn, rewind using the LBR stack + */ + if (ip == to) { + set_linear_ip(regs, from); + return 1; + } + + size = ip - to; + if (!kernel_ip(ip)) { + int bytes; + u8 *buf = this_cpu_read(insn_buffer); + + /* 'size' must fit our buffer, see above */ + bytes = copy_from_user_nmi(buf, (void __user *)to, size); + if (bytes != 0) + return 0; + + kaddr = buf; + } else { + kaddr = (void *)to; + } + + do { + struct insn insn; + + old_to = to; + +#ifdef CONFIG_X86_64 + is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); +#endif + insn_init(&insn, kaddr, size, is_64bit); + insn_get_length(&insn); + /* + * Make sure there was not a problem decoding the + * instruction and getting the length. 
This is + * doubly important because we have an infinite + * loop if insn.length=0. + */ + if (!insn.length) + break; + + to += insn.length; + kaddr += insn.length; + size -= insn.length; + } while (to < ip); + + if (to == ip) { + set_linear_ip(regs, old_to); + return 1; + } + + /* + * Even though we decoded the basic block, the instruction stream + * never matched the given IP, either the TO or the IP got corrupted. + */ + return 0; +} + +static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs) +{ + if (pebs->tsx_tuning) { + union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning }; + return tsx.cycles_last_block; + } + return 0; +} + +static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs) +{ + u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; + + /* For RTM XABORTs also log the abort code from AX */ + if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1)) + txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT; + return txn; +} + +static void setup_pebs_sample_data(struct perf_event *event, + struct pt_regs *iregs, void *__pebs, + struct perf_sample_data *data, + struct pt_regs *regs) +{ +#define PERF_X86_EVENT_PEBS_HSW_PREC \ + (PERF_X86_EVENT_PEBS_ST_HSW | \ + PERF_X86_EVENT_PEBS_LD_HSW | \ + PERF_X86_EVENT_PEBS_NA_HSW) + /* + * We cast to the biggest pebs_record but are careful not to + * unconditionally access the 'extra' entries. + */ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct pebs_record_skl *pebs = __pebs; + u64 sample_type; + int fll, fst, dsrc; + int fl = event->hw.flags; + + if (pebs == NULL) + return; + + sample_type = event->attr.sample_type; + dsrc = sample_type & PERF_SAMPLE_DATA_SRC; + + fll = fl & PERF_X86_EVENT_PEBS_LDLAT; + fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC); + + perf_sample_data_init(data, 0, event->hw.last_period); + + data->period = event->hw.last_period; + + /* + * Use latency for weight (only avail with PEBS-LL) + */ + if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) + data->weight = pebs->lat; + + /* + * data.data_src encodes the data source + */ + if (dsrc) { + u64 val = PERF_MEM_NA; + if (fll) + val = load_latency_data(pebs->dse); + else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC)) + val = precise_datala_hsw(event, pebs->dse); + else if (fst) + val = precise_store_data(pebs->dse); + data->data_src.val = val; + } + + /* + * We use the interrupt regs as a base because the PEBS record + * does not contain a full regs set, specifically it seems to + * lack segment descriptors, which get used by things like + * user_mode(). + * + * In the simple case fix up only the IP and BP,SP regs, for + * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly. + * A possible PERF_SAMPLE_REGS will have to transfer all regs. 
+ */ + *regs = *iregs; + regs->flags = pebs->flags; + set_linear_ip(regs, pebs->ip); + regs->bp = pebs->bp; + regs->sp = pebs->sp; + + if (sample_type & PERF_SAMPLE_REGS_INTR) { + regs->ax = pebs->ax; + regs->bx = pebs->bx; + regs->cx = pebs->cx; + regs->dx = pebs->dx; + regs->si = pebs->si; + regs->di = pebs->di; + regs->bp = pebs->bp; + regs->sp = pebs->sp; + + regs->flags = pebs->flags; +#ifndef CONFIG_X86_32 + regs->r8 = pebs->r8; + regs->r9 = pebs->r9; + regs->r10 = pebs->r10; + regs->r11 = pebs->r11; + regs->r12 = pebs->r12; + regs->r13 = pebs->r13; + regs->r14 = pebs->r14; + regs->r15 = pebs->r15; +#endif + } + + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { + regs->ip = pebs->real_ip; + regs->flags |= PERF_EFLAGS_EXACT; + } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs)) + regs->flags |= PERF_EFLAGS_EXACT; + else + regs->flags &= ~PERF_EFLAGS_EXACT; + + if ((sample_type & PERF_SAMPLE_ADDR) && + x86_pmu.intel_cap.pebs_format >= 1) + data->addr = pebs->dla; + + if (x86_pmu.intel_cap.pebs_format >= 2) { + /* Only set the TSX weight when no memory weight. */ + if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) + data->weight = intel_hsw_weight(pebs); + + if (sample_type & PERF_SAMPLE_TRANSACTION) + data->txn = intel_hsw_transaction(pebs); + } + + /* + * v3 supplies an accurate time stamp, so we use that + * for the time stamp. + * + * We can only do this for the default trace clock. + */ + if (x86_pmu.intel_cap.pebs_format >= 3 && + event->attr.use_clockid == 0) + data->time = native_sched_clock_from_tsc(pebs->tsc); + + if (has_branch_stack(event)) + data->br_stack = &cpuc->lbr_stack; +} + +static inline void * +get_next_pebs_record_by_bit(void *base, void *top, int bit) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + void *at; + u64 pebs_status; + + /* + * fmt0 does not have a status bitfield (does not use + * perf_record_nhm format) + */ + if (x86_pmu.intel_cap.pebs_format < 1) + return base; + + if (base == NULL) + return NULL; + + for (at = base; at < top; at += x86_pmu.pebs_record_size) { + struct pebs_record_nhm *p = at; + + if (test_bit(bit, (unsigned long *)&p->status)) { + /* PEBS v3 has accurate status bits */ + if (x86_pmu.intel_cap.pebs_format >= 3) + return at; + + if (p->status == (1 << bit)) + return at; + + /* clear non-PEBS bit and re-check */ + pebs_status = p->status & cpuc->pebs_enabled; + pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; + if (pebs_status == (1 << bit)) + return at; + } + } + return NULL; +} + +static void __intel_pmu_pebs_event(struct perf_event *event, + struct pt_regs *iregs, + void *base, void *top, + int bit, int count) +{ + struct perf_sample_data data; + struct pt_regs regs; + void *at = get_next_pebs_record_by_bit(base, top, bit); + + if (!intel_pmu_save_and_restart(event) && + !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)) + return; + + while (count > 1) { + setup_pebs_sample_data(event, iregs, at, &data, ®s); + perf_event_output(event, &data, ®s); + at += x86_pmu.pebs_record_size; + at = get_next_pebs_record_by_bit(at, top, bit); + count--; + } + + setup_pebs_sample_data(event, iregs, at, &data, ®s); + + /* + * All but the last records are processed. + * The last one is left to be able to call the overflow handler. 
+ */ + if (perf_event_overflow(event, &data, ®s)) { + x86_pmu_stop(event, 0); + return; + } + +} + +static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct perf_event *event = cpuc->events[0]; /* PMC0 only */ + struct pebs_record_core *at, *top; + int n; + + if (!x86_pmu.pebs_active) + return; + + at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base; + top = (struct pebs_record_core *)(unsigned long)ds->pebs_index; + + /* + * Whatever else happens, drain the thing + */ + ds->pebs_index = ds->pebs_buffer_base; + + if (!test_bit(0, cpuc->active_mask)) + return; + + WARN_ON_ONCE(!event); + + if (!event->attr.precise_ip) + return; + + n = top - at; + if (n <= 0) + return; + + __intel_pmu_pebs_event(event, iregs, at, top, 0, n); +} + +static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct perf_event *event; + void *base, *at, *top; + short counts[MAX_PEBS_EVENTS] = {}; + short error[MAX_PEBS_EVENTS] = {}; + int bit, i; + + if (!x86_pmu.pebs_active) + return; + + base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; + top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; + + ds->pebs_index = ds->pebs_buffer_base; + + if (unlikely(base >= top)) + return; + + for (at = base; at < top; at += x86_pmu.pebs_record_size) { + struct pebs_record_nhm *p = at; + u64 pebs_status; + + /* PEBS v3 has accurate status bits */ + if (x86_pmu.intel_cap.pebs_format >= 3) { + for_each_set_bit(bit, (unsigned long *)&p->status, + MAX_PEBS_EVENTS) + counts[bit]++; + + continue; + } + + pebs_status = p->status & cpuc->pebs_enabled; + pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; + + /* + * On some CPUs the PEBS status can be zero when PEBS is + * racing with clearing of GLOBAL_STATUS. + * + * Normally we would drop that record, but in the + * case when there is only a single active PEBS event + * we can assume it's for that event. + */ + if (!pebs_status && cpuc->pebs_enabled && + !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1))) + pebs_status = cpuc->pebs_enabled; + + bit = find_first_bit((unsigned long *)&pebs_status, + x86_pmu.max_pebs_events); + if (bit >= x86_pmu.max_pebs_events) + continue; + + /* + * The PEBS hardware does not deal well with the situation + * when events happen near to each other and multiple bits + * are set. But it should happen rarely. + * + * If these events include one PEBS and multiple non-PEBS + * events, it doesn't impact PEBS record. The record will + * be handled normally. (slow path) + * + * If these events include two or more PEBS events, the + * records for the events can be collapsed into a single + * one, and it's not possible to reconstruct all events + * that caused the PEBS record. It's called collision. + * If collision happened, the record will be dropped. 
+ */ + if (p->status != (1ULL << bit)) { + for_each_set_bit(i, (unsigned long *)&pebs_status, + x86_pmu.max_pebs_events) + error[i]++; + continue; + } + + counts[bit]++; + } + + for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) { + if ((counts[bit] == 0) && (error[bit] == 0)) + continue; + + event = cpuc->events[bit]; + WARN_ON_ONCE(!event); + WARN_ON_ONCE(!event->attr.precise_ip); + + /* log dropped samples number */ + if (error[bit]) + perf_log_lost_samples(event, error[bit]); + + if (counts[bit]) { + __intel_pmu_pebs_event(event, iregs, base, + top, bit, counts[bit]); + } + } +} + +/* + * BTS, PEBS probe and setup + */ + +void __init intel_ds_init(void) +{ + /* + * No support for 32bit formats + */ + if (!boot_cpu_has(X86_FEATURE_DTES64)) + return; + + x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); + x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); + if (x86_pmu.pebs) { + char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; + int format = x86_pmu.intel_cap.pebs_format; + + switch (format) { + case 0: + pr_cont("PEBS fmt0%c, ", pebs_type); + x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; + break; + + case 1: + pr_cont("PEBS fmt1%c, ", pebs_type); + x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; + break; + + case 2: + pr_cont("PEBS fmt2%c, ", pebs_type); + x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; + break; + + case 3: + pr_cont("PEBS fmt3%c, ", pebs_type); + x86_pmu.pebs_record_size = + sizeof(struct pebs_record_skl); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; + x86_pmu.free_running_flags |= PERF_SAMPLE_TIME; + break; + + default: + pr_cont("no PEBS fmt%d%c, ", format, pebs_type); + x86_pmu.pebs = 0; + } + } +} + +void perf_restore_debug_store(void) +{ + struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); + + if (!x86_pmu.bts && !x86_pmu.pebs) + return; + + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds); +} diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 1bc881b..035bdb6 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -32,7 +32,7 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c deleted file mode 100644 index 7c79261..0000000 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ /dev/null @@ -1,1368 +0,0 @@ -#include -#include -#include - -#include -#include - -#include "perf_event.h" - -/* The size of a BTS record in bytes: */ -#define BTS_RECORD_SIZE 24 - -#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) -#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) -#define PEBS_FIXUP_SIZE PAGE_SIZE - -/* - * pebs_record_32 for p4 and core not supported - -struct pebs_record_32 { - u32 flags, ip; - u32 ax, bc, cx, dx; - u32 si, di, bp, sp; -}; - - */ - -union intel_x86_pebs_dse { - u64 val; - struct { - unsigned int ld_dse:4; - unsigned int ld_stlb_miss:1; - unsigned int ld_locked:1; - unsigned int ld_reserved:26; - }; - struct { - unsigned int st_l1d_hit:1; - unsigned int st_reserved1:3; - unsigned int st_stlb_miss:1; - 
unsigned int st_locked:1; - unsigned int st_reserved2:26; - }; -}; - - -/* - * Map PEBS Load Latency Data Source encodings to generic - * memory data source information - */ -#define P(a, b) PERF_MEM_S(a, b) -#define OP_LH (P(OP, LOAD) | P(LVL, HIT)) -#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) - -static const u64 pebs_data_source[] = { - P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ - OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ - OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ - OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ - OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ - OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ - OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ - OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ - OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */ - OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */ - OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */ - OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ -}; - -static u64 precise_store_data(u64 status) -{ - union intel_x86_pebs_dse dse; - u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2); - - dse.val = status; - - /* - * bit 4: TLB access - * 1 = stored missed 2nd level TLB - * - * so it either hit the walker or the OS - * otherwise hit 2nd level TLB - */ - if (dse.st_stlb_miss) - val |= P(TLB, MISS); - else - val |= P(TLB, HIT); - - /* - * bit 0: hit L1 data cache - * if not set, then all we know is that - * it missed L1D - */ - if (dse.st_l1d_hit) - val |= P(LVL, HIT); - else - val |= P(LVL, MISS); - - /* - * bit 5: Locked prefix - */ - if (dse.st_locked) - val |= P(LOCK, LOCKED); - - return val; -} - -static u64 precise_datala_hsw(struct perf_event *event, u64 status) -{ - union perf_mem_data_src dse; - - dse.val = PERF_MEM_NA; - - if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) - dse.mem_op = PERF_MEM_OP_STORE; - else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW) - dse.mem_op = PERF_MEM_OP_LOAD; - - /* - * L1 info only valid for following events: - * - * MEM_UOPS_RETIRED.STLB_MISS_STORES - * MEM_UOPS_RETIRED.LOCK_STORES - * MEM_UOPS_RETIRED.SPLIT_STORES - * MEM_UOPS_RETIRED.ALL_STORES - */ - if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) { - if (status & 1) - dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; - else - dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; - } - return dse.val; -} - -static u64 load_latency_data(u64 status) -{ - union intel_x86_pebs_dse dse; - u64 val; - int model = boot_cpu_data.x86_model; - int fam = boot_cpu_data.x86; - - dse.val = status; - - /* - * use the mapping table for bit 0-3 - */ - val = pebs_data_source[dse.ld_dse]; - - /* - * Nehalem models do not support TLB, Lock infos - */ - if (fam == 0x6 && (model == 26 || model == 30 - || model == 31 || model == 46)) { - val |= P(TLB, NA) | P(LOCK, NA); - return val; - } - /* - * bit 4: TLB access - * 0 = did not miss 2nd level TLB - * 1 = missed 2nd level TLB - */ - if (dse.ld_stlb_miss) - val |= P(TLB, MISS) | P(TLB, L2); - else - val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); - - /* - * bit 5: locked prefix - */ - if (dse.ld_locked) - 
val |= P(LOCK, LOCKED); - - return val; -} - -struct pebs_record_core { - u64 flags, ip; - u64 ax, bx, cx, dx; - u64 si, di, bp, sp; - u64 r8, r9, r10, r11; - u64 r12, r13, r14, r15; -}; - -struct pebs_record_nhm { - u64 flags, ip; - u64 ax, bx, cx, dx; - u64 si, di, bp, sp; - u64 r8, r9, r10, r11; - u64 r12, r13, r14, r15; - u64 status, dla, dse, lat; -}; - -/* - * Same as pebs_record_nhm, with two additional fields. - */ -struct pebs_record_hsw { - u64 flags, ip; - u64 ax, bx, cx, dx; - u64 si, di, bp, sp; - u64 r8, r9, r10, r11; - u64 r12, r13, r14, r15; - u64 status, dla, dse, lat; - u64 real_ip, tsx_tuning; -}; - -union hsw_tsx_tuning { - struct { - u32 cycles_last_block : 32, - hle_abort : 1, - rtm_abort : 1, - instruction_abort : 1, - non_instruction_abort : 1, - retry : 1, - data_conflict : 1, - capacity_writes : 1, - capacity_reads : 1; - }; - u64 value; -}; - -#define PEBS_HSW_TSX_FLAGS 0xff00000000ULL - -/* Same as HSW, plus TSC */ - -struct pebs_record_skl { - u64 flags, ip; - u64 ax, bx, cx, dx; - u64 si, di, bp, sp; - u64 r8, r9, r10, r11; - u64 r12, r13, r14, r15; - u64 status, dla, dse, lat; - u64 real_ip, tsx_tuning; - u64 tsc; -}; - -void init_debug_store_on_cpu(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds) - return; - - wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, - (u32)((u64)(unsigned long)ds), - (u32)((u64)(unsigned long)ds >> 32)); -} - -void fini_debug_store_on_cpu(int cpu) -{ - if (!per_cpu(cpu_hw_events, cpu).ds) - return; - - wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); -} - -static DEFINE_PER_CPU(void *, insn_buffer); - -static int alloc_pebs_buffer(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - int node = cpu_to_node(cpu); - int max; - void *buffer, *ibuffer; - - if (!x86_pmu.pebs) - return 0; - - buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node); - if (unlikely(!buffer)) - return -ENOMEM; - - /* - * HSW+ already provides us the eventing ip; no need to allocate this - * buffer then. 
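
As a rough aside (not part of the patch): the record layouts above determine how many entries fit in the 64 KB PEBS buffer, which is what the PEBS_BUFFER_SIZE / pebs_record_size division in alloc_pebs_buffer() below computes. A minimal standalone sketch, assuming 4 KiB pages; the structs are nested only for brevity, which keeps the sizes identical:

#include <stdio.h>
#include <stdint.h>

/* Layouts copied from the structs above. */
struct pebs_record_core { uint64_t flags, ip, ax, bx, cx, dx, si, di, bp, sp,
                          r8, r9, r10, r11, r12, r13, r14, r15; };
struct pebs_record_nhm  { struct pebs_record_core base; uint64_t status, dla, dse, lat; };
struct pebs_record_hsw  { struct pebs_record_nhm base;  uint64_t real_ip, tsx_tuning; };
struct pebs_record_skl  { struct pebs_record_hsw base;  uint64_t tsc; };

#define PEBS_BUFFER_SIZE (4096UL << 4)  /* PAGE_SIZE << 4, assuming 4 KiB pages */

int main(void)
{
        printf("fmt0 core: %3lu bytes, %lu records\n",
               (unsigned long)sizeof(struct pebs_record_core),
               (unsigned long)(PEBS_BUFFER_SIZE / sizeof(struct pebs_record_core)));
        printf("fmt1 nhm:  %3lu bytes, %lu records\n",
               (unsigned long)sizeof(struct pebs_record_nhm),
               (unsigned long)(PEBS_BUFFER_SIZE / sizeof(struct pebs_record_nhm)));
        printf("fmt2 hsw:  %3lu bytes, %lu records\n",
               (unsigned long)sizeof(struct pebs_record_hsw),
               (unsigned long)(PEBS_BUFFER_SIZE / sizeof(struct pebs_record_hsw)));
        printf("fmt3 skl:  %3lu bytes, %lu records\n",
               (unsigned long)sizeof(struct pebs_record_skl),
               (unsigned long)(PEBS_BUFFER_SIZE / sizeof(struct pebs_record_skl)));
        return 0;
}
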
- */ - if (x86_pmu.intel_cap.pebs_format < 2) { - ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); - if (!ibuffer) { - kfree(buffer); - return -ENOMEM; - } - per_cpu(insn_buffer, cpu) = ibuffer; - } - - max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; - - ds->pebs_buffer_base = (u64)(unsigned long)buffer; - ds->pebs_index = ds->pebs_buffer_base; - ds->pebs_absolute_maximum = ds->pebs_buffer_base + - max * x86_pmu.pebs_record_size; - - return 0; -} - -static void release_pebs_buffer(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds || !x86_pmu.pebs) - return; - - kfree(per_cpu(insn_buffer, cpu)); - per_cpu(insn_buffer, cpu) = NULL; - - kfree((void *)(unsigned long)ds->pebs_buffer_base); - ds->pebs_buffer_base = 0; -} - -static int alloc_bts_buffer(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - int node = cpu_to_node(cpu); - int max, thresh; - void *buffer; - - if (!x86_pmu.bts) - return 0; - - buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); - if (unlikely(!buffer)) { - WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); - return -ENOMEM; - } - - max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; - thresh = max / 16; - - ds->bts_buffer_base = (u64)(unsigned long)buffer; - ds->bts_index = ds->bts_buffer_base; - ds->bts_absolute_maximum = ds->bts_buffer_base + - max * BTS_RECORD_SIZE; - ds->bts_interrupt_threshold = ds->bts_absolute_maximum - - thresh * BTS_RECORD_SIZE; - - return 0; -} - -static void release_bts_buffer(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds || !x86_pmu.bts) - return; - - kfree((void *)(unsigned long)ds->bts_buffer_base); - ds->bts_buffer_base = 0; -} - -static int alloc_ds_buffer(int cpu) -{ - int node = cpu_to_node(cpu); - struct debug_store *ds; - - ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); - if (unlikely(!ds)) - return -ENOMEM; - - per_cpu(cpu_hw_events, cpu).ds = ds; - - return 0; -} - -static void release_ds_buffer(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds) - return; - - per_cpu(cpu_hw_events, cpu).ds = NULL; - kfree(ds); -} - -void release_ds_buffers(void) -{ - int cpu; - - if (!x86_pmu.bts && !x86_pmu.pebs) - return; - - get_online_cpus(); - for_each_online_cpu(cpu) - fini_debug_store_on_cpu(cpu); - - for_each_possible_cpu(cpu) { - release_pebs_buffer(cpu); - release_bts_buffer(cpu); - release_ds_buffer(cpu); - } - put_online_cpus(); -} - -void reserve_ds_buffers(void) -{ - int bts_err = 0, pebs_err = 0; - int cpu; - - x86_pmu.bts_active = 0; - x86_pmu.pebs_active = 0; - - if (!x86_pmu.bts && !x86_pmu.pebs) - return; - - if (!x86_pmu.bts) - bts_err = 1; - - if (!x86_pmu.pebs) - pebs_err = 1; - - get_online_cpus(); - - for_each_possible_cpu(cpu) { - if (alloc_ds_buffer(cpu)) { - bts_err = 1; - pebs_err = 1; - } - - if (!bts_err && alloc_bts_buffer(cpu)) - bts_err = 1; - - if (!pebs_err && alloc_pebs_buffer(cpu)) - pebs_err = 1; - - if (bts_err && pebs_err) - break; - } - - if (bts_err) { - for_each_possible_cpu(cpu) - release_bts_buffer(cpu); - } - - if (pebs_err) { - for_each_possible_cpu(cpu) - release_pebs_buffer(cpu); - } - - if (bts_err && pebs_err) { - for_each_possible_cpu(cpu) - release_ds_buffer(cpu); - } else { - if (x86_pmu.bts && !bts_err) - x86_pmu.bts_active = 1; - - if (x86_pmu.pebs && !pebs_err) - x86_pmu.pebs_active = 1; - - for_each_online_cpu(cpu) - init_debug_store_on_cpu(cpu); - } - - put_online_cpus(); -} - -/* - * BTS - */ - -struct event_constraint 
bts_constraint = - EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0); - -void intel_pmu_enable_bts(u64 config) -{ - unsigned long debugctlmsr; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr |= DEBUGCTLMSR_TR; - debugctlmsr |= DEBUGCTLMSR_BTS; - if (config & ARCH_PERFMON_EVENTSEL_INT) - debugctlmsr |= DEBUGCTLMSR_BTINT; - - if (!(config & ARCH_PERFMON_EVENTSEL_OS)) - debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS; - - if (!(config & ARCH_PERFMON_EVENTSEL_USR)) - debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR; - - update_debugctlmsr(debugctlmsr); -} - -void intel_pmu_disable_bts(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - unsigned long debugctlmsr; - - if (!cpuc->ds) - return; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr &= - ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT | - DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR); - - update_debugctlmsr(debugctlmsr); -} - -int intel_pmu_drain_bts_buffer(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct debug_store *ds = cpuc->ds; - struct bts_record { - u64 from; - u64 to; - u64 flags; - }; - struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; - struct bts_record *at, *base, *top; - struct perf_output_handle handle; - struct perf_event_header header; - struct perf_sample_data data; - unsigned long skip = 0; - struct pt_regs regs; - - if (!event) - return 0; - - if (!x86_pmu.bts_active) - return 0; - - base = (struct bts_record *)(unsigned long)ds->bts_buffer_base; - top = (struct bts_record *)(unsigned long)ds->bts_index; - - if (top <= base) - return 0; - - memset(®s, 0, sizeof(regs)); - - ds->bts_index = ds->bts_buffer_base; - - perf_sample_data_init(&data, 0, event->hw.last_period); - - /* - * BTS leaks kernel addresses in branches across the cpl boundary, - * such as traps or system calls, so unless the user is asking for - * kernel tracing (and right now it's not possible), we'd need to - * filter them out. But first we need to count how many of those we - * have in the current batch. This is an extra O(n) pass, however, - * it's much faster than the other one especially considering that - * n <= 2560 (BTS_BUFFER_SIZE / BTS_RECORD_SIZE * 15/16; see the - * alloc_bts_buffer()). - */ - for (at = base; at < top; at++) { - /* - * Note that right now *this* BTS code only works if - * attr::exclude_kernel is set, but let's keep this extra - * check here in case that changes. - */ - if (event->attr.exclude_kernel && - (kernel_ip(at->from) || kernel_ip(at->to))) - skip++; - } - - /* - * Prepare a generic sample, i.e. fill in the invariant fields. - * We will overwrite the from and to address before we output - * the sample. - */ - perf_prepare_sample(&header, &data, event, ®s); - - if (perf_output_begin(&handle, event, header.size * - (top - base - skip))) - return 1; - - for (at = base; at < top; at++) { - /* Filter out any records that contain kernel addresses. */ - if (event->attr.exclude_kernel && - (kernel_ip(at->from) || kernel_ip(at->to))) - continue; - - data.ip = at->from; - data.addr = at->to; - - perf_output_sample(&handle, &header, &data, event); - } - - perf_output_end(&handle); - - /* There's new data available. 
*/ - event->hw.interrupts++; - event->pending_kill = POLL_IN; - return 1; -} - -static inline void intel_pmu_drain_pebs_buffer(void) -{ - struct pt_regs regs; - - x86_pmu.drain_pebs(®s); -} - -void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in) -{ - if (!sched_in) - intel_pmu_drain_pebs_buffer(); -} - -/* - * PEBS - */ -struct event_constraint intel_core2_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ - /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_atom_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ - /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_slm_pebs_event_constraints[] = { - /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x1), - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0x1), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_nehalem_pebs_event_constraints[] = { - INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ - INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ - /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_westmere_pebs_event_constraints[] = { - INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ - INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ - /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). 
*/ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_snb_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ - INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ - INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ - /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), - INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_ivb_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ - INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ - INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ - /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), - /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), - INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ - INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_hsw_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ - INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */ - /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), - /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */ - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), - EVENT_CONSTRAINT_END -}; - -struct event_constraint intel_skl_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ - /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). 
*/ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), - /* INST_RETIRED.TOTAL_CYCLES_PS (inv=1, cmask=16) (cycles:p). */ - INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), - INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */ - INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_L3_MISS_RETIRED.* */ - /* Allow all events as PEBS with no flags */ - INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), - EVENT_CONSTRAINT_END -}; - -struct event_constraint *intel_pebs_constraints(struct perf_event *event) -{ - struct event_constraint *c; - - if (!event->attr.precise_ip) - return NULL; - - if (x86_pmu.pebs_constraints) { - for_each_event_constraint(c, x86_pmu.pebs_constraints) { - if ((event->hw.config & c->cmask) == c->code) { - event->hw.flags |= c->flags; - return c; - } - } - } - - return &emptyconstraint; -} - -static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc) -{ - return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1)); -} - -void intel_pmu_pebs_enable(struct perf_event *event) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - struct debug_store *ds = cpuc->ds; - bool first_pebs; - u64 threshold; - - hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; - - first_pebs = !pebs_is_enabled(cpuc); - cpuc->pebs_enabled |= 1ULL << hwc->idx; - - if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) - cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); - else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) - cpuc->pebs_enabled |= 1ULL << 63; - - /* - * When the event is constrained enough we can use a larger - * threshold and run the event with less frequent PMI. 
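
A standalone sketch (not part of the patch) of the two interrupt-threshold choices computed just below, with made-up numbers: the buffer base, record size and max_pebs_events here are hypothetical. Free-running PEBS arms the PMI only when the buffer is nearly full, while the default arms it after a single record.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t pebs_buffer_base = 0x1000;     /* hypothetical */
        uint64_t record_size      = 176;        /* sizeof(struct pebs_record_nhm) */
        uint64_t max_records      = 65536 / record_size;
        uint64_t absolute_max     = pebs_buffer_base + max_records * record_size;
        uint64_t max_pebs_events  = 4;          /* hypothetical */

        uint64_t large = absolute_max - max_pebs_events * record_size;
        uint64_t small = pebs_buffer_base + record_size;

        printf("free-running threshold: %llu records before a PMI\n",
               (unsigned long long)((large - pebs_buffer_base) / record_size));
        printf("default threshold:      %llu record before a PMI\n",
               (unsigned long long)((small - pebs_buffer_base) / record_size));
        return 0;
}
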
- */ - if (hwc->flags & PERF_X86_EVENT_FREERUNNING) { - threshold = ds->pebs_absolute_maximum - - x86_pmu.max_pebs_events * x86_pmu.pebs_record_size; - - if (first_pebs) - perf_sched_cb_inc(event->ctx->pmu); - } else { - threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size; - - /* - * If not all events can use larger buffer, - * roll back to threshold = 1 - */ - if (!first_pebs && - (ds->pebs_interrupt_threshold > threshold)) - perf_sched_cb_dec(event->ctx->pmu); - } - - /* Use auto-reload if possible to save a MSR write in the PMI */ - if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { - ds->pebs_event_reset[hwc->idx] = - (u64)(-hwc->sample_period) & x86_pmu.cntval_mask; - } - - if (first_pebs || ds->pebs_interrupt_threshold > threshold) - ds->pebs_interrupt_threshold = threshold; -} - -void intel_pmu_pebs_disable(struct perf_event *event) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - struct debug_store *ds = cpuc->ds; - bool large_pebs = ds->pebs_interrupt_threshold > - ds->pebs_buffer_base + x86_pmu.pebs_record_size; - - if (large_pebs) - intel_pmu_drain_pebs_buffer(); - - cpuc->pebs_enabled &= ~(1ULL << hwc->idx); - - if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) - cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32)); - else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) - cpuc->pebs_enabled &= ~(1ULL << 63); - - if (large_pebs && !pebs_is_enabled(cpuc)) - perf_sched_cb_dec(event->ctx->pmu); - - if (cpuc->enabled) - wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); - - hwc->config |= ARCH_PERFMON_EVENTSEL_INT; -} - -void intel_pmu_pebs_enable_all(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (cpuc->pebs_enabled) - wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); -} - -void intel_pmu_pebs_disable_all(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (cpuc->pebs_enabled) - wrmsrl(MSR_IA32_PEBS_ENABLE, 0); -} - -static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - unsigned long from = cpuc->lbr_entries[0].from; - unsigned long old_to, to = cpuc->lbr_entries[0].to; - unsigned long ip = regs->ip; - int is_64bit = 0; - void *kaddr; - int size; - - /* - * We don't need to fixup if the PEBS assist is fault like - */ - if (!x86_pmu.intel_cap.pebs_trap) - return 1; - - /* - * No LBR entry, no basic block, no rewinding - */ - if (!cpuc->lbr_stack.nr || !from || !to) - return 0; - - /* - * Basic blocks should never cross user/kernel boundaries - */ - if (kernel_ip(ip) != kernel_ip(to)) - return 0; - - /* - * unsigned math, either ip is before the start (impossible) or - * the basic block is larger than 1 page (sanity) - */ - if ((ip - to) > PEBS_FIXUP_SIZE) - return 0; - - /* - * We sampled a branch insn, rewind using the LBR stack - */ - if (ip == to) { - set_linear_ip(regs, from); - return 1; - } - - size = ip - to; - if (!kernel_ip(ip)) { - int bytes; - u8 *buf = this_cpu_read(insn_buffer); - - /* 'size' must fit our buffer, see above */ - bytes = copy_from_user_nmi(buf, (void __user *)to, size); - if (bytes != 0) - return 0; - - kaddr = buf; - } else { - kaddr = (void *)to; - } - - do { - struct insn insn; - - old_to = to; - -#ifdef CONFIG_X86_64 - is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); -#endif - insn_init(&insn, kaddr, size, is_64bit); - insn_get_length(&insn); - /* - * Make sure there was not a problem decoding the - * instruction and getting the length. 
This is - * doubly important because we have an infinite - * loop if insn.length=0. - */ - if (!insn.length) - break; - - to += insn.length; - kaddr += insn.length; - size -= insn.length; - } while (to < ip); - - if (to == ip) { - set_linear_ip(regs, old_to); - return 1; - } - - /* - * Even though we decoded the basic block, the instruction stream - * never matched the given IP, either the TO or the IP got corrupted. - */ - return 0; -} - -static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs) -{ - if (pebs->tsx_tuning) { - union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning }; - return tsx.cycles_last_block; - } - return 0; -} - -static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs) -{ - u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; - - /* For RTM XABORTs also log the abort code from AX */ - if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1)) - txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT; - return txn; -} - -static void setup_pebs_sample_data(struct perf_event *event, - struct pt_regs *iregs, void *__pebs, - struct perf_sample_data *data, - struct pt_regs *regs) -{ -#define PERF_X86_EVENT_PEBS_HSW_PREC \ - (PERF_X86_EVENT_PEBS_ST_HSW | \ - PERF_X86_EVENT_PEBS_LD_HSW | \ - PERF_X86_EVENT_PEBS_NA_HSW) - /* - * We cast to the biggest pebs_record but are careful not to - * unconditionally access the 'extra' entries. - */ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct pebs_record_skl *pebs = __pebs; - u64 sample_type; - int fll, fst, dsrc; - int fl = event->hw.flags; - - if (pebs == NULL) - return; - - sample_type = event->attr.sample_type; - dsrc = sample_type & PERF_SAMPLE_DATA_SRC; - - fll = fl & PERF_X86_EVENT_PEBS_LDLAT; - fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC); - - perf_sample_data_init(data, 0, event->hw.last_period); - - data->period = event->hw.last_period; - - /* - * Use latency for weight (only avail with PEBS-LL) - */ - if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) - data->weight = pebs->lat; - - /* - * data.data_src encodes the data source - */ - if (dsrc) { - u64 val = PERF_MEM_NA; - if (fll) - val = load_latency_data(pebs->dse); - else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC)) - val = precise_datala_hsw(event, pebs->dse); - else if (fst) - val = precise_store_data(pebs->dse); - data->data_src.val = val; - } - - /* - * We use the interrupt regs as a base because the PEBS record - * does not contain a full regs set, specifically it seems to - * lack segment descriptors, which get used by things like - * user_mode(). - * - * In the simple case fix up only the IP and BP,SP regs, for - * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly. - * A possible PERF_SAMPLE_REGS will have to transfer all regs. 
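
A standalone sketch (not part of the patch) of how intel_hsw_weight() above pulls the cycle count out of the tsx_tuning field via the hsw_tsx_tuning union declared earlier in this file; the sample value is made up and the bitfield layout is the one GCC produces on little-endian x86.

#include <stdio.h>
#include <stdint.h>

union hsw_tsx_tuning {
        struct {                                /* layout per GCC on x86-64 */
                uint32_t cycles_last_block      : 32,
                         hle_abort              : 1,
                         rtm_abort              : 1,
                         instruction_abort      : 1,
                         non_instruction_abort  : 1,
                         retry                  : 1,
                         data_conflict          : 1,
                         capacity_writes        : 1,
                         capacity_reads         : 1;
        };
        uint64_t value;
};

int main(void)
{
        union hsw_tsx_tuning tsx = { .value = 0x0000000200001234ULL };  /* made up */

        printf("cycles in last TSX block: %u\n", tsx.cycles_last_block);
        printf("rtm abort: %u\n", tsx.rtm_abort);
        return 0;
}
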
- */ - *regs = *iregs; - regs->flags = pebs->flags; - set_linear_ip(regs, pebs->ip); - regs->bp = pebs->bp; - regs->sp = pebs->sp; - - if (sample_type & PERF_SAMPLE_REGS_INTR) { - regs->ax = pebs->ax; - regs->bx = pebs->bx; - regs->cx = pebs->cx; - regs->dx = pebs->dx; - regs->si = pebs->si; - regs->di = pebs->di; - regs->bp = pebs->bp; - regs->sp = pebs->sp; - - regs->flags = pebs->flags; -#ifndef CONFIG_X86_32 - regs->r8 = pebs->r8; - regs->r9 = pebs->r9; - regs->r10 = pebs->r10; - regs->r11 = pebs->r11; - regs->r12 = pebs->r12; - regs->r13 = pebs->r13; - regs->r14 = pebs->r14; - regs->r15 = pebs->r15; -#endif - } - - if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { - regs->ip = pebs->real_ip; - regs->flags |= PERF_EFLAGS_EXACT; - } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs)) - regs->flags |= PERF_EFLAGS_EXACT; - else - regs->flags &= ~PERF_EFLAGS_EXACT; - - if ((sample_type & PERF_SAMPLE_ADDR) && - x86_pmu.intel_cap.pebs_format >= 1) - data->addr = pebs->dla; - - if (x86_pmu.intel_cap.pebs_format >= 2) { - /* Only set the TSX weight when no memory weight. */ - if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) - data->weight = intel_hsw_weight(pebs); - - if (sample_type & PERF_SAMPLE_TRANSACTION) - data->txn = intel_hsw_transaction(pebs); - } - - /* - * v3 supplies an accurate time stamp, so we use that - * for the time stamp. - * - * We can only do this for the default trace clock. - */ - if (x86_pmu.intel_cap.pebs_format >= 3 && - event->attr.use_clockid == 0) - data->time = native_sched_clock_from_tsc(pebs->tsc); - - if (has_branch_stack(event)) - data->br_stack = &cpuc->lbr_stack; -} - -static inline void * -get_next_pebs_record_by_bit(void *base, void *top, int bit) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - void *at; - u64 pebs_status; - - /* - * fmt0 does not have a status bitfield (does not use - * perf_record_nhm format) - */ - if (x86_pmu.intel_cap.pebs_format < 1) - return base; - - if (base == NULL) - return NULL; - - for (at = base; at < top; at += x86_pmu.pebs_record_size) { - struct pebs_record_nhm *p = at; - - if (test_bit(bit, (unsigned long *)&p->status)) { - /* PEBS v3 has accurate status bits */ - if (x86_pmu.intel_cap.pebs_format >= 3) - return at; - - if (p->status == (1 << bit)) - return at; - - /* clear non-PEBS bit and re-check */ - pebs_status = p->status & cpuc->pebs_enabled; - pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; - if (pebs_status == (1 << bit)) - return at; - } - } - return NULL; -} - -static void __intel_pmu_pebs_event(struct perf_event *event, - struct pt_regs *iregs, - void *base, void *top, - int bit, int count) -{ - struct perf_sample_data data; - struct pt_regs regs; - void *at = get_next_pebs_record_by_bit(base, top, bit); - - if (!intel_pmu_save_and_restart(event) && - !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)) - return; - - while (count > 1) { - setup_pebs_sample_data(event, iregs, at, &data, ®s); - perf_event_output(event, &data, ®s); - at += x86_pmu.pebs_record_size; - at = get_next_pebs_record_by_bit(at, top, bit); - count--; - } - - setup_pebs_sample_data(event, iregs, at, &data, ®s); - - /* - * All but the last records are processed. - * The last one is left to be able to call the overflow handler. 
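
A standalone sketch (not part of the patch) of the strict pre-v3 matching done by get_next_pebs_record_by_bit() above: after masking with the enabled counters, a record is attributed to counter 'bit' only if its status word has exactly that one bit set. The record array and mask below are made up.

#include <stdio.h>
#include <stdint.h>

struct fake_rec { uint64_t status; };

static struct fake_rec *next_by_bit(struct fake_rec *base, struct fake_rec *top,
                                    int bit, uint64_t enabled)
{
        struct fake_rec *at;

        for (at = base; at < top; at++) {
                uint64_t status = at->status & enabled;  /* drop non-PEBS bits */

                if (status == (1ULL << bit))             /* exactly this counter */
                        return at;
        }
        return NULL;
}

int main(void)
{
        struct fake_rec recs[] = { { 0x2 }, { 0x6 }, { 0x1 }, { 0x2 } };
        struct fake_rec *hit = next_by_bit(recs, recs + 4, 0, 0xf);

        printf("first record for counter 0: slot %ld\n",
               hit ? (long)(hit - recs) : -1L);
        return 0;
}
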
- */ - if (perf_event_overflow(event, &data, ®s)) { - x86_pmu_stop(event, 0); - return; - } - -} - -static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct debug_store *ds = cpuc->ds; - struct perf_event *event = cpuc->events[0]; /* PMC0 only */ - struct pebs_record_core *at, *top; - int n; - - if (!x86_pmu.pebs_active) - return; - - at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base; - top = (struct pebs_record_core *)(unsigned long)ds->pebs_index; - - /* - * Whatever else happens, drain the thing - */ - ds->pebs_index = ds->pebs_buffer_base; - - if (!test_bit(0, cpuc->active_mask)) - return; - - WARN_ON_ONCE(!event); - - if (!event->attr.precise_ip) - return; - - n = top - at; - if (n <= 0) - return; - - __intel_pmu_pebs_event(event, iregs, at, top, 0, n); -} - -static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct debug_store *ds = cpuc->ds; - struct perf_event *event; - void *base, *at, *top; - short counts[MAX_PEBS_EVENTS] = {}; - short error[MAX_PEBS_EVENTS] = {}; - int bit, i; - - if (!x86_pmu.pebs_active) - return; - - base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; - top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; - - ds->pebs_index = ds->pebs_buffer_base; - - if (unlikely(base >= top)) - return; - - for (at = base; at < top; at += x86_pmu.pebs_record_size) { - struct pebs_record_nhm *p = at; - u64 pebs_status; - - /* PEBS v3 has accurate status bits */ - if (x86_pmu.intel_cap.pebs_format >= 3) { - for_each_set_bit(bit, (unsigned long *)&p->status, - MAX_PEBS_EVENTS) - counts[bit]++; - - continue; - } - - pebs_status = p->status & cpuc->pebs_enabled; - pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; - - /* - * On some CPUs the PEBS status can be zero when PEBS is - * racing with clearing of GLOBAL_STATUS. - * - * Normally we would drop that record, but in the - * case when there is only a single active PEBS event - * we can assume it's for that event. - */ - if (!pebs_status && cpuc->pebs_enabled && - !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1))) - pebs_status = cpuc->pebs_enabled; - - bit = find_first_bit((unsigned long *)&pebs_status, - x86_pmu.max_pebs_events); - if (bit >= x86_pmu.max_pebs_events) - continue; - - /* - * The PEBS hardware does not deal well with the situation - * when events happen near to each other and multiple bits - * are set. But it should happen rarely. - * - * If these events include one PEBS and multiple non-PEBS - * events, it doesn't impact PEBS record. The record will - * be handled normally. (slow path) - * - * If these events include two or more PEBS events, the - * records for the events can be collapsed into a single - * one, and it's not possible to reconstruct all events - * that caused the PEBS record. It's called collision. - * If collision happened, the record will be dropped. 
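
A standalone sketch (not part of the patch) of the collision accounting used just below: a status word with exactly one PEBS bit set is a clean sample for that counter, while a word with several bits set is counted as a lost sample for every counter involved. The status values are made up.

#include <stdio.h>
#include <stdint.h>

#define MAX_EVTS 4

int main(void)
{
        uint64_t status[] = { 0x1, 0x2, 0x3, 0x1 };     /* 0x3 is a collision */
        int counts[MAX_EVTS] = { 0 }, error[MAX_EVTS] = { 0 };
        int i, bit;

        for (i = 0; i < 4; i++) {
                uint64_t s = status[i];

                if (s && !(s & (s - 1))) {              /* single bit set */
                        counts[__builtin_ctzll(s)]++;
                        continue;
                }
                for (bit = 0; bit < MAX_EVTS; bit++)    /* collision: drop record */
                        if (s & (1ULL << bit))
                                error[bit]++;
        }

        for (bit = 0; bit < MAX_EVTS; bit++)
                printf("counter %d: %d samples, %d lost\n",
                       bit, counts[bit], error[bit]);
        return 0;
}
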
- */ - if (p->status != (1ULL << bit)) { - for_each_set_bit(i, (unsigned long *)&pebs_status, - x86_pmu.max_pebs_events) - error[i]++; - continue; - } - - counts[bit]++; - } - - for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) { - if ((counts[bit] == 0) && (error[bit] == 0)) - continue; - - event = cpuc->events[bit]; - WARN_ON_ONCE(!event); - WARN_ON_ONCE(!event->attr.precise_ip); - - /* log dropped samples number */ - if (error[bit]) - perf_log_lost_samples(event, error[bit]); - - if (counts[bit]) { - __intel_pmu_pebs_event(event, iregs, base, - top, bit, counts[bit]); - } - } -} - -/* - * BTS, PEBS probe and setup - */ - -void __init intel_ds_init(void) -{ - /* - * No support for 32bit formats - */ - if (!boot_cpu_has(X86_FEATURE_DTES64)) - return; - - x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); - x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); - if (x86_pmu.pebs) { - char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; - int format = x86_pmu.intel_cap.pebs_format; - - switch (format) { - case 0: - pr_cont("PEBS fmt0%c, ", pebs_type); - x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); - x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; - break; - - case 1: - pr_cont("PEBS fmt1%c, ", pebs_type); - x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); - x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; - break; - - case 2: - pr_cont("PEBS fmt2%c, ", pebs_type); - x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw); - x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; - break; - - case 3: - pr_cont("PEBS fmt3%c, ", pebs_type); - x86_pmu.pebs_record_size = - sizeof(struct pebs_record_skl); - x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; - x86_pmu.free_running_flags |= PERF_SAMPLE_TIME; - break; - - default: - pr_cont("no PEBS fmt%d%c, ", format, pebs_type); - x86_pmu.pebs = 0; - } - } -} - -void perf_restore_debug_store(void) -{ - struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); - - if (!x86_pmu.bts && !x86_pmu.pebs) - return; - - wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds); -} -- cgit v0.10.2 From c85cc4497f823b83379a23e798018d69fe566185 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:12 +0100 Subject: perf/x86: Move perf_event_intel_lbr.c ........ 
=> x86/events/intel/lbr.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index c4d41b6..48097fd 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -6,4 +6,4 @@ ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/lbr.o diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c new file mode 100644 index 0000000..78c88f9 --- /dev/null +++ b/arch/x86/events/intel/lbr.c @@ -0,0 +1,1062 @@ +#include +#include + +#include +#include +#include + +#include "../../kernel/cpu/perf_event.h" + +enum { + LBR_FORMAT_32 = 0x00, + LBR_FORMAT_LIP = 0x01, + LBR_FORMAT_EIP = 0x02, + LBR_FORMAT_EIP_FLAGS = 0x03, + LBR_FORMAT_EIP_FLAGS2 = 0x04, + LBR_FORMAT_INFO = 0x05, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO, +}; + +static enum { + LBR_EIP_FLAGS = 1, + LBR_TSX = 2, +} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = { + [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS, + [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX, +}; + +/* + * Intel LBR_SELECT bits + * Intel Vol3a, April 2011, Section 16.7 Table 16-10 + * + * Hardware branch filter (not available on all CPUs) + */ +#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */ +#define LBR_USER_BIT 1 /* do not capture at ring > 0 */ +#define LBR_JCC_BIT 2 /* do not capture conditional branches */ +#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */ +#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */ +#define LBR_RETURN_BIT 5 /* do not capture near returns */ +#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */ +#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */ +#define LBR_FAR_BIT 8 /* do not capture far branches */ +#define LBR_CALL_STACK_BIT 9 /* enable call stack */ + +/* + * Following bit only exists in Linux; we mask it out before writing it to + * the actual MSR. But it helps the constraint perf code to understand + * that this is a separate configuration. + */ +#define LBR_NO_INFO_BIT 63 /* don't read LBR_INFO. 
*/ + +#define LBR_KERNEL (1 << LBR_KERNEL_BIT) +#define LBR_USER (1 << LBR_USER_BIT) +#define LBR_JCC (1 << LBR_JCC_BIT) +#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT) +#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT) +#define LBR_RETURN (1 << LBR_RETURN_BIT) +#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT) +#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) +#define LBR_FAR (1 << LBR_FAR_BIT) +#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT) +#define LBR_NO_INFO (1ULL << LBR_NO_INFO_BIT) + +#define LBR_PLM (LBR_KERNEL | LBR_USER) + +#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */ +#define LBR_NOT_SUPP -1 /* LBR filter not supported */ +#define LBR_IGN 0 /* ignored */ + +#define LBR_ANY \ + (LBR_JCC |\ + LBR_REL_CALL |\ + LBR_IND_CALL |\ + LBR_RETURN |\ + LBR_REL_JMP |\ + LBR_IND_JMP |\ + LBR_FAR) + +#define LBR_FROM_FLAG_MISPRED (1ULL << 63) +#define LBR_FROM_FLAG_IN_TX (1ULL << 62) +#define LBR_FROM_FLAG_ABORT (1ULL << 61) + +/* + * x86control flow change classification + * x86control flow changes include branches, interrupts, traps, faults + */ +enum { + X86_BR_NONE = 0, /* unknown */ + + X86_BR_USER = 1 << 0, /* branch target is user */ + X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ + + X86_BR_CALL = 1 << 2, /* call */ + X86_BR_RET = 1 << 3, /* return */ + X86_BR_SYSCALL = 1 << 4, /* syscall */ + X86_BR_SYSRET = 1 << 5, /* syscall return */ + X86_BR_INT = 1 << 6, /* sw interrupt */ + X86_BR_IRET = 1 << 7, /* return from interrupt */ + X86_BR_JCC = 1 << 8, /* conditional */ + X86_BR_JMP = 1 << 9, /* jump */ + X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ + X86_BR_IND_CALL = 1 << 11,/* indirect calls */ + X86_BR_ABORT = 1 << 12,/* transaction abort */ + X86_BR_IN_TX = 1 << 13,/* in transaction */ + X86_BR_NO_TX = 1 << 14,/* not in transaction */ + X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ + X86_BR_CALL_STACK = 1 << 16,/* call stack */ + X86_BR_IND_JMP = 1 << 17,/* indirect jump */ +}; + +#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) +#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) + +#define X86_BR_ANY \ + (X86_BR_CALL |\ + X86_BR_RET |\ + X86_BR_SYSCALL |\ + X86_BR_SYSRET |\ + X86_BR_INT |\ + X86_BR_IRET |\ + X86_BR_JCC |\ + X86_BR_JMP |\ + X86_BR_IRQ |\ + X86_BR_ABORT |\ + X86_BR_IND_CALL |\ + X86_BR_IND_JMP |\ + X86_BR_ZERO_CALL) + +#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) + +#define X86_BR_ANY_CALL \ + (X86_BR_CALL |\ + X86_BR_IND_CALL |\ + X86_BR_ZERO_CALL |\ + X86_BR_SYSCALL |\ + X86_BR_IRQ |\ + X86_BR_INT) + +static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); + +/* + * We only support LBR implementations that have FREEZE_LBRS_ON_PMI + * otherwise it becomes near impossible to get a reliable stack. + */ + +static void __intel_pmu_lbr_enable(bool pmi) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 debugctl, lbr_select = 0, orig_debugctl; + + /* + * No need to unfreeze manually, as v4 can do that as part + * of the GLOBAL_STATUS ack. + */ + if (pmi && x86_pmu.version >= 4) + return; + + /* + * No need to reprogram LBR_SELECT in a PMI, as it + * did not change. + */ + if (cpuc->lbr_sel) + lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask; + if (!pmi && cpuc->lbr_sel) + wrmsrl(MSR_LBR_SELECT, lbr_select); + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + orig_debugctl = debugctl; + debugctl |= DEBUGCTLMSR_LBR; + /* + * LBR callstack does not work well with FREEZE_LBRS_ON_PMI. + * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions + * may cause superfluous increase/decrease of LBR_TOS. 
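
A standalone sketch (not part of the patch) of the masking done in __intel_pmu_lbr_enable() above: LBR_NO_INFO lives in bit 63 of the event config and only exists in Linux, so it is stripped by the lbr_sel_mask before the value reaches MSR_LBR_SELECT. The config value below is made up.

#include <stdio.h>
#include <stdint.h>

#define LBR_SEL_MASK    0x1ffULL
#define LBR_NO_INFO     (1ULL << 63)

int main(void)
{
        uint64_t config     = 0x0c5ULL | LBR_NO_INFO;   /* hypothetical filter */
        uint64_t lbr_select = config & LBR_SEL_MASK;    /* what gets written to the MSR */

        printf("event config: %#018llx\n", (unsigned long long)config);
        printf("MSR value:    %#018llx (software-only bit stripped)\n",
               (unsigned long long)lbr_select);
        return 0;
}
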
+ */ + if (!(lbr_select & LBR_CALL_STACK)) + debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + if (orig_debugctl != debugctl) + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); +} + +static void __intel_pmu_lbr_disable(void) +{ + u64 debugctl; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); +} + +static void intel_pmu_lbr_reset_32(void) +{ + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) + wrmsrl(x86_pmu.lbr_from + i, 0); +} + +static void intel_pmu_lbr_reset_64(void) +{ + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + wrmsrl(x86_pmu.lbr_from + i, 0); + wrmsrl(x86_pmu.lbr_to + i, 0); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + wrmsrl(MSR_LBR_INFO_0 + i, 0); + } +} + +void intel_pmu_lbr_reset(void) +{ + if (!x86_pmu.lbr_nr) + return; + + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) + intel_pmu_lbr_reset_32(); + else + intel_pmu_lbr_reset_64(); +} + +/* + * TOS = most recently recorded branch + */ +static inline u64 intel_pmu_lbr_tos(void) +{ + u64 tos; + + rdmsrl(x86_pmu.lbr_tos, tos); + return tos; +} + +enum { + LBR_NONE, + LBR_VALID, +}; + +static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) +{ + int i; + unsigned lbr_idx, mask; + u64 tos; + + if (task_ctx->lbr_callstack_users == 0 || + task_ctx->lbr_stack_state == LBR_NONE) { + intel_pmu_lbr_reset(); + return; + } + + mask = x86_pmu.lbr_nr - 1; + tos = task_ctx->tos; + for (i = 0; i < tos; i++) { + lbr_idx = (tos - i) & mask; + wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); + wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); + } + wrmsrl(x86_pmu.lbr_tos, tos); + task_ctx->lbr_stack_state = LBR_NONE; +} + +static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) +{ + int i; + unsigned lbr_idx, mask; + u64 tos; + + if (task_ctx->lbr_callstack_users == 0) { + task_ctx->lbr_stack_state = LBR_NONE; + return; + } + + mask = x86_pmu.lbr_nr - 1; + tos = intel_pmu_lbr_tos(); + for (i = 0; i < tos; i++) { + lbr_idx = (tos - i) & mask; + rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); + rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); + } + task_ctx->tos = tos; + task_ctx->lbr_stack_state = LBR_VALID; +} + +void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx; + + /* + * If LBR callstack feature is enabled and the stack was saved when + * the task was scheduled out, restore the stack. Otherwise flush + * the LBR stack. + */ + task_ctx = ctx ? ctx->task_ctx_data : NULL; + if (task_ctx) { + if (sched_in) { + __intel_pmu_lbr_restore(task_ctx); + cpuc->lbr_context = ctx; + } else { + __intel_pmu_lbr_save(task_ctx); + } + return; + } + + /* + * When sampling the branck stack in system-wide, it may be + * necessary to flush the stack on context switch. This happens + * when the branch stack does not tag its entries with the pid + * of the current task. Otherwise it becomes impossible to + * associate a branch entry with a task. 
This ambiguity is more + * likely to appear when the branch stack supports priv level + * filtering and the user sets it to monitor only at the user + * level (which could be a useful measurement in system-wide + * mode). In that case, the risk is high of having a branch + * stack with branch from multiple tasks. + */ + if (sched_in) { + intel_pmu_lbr_reset(); + cpuc->lbr_context = ctx; + } +} + +static inline bool branch_user_callstack(unsigned br_sel) +{ + return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK); +} + +void intel_pmu_lbr_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx; + + if (!x86_pmu.lbr_nr) + return; + + /* + * Reset the LBR stack if we changed task context to + * avoid data leaks. + */ + if (event->ctx->task && cpuc->lbr_context != event->ctx) { + intel_pmu_lbr_reset(); + cpuc->lbr_context = event->ctx; + } + cpuc->br_sel = event->hw.branch_reg.reg; + + if (branch_user_callstack(cpuc->br_sel) && event->ctx && + event->ctx->task_ctx_data) { + task_ctx = event->ctx->task_ctx_data; + task_ctx->lbr_callstack_users++; + } + + cpuc->lbr_users++; + perf_sched_cb_inc(event->ctx->pmu); +} + +void intel_pmu_lbr_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx; + + if (!x86_pmu.lbr_nr) + return; + + if (branch_user_callstack(cpuc->br_sel) && event->ctx && + event->ctx->task_ctx_data) { + task_ctx = event->ctx->task_ctx_data; + task_ctx->lbr_callstack_users--; + } + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + perf_sched_cb_dec(event->ctx->pmu); + + if (cpuc->enabled && !cpuc->lbr_users) { + __intel_pmu_lbr_disable(); + /* avoid stale pointer */ + cpuc->lbr_context = NULL; + } +} + +void intel_pmu_lbr_enable_all(bool pmi) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (cpuc->lbr_users) + __intel_pmu_lbr_enable(pmi); +} + +void intel_pmu_lbr_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (cpuc->lbr_users) + __intel_pmu_lbr_disable(); +} + +static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) +{ + unsigned long mask = x86_pmu.lbr_nr - 1; + u64 tos = intel_pmu_lbr_tos(); + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + unsigned long lbr_idx = (tos - i) & mask; + union { + struct { + u32 from; + u32 to; + }; + u64 lbr; + } msr_lastbranch; + + rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); + + cpuc->lbr_entries[i].from = msr_lastbranch.from; + cpuc->lbr_entries[i].to = msr_lastbranch.to; + cpuc->lbr_entries[i].mispred = 0; + cpuc->lbr_entries[i].predicted = 0; + cpuc->lbr_entries[i].reserved = 0; + } + cpuc->lbr_stack.nr = i; +} + +/* + * Due to lack of segmentation in Linux the effective address (offset) + * is the same as the linear address, allowing us to merge the LIP and EIP + * LBR formats. 
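
A standalone sketch (not part of the patch) of the 32-bit LBR read above: each LBR MSR packs the 32-bit 'from' and 'to' offsets into one 64-bit value, and entries are read newest-first by walking back from TOS modulo the stack depth. The values are made up; the union relies on the little-endian x86 layout just as the kernel code does.

#include <stdio.h>
#include <stdint.h>

union lbr32 {
        struct { uint32_t from, to; };  /* little-endian x86 layout */
        uint64_t lbr;
};

int main(void)
{
        union lbr32 stack[4] = {
                { .lbr = 0x0040100000401234ULL },
                { .lbr = 0x0040200000402234ULL },
                { .lbr = 0x0040300000403234ULL },
                { .lbr = 0x0040400000404234ULL },
        };
        unsigned long mask = 4 - 1, tos = 2;    /* slot 2 is the most recent */
        int i;

        for (i = 0; i < 4; i++) {
                unsigned long idx = (tos - i) & mask;

                printf("entry %d: from=%#010x to=%#010x (slot %lu)\n",
                       i, stack[idx].from, stack[idx].to, idx);
        }
        return 0;
}
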
+ */ +static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) +{ + bool need_info = false; + unsigned long mask = x86_pmu.lbr_nr - 1; + int lbr_format = x86_pmu.intel_cap.lbr_format; + u64 tos = intel_pmu_lbr_tos(); + int i; + int out = 0; + int num = x86_pmu.lbr_nr; + + if (cpuc->lbr_sel) { + need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO); + if (cpuc->lbr_sel->config & LBR_CALL_STACK) + num = tos; + } + + for (i = 0; i < num; i++) { + unsigned long lbr_idx = (tos - i) & mask; + u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0; + int skip = 0; + u16 cycles = 0; + int lbr_flags = lbr_desc[lbr_format]; + + rdmsrl(x86_pmu.lbr_from + lbr_idx, from); + rdmsrl(x86_pmu.lbr_to + lbr_idx, to); + + if (lbr_format == LBR_FORMAT_INFO && need_info) { + u64 info; + + rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); + mis = !!(info & LBR_INFO_MISPRED); + pred = !mis; + in_tx = !!(info & LBR_INFO_IN_TX); + abort = !!(info & LBR_INFO_ABORT); + cycles = (info & LBR_INFO_CYCLES); + } + if (lbr_flags & LBR_EIP_FLAGS) { + mis = !!(from & LBR_FROM_FLAG_MISPRED); + pred = !mis; + skip = 1; + } + if (lbr_flags & LBR_TSX) { + in_tx = !!(from & LBR_FROM_FLAG_IN_TX); + abort = !!(from & LBR_FROM_FLAG_ABORT); + skip = 3; + } + from = (u64)((((s64)from) << skip) >> skip); + + /* + * Some CPUs report duplicated abort records, + * with the second entry not having an abort bit set. + * Skip them here. This loop runs backwards, + * so we need to undo the previous record. + * If the abort just happened outside the window + * the extra entry cannot be removed. + */ + if (abort && x86_pmu.lbr_double_abort && out > 0) + out--; + + cpuc->lbr_entries[out].from = from; + cpuc->lbr_entries[out].to = to; + cpuc->lbr_entries[out].mispred = mis; + cpuc->lbr_entries[out].predicted = pred; + cpuc->lbr_entries[out].in_tx = in_tx; + cpuc->lbr_entries[out].abort = abort; + cpuc->lbr_entries[out].cycles = cycles; + cpuc->lbr_entries[out].reserved = 0; + out++; + } + cpuc->lbr_stack.nr = out; +} + +void intel_pmu_lbr_read(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!cpuc->lbr_users) + return; + + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) + intel_pmu_lbr_read_32(cpuc); + else + intel_pmu_lbr_read_64(cpuc); + + intel_pmu_lbr_filter(cpuc); +} + +/* + * SW filter is used: + * - in case there is no HW filter + * - in case the HW filter has errata or limitations + */ +static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) +{ + u64 br_type = event->attr.branch_sample_type; + int mask = 0; + + if (br_type & PERF_SAMPLE_BRANCH_USER) + mask |= X86_BR_USER; + + if (br_type & PERF_SAMPLE_BRANCH_KERNEL) + mask |= X86_BR_KERNEL; + + /* we ignore BRANCH_HV here */ + + if (br_type & PERF_SAMPLE_BRANCH_ANY) + mask |= X86_BR_ANY; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) + mask |= X86_BR_ANY_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) + mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; + + if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) + mask |= X86_BR_IND_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX) + mask |= X86_BR_ABORT; + + if (br_type & PERF_SAMPLE_BRANCH_IN_TX) + mask |= X86_BR_IN_TX; + + if (br_type & PERF_SAMPLE_BRANCH_NO_TX) + mask |= X86_BR_NO_TX; + + if (br_type & PERF_SAMPLE_BRANCH_COND) + mask |= X86_BR_JCC; + + if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) { + if (!x86_pmu_has_lbr_callstack()) + return -EOPNOTSUPP; + if (mask & ~(X86_BR_USER | X86_BR_KERNEL)) + return -EINVAL; + mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET | + X86_BR_CALL_STACK; + } + + if 
(br_type & PERF_SAMPLE_BRANCH_IND_JUMP) + mask |= X86_BR_IND_JMP; + + if (br_type & PERF_SAMPLE_BRANCH_CALL) + mask |= X86_BR_CALL | X86_BR_ZERO_CALL; + /* + * stash actual user request into reg, it may + * be used by fixup code for some CPU + */ + event->hw.branch_reg.reg = mask; + return 0; +} + +/* + * setup the HW LBR filter + * Used only when available, may not be enough to disambiguate + * all branches, may need the help of the SW filter + */ +static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) +{ + struct hw_perf_event_extra *reg; + u64 br_type = event->attr.branch_sample_type; + u64 mask = 0, v; + int i; + + for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { + if (!(br_type & (1ULL << i))) + continue; + + v = x86_pmu.lbr_sel_map[i]; + if (v == LBR_NOT_SUPP) + return -EOPNOTSUPP; + + if (v != LBR_IGN) + mask |= v; + } + + reg = &event->hw.branch_reg; + reg->idx = EXTRA_REG_LBR; + + /* + * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate + * in suppress mode. So LBR_SELECT should be set to + * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK) + */ + reg->config = mask ^ x86_pmu.lbr_sel_mask; + + if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) && + (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) && + (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)) + reg->config |= LBR_NO_INFO; + + return 0; +} + +int intel_pmu_setup_lbr_filter(struct perf_event *event) +{ + int ret = 0; + + /* + * no LBR on this PMU + */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + /* + * setup SW LBR filter + */ + ret = intel_pmu_setup_sw_lbr_filter(event); + if (ret) + return ret; + + /* + * setup HW LBR filter, if any + */ + if (x86_pmu.lbr_sel_map) + ret = intel_pmu_setup_hw_lbr_filter(event); + + return ret; +} + +/* + * return the type of control flow change at address "from" + * intruction is not necessarily a branch (in case of interrupt). + * + * The branch type returned also includes the priv level of the + * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). + * + * If a branch type is unknown OR the instruction cannot be + * decoded (e.g., text page not present), then X86_BR_NONE is + * returned. + */ +static int branch_type(unsigned long from, unsigned long to, int abort) +{ + struct insn insn; + void *addr; + int bytes_read, bytes_left; + int ret = X86_BR_NONE; + int ext, to_plm, from_plm; + u8 buf[MAX_INSN_SIZE]; + int is64 = 0; + + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; + from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; + + /* + * maybe zero if lbr did not fill up after a reset by the time + * we get a PMU interrupt + */ + if (from == 0 || to == 0) + return X86_BR_NONE; + + if (abort) + return X86_BR_ABORT | to_plm; + + if (from_plm == X86_BR_USER) { + /* + * can happen if measuring at the user level only + * and we interrupt in a kernel thread, e.g., idle. + */ + if (!current->mm) + return X86_BR_NONE; + + /* may fail if text not present */ + bytes_left = copy_from_user_nmi(buf, (void __user *)from, + MAX_INSN_SIZE); + bytes_read = MAX_INSN_SIZE - bytes_left; + if (!bytes_read) + return X86_BR_NONE; + + addr = buf; + } else { + /* + * The LBR logs any address in the IP, even if the IP just + * faulted. This means userspace can control the from address. + * Ensure we don't blindy read any address by validating it is + * a known text address. + */ + if (kernel_text_address(from)) { + addr = (void *)from; + /* + * Assume we can get the maximum possible size + * when grabbing kernel data. 
This is not + * _strictly_ true since we could possibly be + * executing up next to a memory hole, but + * it is very unlikely to be a problem. + */ + bytes_read = MAX_INSN_SIZE; + } else { + return X86_BR_NONE; + } + } + + /* + * decoder needs to know the ABI especially + * on 64-bit systems running 32-bit apps + */ +#ifdef CONFIG_X86_64 + is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); +#endif + insn_init(&insn, addr, bytes_read, is64); + insn_get_opcode(&insn); + if (!insn.opcode.got) + return X86_BR_ABORT; + + switch (insn.opcode.bytes[0]) { + case 0xf: + switch (insn.opcode.bytes[1]) { + case 0x05: /* syscall */ + case 0x34: /* sysenter */ + ret = X86_BR_SYSCALL; + break; + case 0x07: /* sysret */ + case 0x35: /* sysexit */ + ret = X86_BR_SYSRET; + break; + case 0x80 ... 0x8f: /* conditional */ + ret = X86_BR_JCC; + break; + default: + ret = X86_BR_NONE; + } + break; + case 0x70 ... 0x7f: /* conditional */ + ret = X86_BR_JCC; + break; + case 0xc2: /* near ret */ + case 0xc3: /* near ret */ + case 0xca: /* far ret */ + case 0xcb: /* far ret */ + ret = X86_BR_RET; + break; + case 0xcf: /* iret */ + ret = X86_BR_IRET; + break; + case 0xcc ... 0xce: /* int */ + ret = X86_BR_INT; + break; + case 0xe8: /* call near rel */ + insn_get_immediate(&insn); + if (insn.immediate1.value == 0) { + /* zero length call */ + ret = X86_BR_ZERO_CALL; + break; + } + case 0x9a: /* call far absolute */ + ret = X86_BR_CALL; + break; + case 0xe0 ... 0xe3: /* loop jmp */ + ret = X86_BR_JCC; + break; + case 0xe9 ... 0xeb: /* jmp */ + ret = X86_BR_JMP; + break; + case 0xff: /* call near absolute, call far absolute ind */ + insn_get_modrm(&insn); + ext = (insn.modrm.bytes[0] >> 3) & 0x7; + switch (ext) { + case 2: /* near ind call */ + case 3: /* far ind call */ + ret = X86_BR_IND_CALL; + break; + case 4: + case 5: + ret = X86_BR_IND_JMP; + break; + } + break; + default: + ret = X86_BR_NONE; + } + /* + * interrupts, traps, faults (and thus ring transition) may + * occur on any instructions. Thus, to classify them correctly, + * we need to first look at the from and to priv levels. If they + * are different and to is in the kernel, then it indicates + * a ring transition. If the from instruction is not a ring + * transition instr (syscall, systenter, int), then it means + * it was a irq, trap or fault. + * + * we have no way of detecting kernel to kernel faults. + */ + if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL + && ret != X86_BR_SYSCALL && ret != X86_BR_INT) + ret = X86_BR_IRQ; + + /* + * branch priv level determined by target as + * is done by HW when LBR_SELECT is implemented + */ + if (ret != X86_BR_NONE) + ret |= to_plm; + + return ret; +} + +/* + * implement actual branch filter based on user demand. + * Hardware may not exactly satisfy that request, thus + * we need to inspect opcodes. Mismatched branches are + * discarded. Therefore, the number of branches returned + * in PERF_SAMPLE_BRANCH_STACK sample may vary. 
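
A standalone sketch (not part of the patch) of the software filter check applied below in intel_pmu_lbr_filter(): an entry is kept only if every bit of its classified type is covered by the requested br_sel mask; otherwise it is zeroed and later compressed out. Only a few of the X86_BR_* bits defined earlier are reused here, and the example masks are made up.

#include <stdio.h>

#define X86_BR_USER     (1 << 0)
#define X86_BR_KERNEL   (1 << 1)
#define X86_BR_CALL     (1 << 2)
#define X86_BR_RET      (1 << 3)
#define X86_BR_JCC      (1 << 8)

static int keep(int br_sel, int type)
{
        /* discard if unclassified or not fully covered by the request */
        return type && (br_sel & type) == type;
}

int main(void)
{
        int br_sel = X86_BR_USER | X86_BR_CALL | X86_BR_RET;    /* user calls/returns only */

        printf("user call:  %s\n", keep(br_sel, X86_BR_USER | X86_BR_CALL) ? "keep" : "drop");
        printf("user jcc:   %s\n", keep(br_sel, X86_BR_USER | X86_BR_JCC) ? "keep" : "drop");
        printf("kernel ret: %s\n", keep(br_sel, X86_BR_KERNEL | X86_BR_RET) ? "keep" : "drop");
        return 0;
}
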
+ */ +static void +intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) +{ + u64 from, to; + int br_sel = cpuc->br_sel; + int i, j, type; + bool compress = false; + + /* if sampling all branches, then nothing to filter */ + if ((br_sel & X86_BR_ALL) == X86_BR_ALL) + return; + + for (i = 0; i < cpuc->lbr_stack.nr; i++) { + + from = cpuc->lbr_entries[i].from; + to = cpuc->lbr_entries[i].to; + + type = branch_type(from, to, cpuc->lbr_entries[i].abort); + if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) { + if (cpuc->lbr_entries[i].in_tx) + type |= X86_BR_IN_TX; + else + type |= X86_BR_NO_TX; + } + + /* if type does not correspond, then discard */ + if (type == X86_BR_NONE || (br_sel & type) != type) { + cpuc->lbr_entries[i].from = 0; + compress = true; + } + } + + if (!compress) + return; + + /* remove all entries with from=0 */ + for (i = 0; i < cpuc->lbr_stack.nr; ) { + if (!cpuc->lbr_entries[i].from) { + j = i; + while (++j < cpuc->lbr_stack.nr) + cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j]; + cpuc->lbr_stack.nr--; + if (!cpuc->lbr_entries[i].from) + continue; + } + i++; + } +} + +/* + * Map interface branch filters onto LBR filters + */ +static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP + | LBR_IND_JMP | LBR_FAR, + /* + * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches + */ + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = + LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, + /* + * NHM/WSM erratum: must include IND_JMP to capture IND_CALL + */ + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, +}; + +static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL + | LBR_FAR, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, +}; + +static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL + | LBR_FAR, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL + | LBR_RETURN | LBR_CALL_STACK, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, +}; + +/* core */ +void __init intel_pmu_lbr_init_core(void) +{ + x86_pmu.lbr_nr = 4; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = MSR_LBR_CORE_TO; + + /* + * SW branch filter usage: + * - compensate for lack of HW filter + */ + pr_cont("4-deep LBR, "); +} + +/* 
nehalem/westmere */ +void __init intel_pmu_lbr_init_nhm(void) +{ + x86_pmu.lbr_nr = 16; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = nhm_lbr_sel_map; + + /* + * SW branch filter usage: + * - workaround LBR_SEL errata (see above) + * - support syscall, sysret capture. + * That requires LBR_FAR but that means far + * jmp need to be filtered out + */ + pr_cont("16-deep LBR, "); +} + +/* sandy bridge */ +void __init intel_pmu_lbr_init_snb(void) +{ + x86_pmu.lbr_nr = 16; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = snb_lbr_sel_map; + + /* + * SW branch filter usage: + * - support syscall, sysret capture. + * That requires LBR_FAR but that means far + * jmp need to be filtered out + */ + pr_cont("16-deep LBR, "); +} + +/* haswell */ +void intel_pmu_lbr_init_hsw(void) +{ + x86_pmu.lbr_nr = 16; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + + pr_cont("16-deep LBR, "); +} + +/* skylake */ +__init void intel_pmu_lbr_init_skl(void) +{ + x86_pmu.lbr_nr = 32; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + + /* + * SW branch filter usage: + * - support syscall, sysret capture. + * That requires LBR_FAR but that means far + * jmp need to be filtered out + */ + pr_cont("32-deep LBR, "); +} + +/* atom */ +void __init intel_pmu_lbr_init_atom(void) +{ + /* + * only models starting at stepping 10 seems + * to have an operational LBR which can freeze + * on PMU interrupt + */ + if (boot_cpu_data.x86_model == 28 + && boot_cpu_data.x86_mask < 10) { + pr_cont("LBR disabled due to erratum"); + return; + } + + x86_pmu.lbr_nr = 8; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = MSR_LBR_CORE_TO; + + /* + * SW branch filter usage: + * - compensate for lack of HW filter + */ + pr_cont("8-deep LBR, "); +} + +/* Knights Landing */ +void intel_pmu_lbr_init_knl(void) +{ + x86_pmu.lbr_nr = 8; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = snb_lbr_sel_map; + + pr_cont("8-deep LBR, "); +} diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 035bdb6..d52ed9b 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -32,7 +32,6 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c deleted file mode 100644 index 653f88d..0000000 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ /dev/null @@ -1,1062 +0,0 @@ -#include -#include - -#include -#include -#include - -#include "perf_event.h" - -enum { - LBR_FORMAT_32 = 0x00, - LBR_FORMAT_LIP = 0x01, - LBR_FORMAT_EIP = 0x02, - LBR_FORMAT_EIP_FLAGS = 0x03, - LBR_FORMAT_EIP_FLAGS2 = 0x04, - 
LBR_FORMAT_INFO = 0x05, - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO, -}; - -static enum { - LBR_EIP_FLAGS = 1, - LBR_TSX = 2, -} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = { - [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS, - [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX, -}; - -/* - * Intel LBR_SELECT bits - * Intel Vol3a, April 2011, Section 16.7 Table 16-10 - * - * Hardware branch filter (not available on all CPUs) - */ -#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */ -#define LBR_USER_BIT 1 /* do not capture at ring > 0 */ -#define LBR_JCC_BIT 2 /* do not capture conditional branches */ -#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */ -#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */ -#define LBR_RETURN_BIT 5 /* do not capture near returns */ -#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */ -#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */ -#define LBR_FAR_BIT 8 /* do not capture far branches */ -#define LBR_CALL_STACK_BIT 9 /* enable call stack */ - -/* - * Following bit only exists in Linux; we mask it out before writing it to - * the actual MSR. But it helps the constraint perf code to understand - * that this is a separate configuration. - */ -#define LBR_NO_INFO_BIT 63 /* don't read LBR_INFO. */ - -#define LBR_KERNEL (1 << LBR_KERNEL_BIT) -#define LBR_USER (1 << LBR_USER_BIT) -#define LBR_JCC (1 << LBR_JCC_BIT) -#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT) -#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT) -#define LBR_RETURN (1 << LBR_RETURN_BIT) -#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT) -#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) -#define LBR_FAR (1 << LBR_FAR_BIT) -#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT) -#define LBR_NO_INFO (1ULL << LBR_NO_INFO_BIT) - -#define LBR_PLM (LBR_KERNEL | LBR_USER) - -#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */ -#define LBR_NOT_SUPP -1 /* LBR filter not supported */ -#define LBR_IGN 0 /* ignored */ - -#define LBR_ANY \ - (LBR_JCC |\ - LBR_REL_CALL |\ - LBR_IND_CALL |\ - LBR_RETURN |\ - LBR_REL_JMP |\ - LBR_IND_JMP |\ - LBR_FAR) - -#define LBR_FROM_FLAG_MISPRED (1ULL << 63) -#define LBR_FROM_FLAG_IN_TX (1ULL << 62) -#define LBR_FROM_FLAG_ABORT (1ULL << 61) - -/* - * x86control flow change classification - * x86control flow changes include branches, interrupts, traps, faults - */ -enum { - X86_BR_NONE = 0, /* unknown */ - - X86_BR_USER = 1 << 0, /* branch target is user */ - X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ - - X86_BR_CALL = 1 << 2, /* call */ - X86_BR_RET = 1 << 3, /* return */ - X86_BR_SYSCALL = 1 << 4, /* syscall */ - X86_BR_SYSRET = 1 << 5, /* syscall return */ - X86_BR_INT = 1 << 6, /* sw interrupt */ - X86_BR_IRET = 1 << 7, /* return from interrupt */ - X86_BR_JCC = 1 << 8, /* conditional */ - X86_BR_JMP = 1 << 9, /* jump */ - X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ - X86_BR_IND_CALL = 1 << 11,/* indirect calls */ - X86_BR_ABORT = 1 << 12,/* transaction abort */ - X86_BR_IN_TX = 1 << 13,/* in transaction */ - X86_BR_NO_TX = 1 << 14,/* not in transaction */ - X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ - X86_BR_CALL_STACK = 1 << 16,/* call stack */ - X86_BR_IND_JMP = 1 << 17,/* indirect jump */ -}; - -#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) -#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) - -#define X86_BR_ANY \ - (X86_BR_CALL |\ - X86_BR_RET |\ - X86_BR_SYSCALL |\ - X86_BR_SYSRET |\ - X86_BR_INT |\ - X86_BR_IRET |\ - X86_BR_JCC |\ - X86_BR_JMP |\ - X86_BR_IRQ |\ - X86_BR_ABORT |\ - X86_BR_IND_CALL |\ - X86_BR_IND_JMP |\ - 
X86_BR_ZERO_CALL) - -#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) - -#define X86_BR_ANY_CALL \ - (X86_BR_CALL |\ - X86_BR_IND_CALL |\ - X86_BR_ZERO_CALL |\ - X86_BR_SYSCALL |\ - X86_BR_IRQ |\ - X86_BR_INT) - -static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); - -/* - * We only support LBR implementations that have FREEZE_LBRS_ON_PMI - * otherwise it becomes near impossible to get a reliable stack. - */ - -static void __intel_pmu_lbr_enable(bool pmi) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - u64 debugctl, lbr_select = 0, orig_debugctl; - - /* - * No need to unfreeze manually, as v4 can do that as part - * of the GLOBAL_STATUS ack. - */ - if (pmi && x86_pmu.version >= 4) - return; - - /* - * No need to reprogram LBR_SELECT in a PMI, as it - * did not change. - */ - if (cpuc->lbr_sel) - lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask; - if (!pmi && cpuc->lbr_sel) - wrmsrl(MSR_LBR_SELECT, lbr_select); - - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - orig_debugctl = debugctl; - debugctl |= DEBUGCTLMSR_LBR; - /* - * LBR callstack does not work well with FREEZE_LBRS_ON_PMI. - * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions - * may cause superfluous increase/decrease of LBR_TOS. - */ - if (!(lbr_select & LBR_CALL_STACK)) - debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; - if (orig_debugctl != debugctl) - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); -} - -static void __intel_pmu_lbr_disable(void) -{ - u64 debugctl; - - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); -} - -static void intel_pmu_lbr_reset_32(void) -{ - int i; - - for (i = 0; i < x86_pmu.lbr_nr; i++) - wrmsrl(x86_pmu.lbr_from + i, 0); -} - -static void intel_pmu_lbr_reset_64(void) -{ - int i; - - for (i = 0; i < x86_pmu.lbr_nr; i++) { - wrmsrl(x86_pmu.lbr_from + i, 0); - wrmsrl(x86_pmu.lbr_to + i, 0); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + i, 0); - } -} - -void intel_pmu_lbr_reset(void) -{ - if (!x86_pmu.lbr_nr) - return; - - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_reset_32(); - else - intel_pmu_lbr_reset_64(); -} - -/* - * TOS = most recently recorded branch - */ -static inline u64 intel_pmu_lbr_tos(void) -{ - u64 tos; - - rdmsrl(x86_pmu.lbr_tos, tos); - return tos; -} - -enum { - LBR_NONE, - LBR_VALID, -}; - -static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) -{ - int i; - unsigned lbr_idx, mask; - u64 tos; - - if (task_ctx->lbr_callstack_users == 0 || - task_ctx->lbr_stack_state == LBR_NONE) { - intel_pmu_lbr_reset(); - return; - } - - mask = x86_pmu.lbr_nr - 1; - tos = task_ctx->tos; - for (i = 0; i < tos; i++) { - lbr_idx = (tos - i) & mask; - wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); - wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); - } - wrmsrl(x86_pmu.lbr_tos, tos); - task_ctx->lbr_stack_state = LBR_NONE; -} - -static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) -{ - int i; - unsigned lbr_idx, mask; - u64 tos; - - if (task_ctx->lbr_callstack_users == 0) { - task_ctx->lbr_stack_state = LBR_NONE; - return; - } - - mask = x86_pmu.lbr_nr - 1; - tos = intel_pmu_lbr_tos(); - for (i = 0; i < tos; i++) { - lbr_idx = (tos - i) & mask; - rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]); - rdmsrl(x86_pmu.lbr_to + lbr_idx, 
task_ctx->lbr_to[i]); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); - } - task_ctx->tos = tos; - task_ctx->lbr_stack_state = LBR_VALID; -} - -void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; - - /* - * If LBR callstack feature is enabled and the stack was saved when - * the task was scheduled out, restore the stack. Otherwise flush - * the LBR stack. - */ - task_ctx = ctx ? ctx->task_ctx_data : NULL; - if (task_ctx) { - if (sched_in) { - __intel_pmu_lbr_restore(task_ctx); - cpuc->lbr_context = ctx; - } else { - __intel_pmu_lbr_save(task_ctx); - } - return; - } - - /* - * When sampling the branck stack in system-wide, it may be - * necessary to flush the stack on context switch. This happens - * when the branch stack does not tag its entries with the pid - * of the current task. Otherwise it becomes impossible to - * associate a branch entry with a task. This ambiguity is more - * likely to appear when the branch stack supports priv level - * filtering and the user sets it to monitor only at the user - * level (which could be a useful measurement in system-wide - * mode). In that case, the risk is high of having a branch - * stack with branch from multiple tasks. - */ - if (sched_in) { - intel_pmu_lbr_reset(); - cpuc->lbr_context = ctx; - } -} - -static inline bool branch_user_callstack(unsigned br_sel) -{ - return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK); -} - -void intel_pmu_lbr_enable(struct perf_event *event) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; - - if (!x86_pmu.lbr_nr) - return; - - /* - * Reset the LBR stack if we changed task context to - * avoid data leaks. 
- */ - if (event->ctx->task && cpuc->lbr_context != event->ctx) { - intel_pmu_lbr_reset(); - cpuc->lbr_context = event->ctx; - } - cpuc->br_sel = event->hw.branch_reg.reg; - - if (branch_user_callstack(cpuc->br_sel) && event->ctx && - event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users++; - } - - cpuc->lbr_users++; - perf_sched_cb_inc(event->ctx->pmu); -} - -void intel_pmu_lbr_disable(struct perf_event *event) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; - - if (!x86_pmu.lbr_nr) - return; - - if (branch_user_callstack(cpuc->br_sel) && event->ctx && - event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users--; - } - - cpuc->lbr_users--; - WARN_ON_ONCE(cpuc->lbr_users < 0); - perf_sched_cb_dec(event->ctx->pmu); - - if (cpuc->enabled && !cpuc->lbr_users) { - __intel_pmu_lbr_disable(); - /* avoid stale pointer */ - cpuc->lbr_context = NULL; - } -} - -void intel_pmu_lbr_enable_all(bool pmi) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (cpuc->lbr_users) - __intel_pmu_lbr_enable(pmi); -} - -void intel_pmu_lbr_disable_all(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (cpuc->lbr_users) - __intel_pmu_lbr_disable(); -} - -static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) -{ - unsigned long mask = x86_pmu.lbr_nr - 1; - u64 tos = intel_pmu_lbr_tos(); - int i; - - for (i = 0; i < x86_pmu.lbr_nr; i++) { - unsigned long lbr_idx = (tos - i) & mask; - union { - struct { - u32 from; - u32 to; - }; - u64 lbr; - } msr_lastbranch; - - rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); - - cpuc->lbr_entries[i].from = msr_lastbranch.from; - cpuc->lbr_entries[i].to = msr_lastbranch.to; - cpuc->lbr_entries[i].mispred = 0; - cpuc->lbr_entries[i].predicted = 0; - cpuc->lbr_entries[i].reserved = 0; - } - cpuc->lbr_stack.nr = i; -} - -/* - * Due to lack of segmentation in Linux the effective address (offset) - * is the same as the linear address, allowing us to merge the LIP and EIP - * LBR formats. - */ -static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) -{ - bool need_info = false; - unsigned long mask = x86_pmu.lbr_nr - 1; - int lbr_format = x86_pmu.intel_cap.lbr_format; - u64 tos = intel_pmu_lbr_tos(); - int i; - int out = 0; - int num = x86_pmu.lbr_nr; - - if (cpuc->lbr_sel) { - need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO); - if (cpuc->lbr_sel->config & LBR_CALL_STACK) - num = tos; - } - - for (i = 0; i < num; i++) { - unsigned long lbr_idx = (tos - i) & mask; - u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0; - int skip = 0; - u16 cycles = 0; - int lbr_flags = lbr_desc[lbr_format]; - - rdmsrl(x86_pmu.lbr_from + lbr_idx, from); - rdmsrl(x86_pmu.lbr_to + lbr_idx, to); - - if (lbr_format == LBR_FORMAT_INFO && need_info) { - u64 info; - - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); - mis = !!(info & LBR_INFO_MISPRED); - pred = !mis; - in_tx = !!(info & LBR_INFO_IN_TX); - abort = !!(info & LBR_INFO_ABORT); - cycles = (info & LBR_INFO_CYCLES); - } - if (lbr_flags & LBR_EIP_FLAGS) { - mis = !!(from & LBR_FROM_FLAG_MISPRED); - pred = !mis; - skip = 1; - } - if (lbr_flags & LBR_TSX) { - in_tx = !!(from & LBR_FROM_FLAG_IN_TX); - abort = !!(from & LBR_FROM_FLAG_ABORT); - skip = 3; - } - from = (u64)((((s64)from) << skip) >> skip); - - /* - * Some CPUs report duplicated abort records, - * with the second entry not having an abort bit set. - * Skip them here. 
This loop runs backwards, - * so we need to undo the previous record. - * If the abort just happened outside the window - * the extra entry cannot be removed. - */ - if (abort && x86_pmu.lbr_double_abort && out > 0) - out--; - - cpuc->lbr_entries[out].from = from; - cpuc->lbr_entries[out].to = to; - cpuc->lbr_entries[out].mispred = mis; - cpuc->lbr_entries[out].predicted = pred; - cpuc->lbr_entries[out].in_tx = in_tx; - cpuc->lbr_entries[out].abort = abort; - cpuc->lbr_entries[out].cycles = cycles; - cpuc->lbr_entries[out].reserved = 0; - out++; - } - cpuc->lbr_stack.nr = out; -} - -void intel_pmu_lbr_read(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (!cpuc->lbr_users) - return; - - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_read_32(cpuc); - else - intel_pmu_lbr_read_64(cpuc); - - intel_pmu_lbr_filter(cpuc); -} - -/* - * SW filter is used: - * - in case there is no HW filter - * - in case the HW filter has errata or limitations - */ -static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) -{ - u64 br_type = event->attr.branch_sample_type; - int mask = 0; - - if (br_type & PERF_SAMPLE_BRANCH_USER) - mask |= X86_BR_USER; - - if (br_type & PERF_SAMPLE_BRANCH_KERNEL) - mask |= X86_BR_KERNEL; - - /* we ignore BRANCH_HV here */ - - if (br_type & PERF_SAMPLE_BRANCH_ANY) - mask |= X86_BR_ANY; - - if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) - mask |= X86_BR_ANY_CALL; - - if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) - mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; - - if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) - mask |= X86_BR_IND_CALL; - - if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX) - mask |= X86_BR_ABORT; - - if (br_type & PERF_SAMPLE_BRANCH_IN_TX) - mask |= X86_BR_IN_TX; - - if (br_type & PERF_SAMPLE_BRANCH_NO_TX) - mask |= X86_BR_NO_TX; - - if (br_type & PERF_SAMPLE_BRANCH_COND) - mask |= X86_BR_JCC; - - if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) { - if (!x86_pmu_has_lbr_callstack()) - return -EOPNOTSUPP; - if (mask & ~(X86_BR_USER | X86_BR_KERNEL)) - return -EINVAL; - mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET | - X86_BR_CALL_STACK; - } - - if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP) - mask |= X86_BR_IND_JMP; - - if (br_type & PERF_SAMPLE_BRANCH_CALL) - mask |= X86_BR_CALL | X86_BR_ZERO_CALL; - /* - * stash actual user request into reg, it may - * be used by fixup code for some CPU - */ - event->hw.branch_reg.reg = mask; - return 0; -} - -/* - * setup the HW LBR filter - * Used only when available, may not be enough to disambiguate - * all branches, may need the help of the SW filter - */ -static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) -{ - struct hw_perf_event_extra *reg; - u64 br_type = event->attr.branch_sample_type; - u64 mask = 0, v; - int i; - - for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { - if (!(br_type & (1ULL << i))) - continue; - - v = x86_pmu.lbr_sel_map[i]; - if (v == LBR_NOT_SUPP) - return -EOPNOTSUPP; - - if (v != LBR_IGN) - mask |= v; - } - - reg = &event->hw.branch_reg; - reg->idx = EXTRA_REG_LBR; - - /* - * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate - * in suppress mode. 
So LBR_SELECT should be set to - * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK) - */ - reg->config = mask ^ x86_pmu.lbr_sel_mask; - - if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) && - (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) && - (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)) - reg->config |= LBR_NO_INFO; - - return 0; -} - -int intel_pmu_setup_lbr_filter(struct perf_event *event) -{ - int ret = 0; - - /* - * no LBR on this PMU - */ - if (!x86_pmu.lbr_nr) - return -EOPNOTSUPP; - - /* - * setup SW LBR filter - */ - ret = intel_pmu_setup_sw_lbr_filter(event); - if (ret) - return ret; - - /* - * setup HW LBR filter, if any - */ - if (x86_pmu.lbr_sel_map) - ret = intel_pmu_setup_hw_lbr_filter(event); - - return ret; -} - -/* - * return the type of control flow change at address "from" - * intruction is not necessarily a branch (in case of interrupt). - * - * The branch type returned also includes the priv level of the - * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). - * - * If a branch type is unknown OR the instruction cannot be - * decoded (e.g., text page not present), then X86_BR_NONE is - * returned. - */ -static int branch_type(unsigned long from, unsigned long to, int abort) -{ - struct insn insn; - void *addr; - int bytes_read, bytes_left; - int ret = X86_BR_NONE; - int ext, to_plm, from_plm; - u8 buf[MAX_INSN_SIZE]; - int is64 = 0; - - to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; - from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; - - /* - * maybe zero if lbr did not fill up after a reset by the time - * we get a PMU interrupt - */ - if (from == 0 || to == 0) - return X86_BR_NONE; - - if (abort) - return X86_BR_ABORT | to_plm; - - if (from_plm == X86_BR_USER) { - /* - * can happen if measuring at the user level only - * and we interrupt in a kernel thread, e.g., idle. - */ - if (!current->mm) - return X86_BR_NONE; - - /* may fail if text not present */ - bytes_left = copy_from_user_nmi(buf, (void __user *)from, - MAX_INSN_SIZE); - bytes_read = MAX_INSN_SIZE - bytes_left; - if (!bytes_read) - return X86_BR_NONE; - - addr = buf; - } else { - /* - * The LBR logs any address in the IP, even if the IP just - * faulted. This means userspace can control the from address. - * Ensure we don't blindy read any address by validating it is - * a known text address. - */ - if (kernel_text_address(from)) { - addr = (void *)from; - /* - * Assume we can get the maximum possible size - * when grabbing kernel data. This is not - * _strictly_ true since we could possibly be - * executing up next to a memory hole, but - * it is very unlikely to be a problem. - */ - bytes_read = MAX_INSN_SIZE; - } else { - return X86_BR_NONE; - } - } - - /* - * decoder needs to know the ABI especially - * on 64-bit systems running 32-bit apps - */ -#ifdef CONFIG_X86_64 - is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); -#endif - insn_init(&insn, addr, bytes_read, is64); - insn_get_opcode(&insn); - if (!insn.opcode.got) - return X86_BR_ABORT; - - switch (insn.opcode.bytes[0]) { - case 0xf: - switch (insn.opcode.bytes[1]) { - case 0x05: /* syscall */ - case 0x34: /* sysenter */ - ret = X86_BR_SYSCALL; - break; - case 0x07: /* sysret */ - case 0x35: /* sysexit */ - ret = X86_BR_SYSRET; - break; - case 0x80 ... 0x8f: /* conditional */ - ret = X86_BR_JCC; - break; - default: - ret = X86_BR_NONE; - } - break; - case 0x70 ... 
0x7f: /* conditional */ - ret = X86_BR_JCC; - break; - case 0xc2: /* near ret */ - case 0xc3: /* near ret */ - case 0xca: /* far ret */ - case 0xcb: /* far ret */ - ret = X86_BR_RET; - break; - case 0xcf: /* iret */ - ret = X86_BR_IRET; - break; - case 0xcc ... 0xce: /* int */ - ret = X86_BR_INT; - break; - case 0xe8: /* call near rel */ - insn_get_immediate(&insn); - if (insn.immediate1.value == 0) { - /* zero length call */ - ret = X86_BR_ZERO_CALL; - break; - } - case 0x9a: /* call far absolute */ - ret = X86_BR_CALL; - break; - case 0xe0 ... 0xe3: /* loop jmp */ - ret = X86_BR_JCC; - break; - case 0xe9 ... 0xeb: /* jmp */ - ret = X86_BR_JMP; - break; - case 0xff: /* call near absolute, call far absolute ind */ - insn_get_modrm(&insn); - ext = (insn.modrm.bytes[0] >> 3) & 0x7; - switch (ext) { - case 2: /* near ind call */ - case 3: /* far ind call */ - ret = X86_BR_IND_CALL; - break; - case 4: - case 5: - ret = X86_BR_IND_JMP; - break; - } - break; - default: - ret = X86_BR_NONE; - } - /* - * interrupts, traps, faults (and thus ring transition) may - * occur on any instructions. Thus, to classify them correctly, - * we need to first look at the from and to priv levels. If they - * are different and to is in the kernel, then it indicates - * a ring transition. If the from instruction is not a ring - * transition instr (syscall, systenter, int), then it means - * it was a irq, trap or fault. - * - * we have no way of detecting kernel to kernel faults. - */ - if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL - && ret != X86_BR_SYSCALL && ret != X86_BR_INT) - ret = X86_BR_IRQ; - - /* - * branch priv level determined by target as - * is done by HW when LBR_SELECT is implemented - */ - if (ret != X86_BR_NONE) - ret |= to_plm; - - return ret; -} - -/* - * implement actual branch filter based on user demand. - * Hardware may not exactly satisfy that request, thus - * we need to inspect opcodes. Mismatched branches are - * discarded. Therefore, the number of branches returned - * in PERF_SAMPLE_BRANCH_STACK sample may vary. 
- */ -static void -intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) -{ - u64 from, to; - int br_sel = cpuc->br_sel; - int i, j, type; - bool compress = false; - - /* if sampling all branches, then nothing to filter */ - if ((br_sel & X86_BR_ALL) == X86_BR_ALL) - return; - - for (i = 0; i < cpuc->lbr_stack.nr; i++) { - - from = cpuc->lbr_entries[i].from; - to = cpuc->lbr_entries[i].to; - - type = branch_type(from, to, cpuc->lbr_entries[i].abort); - if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) { - if (cpuc->lbr_entries[i].in_tx) - type |= X86_BR_IN_TX; - else - type |= X86_BR_NO_TX; - } - - /* if type does not correspond, then discard */ - if (type == X86_BR_NONE || (br_sel & type) != type) { - cpuc->lbr_entries[i].from = 0; - compress = true; - } - } - - if (!compress) - return; - - /* remove all entries with from=0 */ - for (i = 0; i < cpuc->lbr_stack.nr; ) { - if (!cpuc->lbr_entries[i].from) { - j = i; - while (++j < cpuc->lbr_stack.nr) - cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j]; - cpuc->lbr_stack.nr--; - if (!cpuc->lbr_entries[i].from) - continue; - } - i++; - } -} - -/* - * Map interface branch filters onto LBR filters - */ -static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { - [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, - [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, - [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, - [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, - [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP - | LBR_IND_JMP | LBR_FAR, - /* - * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches - */ - [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = - LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, - /* - * NHM/WSM erratum: must include IND_JMP to capture IND_CALL - */ - [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP, - [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, - [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, -}; - -static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { - [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, - [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, - [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, - [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, - [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, - [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL - | LBR_FAR, - [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, - [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, - [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, - [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, -}; - -static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { - [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, - [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, - [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, - [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, - [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, - [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL - | LBR_FAR, - [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, - [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, - [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL - | LBR_RETURN | LBR_CALL_STACK, - [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, - [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, -}; - -/* core */ -void __init intel_pmu_lbr_init_core(void) -{ - x86_pmu.lbr_nr = 4; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_CORE_FROM; - x86_pmu.lbr_to = MSR_LBR_CORE_TO; - - /* - * SW branch filter usage: - * - compensate for lack of HW filter - */ - pr_cont("4-deep LBR, "); -} - -/* 
nehalem/westmere */ -void __init intel_pmu_lbr_init_nhm(void) -{ - x86_pmu.lbr_nr = 16; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_NHM_FROM; - x86_pmu.lbr_to = MSR_LBR_NHM_TO; - - x86_pmu.lbr_sel_mask = LBR_SEL_MASK; - x86_pmu.lbr_sel_map = nhm_lbr_sel_map; - - /* - * SW branch filter usage: - * - workaround LBR_SEL errata (see above) - * - support syscall, sysret capture. - * That requires LBR_FAR but that means far - * jmp need to be filtered out - */ - pr_cont("16-deep LBR, "); -} - -/* sandy bridge */ -void __init intel_pmu_lbr_init_snb(void) -{ - x86_pmu.lbr_nr = 16; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_NHM_FROM; - x86_pmu.lbr_to = MSR_LBR_NHM_TO; - - x86_pmu.lbr_sel_mask = LBR_SEL_MASK; - x86_pmu.lbr_sel_map = snb_lbr_sel_map; - - /* - * SW branch filter usage: - * - support syscall, sysret capture. - * That requires LBR_FAR but that means far - * jmp need to be filtered out - */ - pr_cont("16-deep LBR, "); -} - -/* haswell */ -void intel_pmu_lbr_init_hsw(void) -{ - x86_pmu.lbr_nr = 16; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_NHM_FROM; - x86_pmu.lbr_to = MSR_LBR_NHM_TO; - - x86_pmu.lbr_sel_mask = LBR_SEL_MASK; - x86_pmu.lbr_sel_map = hsw_lbr_sel_map; - - pr_cont("16-deep LBR, "); -} - -/* skylake */ -__init void intel_pmu_lbr_init_skl(void) -{ - x86_pmu.lbr_nr = 32; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_NHM_FROM; - x86_pmu.lbr_to = MSR_LBR_NHM_TO; - - x86_pmu.lbr_sel_mask = LBR_SEL_MASK; - x86_pmu.lbr_sel_map = hsw_lbr_sel_map; - - /* - * SW branch filter usage: - * - support syscall, sysret capture. - * That requires LBR_FAR but that means far - * jmp need to be filtered out - */ - pr_cont("32-deep LBR, "); -} - -/* atom */ -void __init intel_pmu_lbr_init_atom(void) -{ - /* - * only models starting at stepping 10 seems - * to have an operational LBR which can freeze - * on PMU interrupt - */ - if (boot_cpu_data.x86_model == 28 - && boot_cpu_data.x86_mask < 10) { - pr_cont("LBR disabled due to erratum"); - return; - } - - x86_pmu.lbr_nr = 8; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_CORE_FROM; - x86_pmu.lbr_to = MSR_LBR_CORE_TO; - - /* - * SW branch filter usage: - * - compensate for lack of HW filter - */ - pr_cont("8-deep LBR, "); -} - -/* Knights Landing */ -void intel_pmu_lbr_init_knl(void) -{ - x86_pmu.lbr_nr = 8; - x86_pmu.lbr_tos = MSR_LBR_TOS; - x86_pmu.lbr_from = MSR_LBR_NHM_FROM; - x86_pmu.lbr_to = MSR_LBR_NHM_TO; - - x86_pmu.lbr_sel_mask = LBR_SEL_MASK; - x86_pmu.lbr_sel_map = snb_lbr_sel_map; - - pr_cont("8-deep LBR, "); -} -- cgit v0.10.2 From fd1c601c25785ef38d698ff0091b5fe253074715 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:13 +0100 Subject: perf/x86: Move perf_event_intel_pt.[ch] ...... 
=> x86/events/intel/pt.[ch] Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 48097fd..2a2f55d 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/lbr.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/pt.o diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c new file mode 100644 index 0000000..e56cebe --- /dev/null +++ b/arch/x86/events/intel/pt.c @@ -0,0 +1,1188 @@ +/* + * Intel(R) Processor Trace PMU driver for perf + * Copyright (c) 2013-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * Intel PT is specified in the Intel Architecture Instruction Set Extensions + * Programming Reference: + * http://software.intel.com/en-us/intel-isa-extensions + */ + +#undef DEBUG + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include + +#include +#include +#include +#include + +#include "../../kernel/cpu/perf_event.h" +#include "pt.h" + +static DEFINE_PER_CPU(struct pt, pt_ctx); + +static struct pt_pmu pt_pmu; + +enum cpuid_regs { + CR_EAX = 0, + CR_ECX, + CR_EDX, + CR_EBX +}; + +/* + * Capabilities of Intel PT hardware, such as number of address bits or + * supported output schemes, are cached and exported to userspace as "caps" + * attribute group of pt pmu device + * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store + * relevant bits together with intel_pt traces. + * + * These are necessary for both trace decoding (payloads_lip, contains address + * width encoded in IP-related packets), and event configuration (bitmasks with + * permitted values for certain bit fields). 
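A stand-alone sketch of how the cached CPUID words described above are turned into capability values, mirroring the pt_cap_get() helper defined just below: each capability is a (leaf, register, mask) triple, and the value is the masked field shifted down to bit 0. The register ordering and the cached words here are simplified, made-up samples, not the driver's CR_* layout.

#include <stdio.h>
#include <stdint.h>

#define REGS_PER_LEAF 4	/* one slot each for EAX, EBX, ECX, EDX */

struct cap_desc {
	const char *name;
	uint32_t leaf;
	uint8_t reg;	/* 0=EAX, 1=EBX, 2=ECX, 3=EDX in this sketch */
	uint32_t mask;
};

static uint32_t cap_get(const uint32_t *caps, const struct cap_desc *cd)
{
	uint32_t c = caps[cd->leaf * REGS_PER_LEAF + cd->reg];
	unsigned int shift = __builtin_ctz(cd->mask);	/* plays the role of __ffs() */

	return (c & cd->mask) >> shift;
}

int main(void)
{
	/* pretend these were filled in from CPUID leaf 20, subleaves 0 and 1 */
	uint32_t caps[2 * REGS_PER_LEAF] = {
		[0] = 0x1,		/* leaf 0, EAX */
		[2] = 0x7,		/* leaf 0, ECX */
		[4] = 0x00030000,	/* leaf 1, EAX */
	};
	struct cap_desc topa_multi  = { "topa_multiple_entries", 0, 2, 1u << 1 };
	struct cap_desc mtc_periods = { "mtc_periods", 1, 0, 0xffff0000 };

	printf("%s = %u\n", topa_multi.name, cap_get(caps, &topa_multi));	/* 1 */
	printf("%s = %u\n", mtc_periods.name, cap_get(caps, &mtc_periods));	/* 3 */
	return 0;
}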
+ */ +#define PT_CAP(_n, _l, _r, _m) \ + [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l, \ + .reg = _r, .mask = _m } + +static struct pt_cap_desc { + const char *name; + u32 leaf; + u8 reg; + u32 mask; +} pt_caps[] = { + PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff), + PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)), + PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)), + PT_CAP(mtc, 0, CR_EBX, BIT(3)), + PT_CAP(topa_output, 0, CR_ECX, BIT(0)), + PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), + PT_CAP(single_range_output, 0, CR_ECX, BIT(2)), + PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)), + PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000), + PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff), + PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000), +}; + +static u32 pt_cap_get(enum pt_capabilities cap) +{ + struct pt_cap_desc *cd = &pt_caps[cap]; + u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; + unsigned int shift = __ffs(cd->mask); + + return (c & cd->mask) >> shift; +} + +static ssize_t pt_cap_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + struct dev_ext_attribute *ea = + container_of(attr, struct dev_ext_attribute, attr); + enum pt_capabilities cap = (long)ea->var; + + return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap)); +} + +static struct attribute_group pt_cap_group = { + .name = "caps", +}; + +PMU_FORMAT_ATTR(cyc, "config:1" ); +PMU_FORMAT_ATTR(mtc, "config:9" ); +PMU_FORMAT_ATTR(tsc, "config:10" ); +PMU_FORMAT_ATTR(noretcomp, "config:11" ); +PMU_FORMAT_ATTR(mtc_period, "config:14-17" ); +PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" ); +PMU_FORMAT_ATTR(psb_period, "config:24-27" ); + +static struct attribute *pt_formats_attr[] = { + &format_attr_cyc.attr, + &format_attr_mtc.attr, + &format_attr_tsc.attr, + &format_attr_noretcomp.attr, + &format_attr_mtc_period.attr, + &format_attr_cyc_thresh.attr, + &format_attr_psb_period.attr, + NULL, +}; + +static struct attribute_group pt_format_group = { + .name = "format", + .attrs = pt_formats_attr, +}; + +static const struct attribute_group *pt_attr_groups[] = { + &pt_cap_group, + &pt_format_group, + NULL, +}; + +static int __init pt_pmu_hw_init(void) +{ + struct dev_ext_attribute *de_attrs; + struct attribute **attrs; + size_t size; + int ret; + long i; + + attrs = NULL; + + for (i = 0; i < PT_CPUID_LEAVES; i++) { + cpuid_count(20, i, + &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]); + } + + ret = -ENOMEM; + size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1); + attrs = kzalloc(size, GFP_KERNEL); + if (!attrs) + goto fail; + + size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1); + de_attrs = kzalloc(size, GFP_KERNEL); + if (!de_attrs) + goto fail; + + for (i = 0; i < ARRAY_SIZE(pt_caps); i++) { + struct dev_ext_attribute *de_attr = de_attrs + i; + + de_attr->attr.attr.name = pt_caps[i].name; + + sysfs_attr_init(&de_attr->attr.attr); + + de_attr->attr.attr.mode = S_IRUGO; + de_attr->attr.show = pt_cap_show; + de_attr->var = (void *)i; + + attrs[i] = &de_attr->attr.attr; + } + + pt_cap_group.attrs = attrs; + + return 0; + +fail: + kfree(attrs); + + return ret; +} + +#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC | \ + RTIT_CTL_CYC_THRESH | \ + RTIT_CTL_PSB_FREQ) + +#define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \ + RTIT_CTL_MTC_RANGE) + +#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \ + RTIT_CTL_DISRETC | \ + RTIT_CTL_CYC_PSB | \ + RTIT_CTL_MTC) + +static bool pt_event_valid(struct perf_event 
*event) +{ + u64 config = event->attr.config; + u64 allowed, requested; + + if ((config & PT_CONFIG_MASK) != config) + return false; + + if (config & RTIT_CTL_CYC_PSB) { + if (!pt_cap_get(PT_CAP_psb_cyc)) + return false; + + allowed = pt_cap_get(PT_CAP_psb_periods); + requested = (config & RTIT_CTL_PSB_FREQ) >> + RTIT_CTL_PSB_FREQ_OFFSET; + if (requested && (!(allowed & BIT(requested)))) + return false; + + allowed = pt_cap_get(PT_CAP_cycle_thresholds); + requested = (config & RTIT_CTL_CYC_THRESH) >> + RTIT_CTL_CYC_THRESH_OFFSET; + if (requested && (!(allowed & BIT(requested)))) + return false; + } + + if (config & RTIT_CTL_MTC) { + /* + * In the unlikely case that CPUID lists valid mtc periods, + * but not the mtc capability, drop out here. + * + * Spec says that setting mtc period bits while mtc bit in + * CPUID is 0 will #GP, so better safe than sorry. + */ + if (!pt_cap_get(PT_CAP_mtc)) + return false; + + allowed = pt_cap_get(PT_CAP_mtc_periods); + if (!allowed) + return false; + + requested = (config & RTIT_CTL_MTC_RANGE) >> + RTIT_CTL_MTC_RANGE_OFFSET; + + if (!(allowed & BIT(requested))) + return false; + } + + return true; +} + +/* + * PT configuration helpers + * These all are cpu affine and operate on a local PT + */ + +static void pt_config(struct perf_event *event) +{ + u64 reg; + + if (!event->hw.itrace_started) { + event->hw.itrace_started = 1; + wrmsrl(MSR_IA32_RTIT_STATUS, 0); + } + + reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; + + if (!event->attr.exclude_kernel) + reg |= RTIT_CTL_OS; + if (!event->attr.exclude_user) + reg |= RTIT_CTL_USR; + + reg |= (event->attr.config & PT_CONFIG_MASK); + + wrmsrl(MSR_IA32_RTIT_CTL, reg); +} + +static void pt_config_start(bool start) +{ + u64 ctl; + + rdmsrl(MSR_IA32_RTIT_CTL, ctl); + if (start) + ctl |= RTIT_CTL_TRACEEN; + else + ctl &= ~RTIT_CTL_TRACEEN; + wrmsrl(MSR_IA32_RTIT_CTL, ctl); + + /* + * A wrmsr that disables trace generation serializes other PT + * registers and causes all data packets to be written to memory, + * but a fence is required for the data to become globally visible. + * + * The below WMB, separating data store and aux_head store matches + * the consumer's RMB that separates aux_head load and data load. + */ + if (!start) + wmb(); +} + +static void pt_config_buffer(void *buf, unsigned int topa_idx, + unsigned int output_off) +{ + u64 reg; + + wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf)); + + reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32); + + wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg); +} + +/* + * Keep ToPA table-related metadata on the same page as the actual table, + * taking up a few words from the top + */ + +#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1) + +/** + * struct topa - page-sized ToPA table with metadata at the top + * @table: actual ToPA table entries, as understood by PT hardware + * @list: linkage to struct pt_buffer's list of tables + * @phys: physical address of this page + * @offset: offset of the first entry in this table in the buffer + * @size: total size of all entries in this table + * @last: index of the last initialized entry in this table + */ +struct topa { + struct topa_entry table[TENTS_PER_PAGE]; + struct list_head list; + u64 phys; + u64 offset; + size_t size; + int last; +}; + +/* make -1 stand for the last table entry */ +#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)]) + +/** + * topa_alloc() - allocate page-sized ToPA table + * @cpu: CPU on which to allocate. 
+ * @gfp: Allocation flags. + * + * Return: On success, return the pointer to ToPA table page. + */ +static struct topa *topa_alloc(int cpu, gfp_t gfp) +{ + int node = cpu_to_node(cpu); + struct topa *topa; + struct page *p; + + p = alloc_pages_node(node, gfp | __GFP_ZERO, 0); + if (!p) + return NULL; + + topa = page_address(p); + topa->last = 0; + topa->phys = page_to_phys(p); + + /* + * In case of singe-entry ToPA, always put the self-referencing END + * link as the 2nd entry in the table + */ + if (!pt_cap_get(PT_CAP_topa_multiple_entries)) { + TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT; + TOPA_ENTRY(topa, 1)->end = 1; + } + + return topa; +} + +/** + * topa_free() - free a page-sized ToPA table + * @topa: Table to deallocate. + */ +static void topa_free(struct topa *topa) +{ + free_page((unsigned long)topa); +} + +/** + * topa_insert_table() - insert a ToPA table into a buffer + * @buf: PT buffer that's being extended. + * @topa: New topa table to be inserted. + * + * If it's the first table in this buffer, set up buffer's pointers + * accordingly; otherwise, add a END=1 link entry to @topa to the current + * "last" table and adjust the last table pointer to @topa. + */ +static void topa_insert_table(struct pt_buffer *buf, struct topa *topa) +{ + struct topa *last = buf->last; + + list_add_tail(&topa->list, &buf->tables); + + if (!buf->first) { + buf->first = buf->last = buf->cur = topa; + return; + } + + topa->offset = last->offset + last->size; + buf->last = topa; + + if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + return; + + BUG_ON(last->last != TENTS_PER_PAGE - 1); + + TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT; + TOPA_ENTRY(last, -1)->end = 1; +} + +/** + * topa_table_full() - check if a ToPA table is filled up + * @topa: ToPA table. + */ +static bool topa_table_full(struct topa *topa) +{ + /* single-entry ToPA is a special case */ + if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + return !!topa->last; + + return topa->last == TENTS_PER_PAGE - 1; +} + +/** + * topa_insert_pages() - create a list of ToPA tables + * @buf: PT buffer being initialized. + * @gfp: Allocation flags. + * + * This initializes a list of ToPA tables with entries from + * the data_pages provided by rb_alloc_aux(). + * + * Return: 0 on success or error code. + */ +static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp) +{ + struct topa *topa = buf->last; + int order = 0; + struct page *p; + + p = virt_to_page(buf->data_pages[buf->nr_pages]); + if (PagePrivate(p)) + order = page_private(p); + + if (topa_table_full(topa)) { + topa = topa_alloc(buf->cpu, gfp); + if (!topa) + return -ENOMEM; + + topa_insert_table(buf, topa); + } + + TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT; + TOPA_ENTRY(topa, -1)->size = order; + if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) { + TOPA_ENTRY(topa, -1)->intr = 1; + TOPA_ENTRY(topa, -1)->stop = 1; + } + + topa->last++; + topa->size += sizes(order); + + buf->nr_pages += 1ul << order; + + return 0; +} + +/** + * pt_topa_dump() - print ToPA tables and their entries + * @buf: PT buffer. 
+ */ +static void pt_topa_dump(struct pt_buffer *buf) +{ + struct topa *topa; + + list_for_each_entry(topa, &buf->tables, list) { + int i; + + pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table, + topa->phys, topa->offset, topa->size); + for (i = 0; i < TENTS_PER_PAGE; i++) { + pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n", + &topa->table[i], + (unsigned long)topa->table[i].base << TOPA_SHIFT, + sizes(topa->table[i].size), + topa->table[i].end ? 'E' : ' ', + topa->table[i].intr ? 'I' : ' ', + topa->table[i].stop ? 'S' : ' ', + *(u64 *)&topa->table[i]); + if ((pt_cap_get(PT_CAP_topa_multiple_entries) && + topa->table[i].stop) || + topa->table[i].end) + break; + } + } +} + +/** + * pt_buffer_advance() - advance to the next output region + * @buf: PT buffer. + * + * Advance the current pointers in the buffer to the next ToPA entry. + */ +static void pt_buffer_advance(struct pt_buffer *buf) +{ + buf->output_off = 0; + buf->cur_idx++; + + if (buf->cur_idx == buf->cur->last) { + if (buf->cur == buf->last) + buf->cur = buf->first; + else + buf->cur = list_entry(buf->cur->list.next, struct topa, + list); + buf->cur_idx = 0; + } +} + +/** + * pt_update_head() - calculate current offsets and sizes + * @pt: Per-cpu pt context. + * + * Update buffer's current write pointer position and data size. + */ +static void pt_update_head(struct pt *pt) +{ + struct pt_buffer *buf = perf_get_aux(&pt->handle); + u64 topa_idx, base, old; + + /* offset of the first region in this table from the beginning of buf */ + base = buf->cur->offset + buf->output_off; + + /* offset of the current output region within this table */ + for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++) + base += sizes(buf->cur->table[topa_idx].size); + + if (buf->snapshot) { + local_set(&buf->data_size, base); + } else { + old = (local64_xchg(&buf->head, base) & + ((buf->nr_pages << PAGE_SHIFT) - 1)); + if (base < old) + base += buf->nr_pages << PAGE_SHIFT; + + local_add(base - old, &buf->data_size); + } +} + +/** + * pt_buffer_region() - obtain current output region's address + * @buf: PT buffer. + */ +static void *pt_buffer_region(struct pt_buffer *buf) +{ + return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT); +} + +/** + * pt_buffer_region_size() - obtain current output region's size + * @buf: PT buffer. + */ +static size_t pt_buffer_region_size(struct pt_buffer *buf) +{ + return sizes(buf->cur->table[buf->cur_idx].size); +} + +/** + * pt_handle_status() - take care of possible status conditions + * @pt: Per-cpu pt context. + */ +static void pt_handle_status(struct pt *pt) +{ + struct pt_buffer *buf = perf_get_aux(&pt->handle); + int advance = 0; + u64 status; + + rdmsrl(MSR_IA32_RTIT_STATUS, status); + + if (status & RTIT_STATUS_ERROR) { + pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n"); + pt_topa_dump(buf); + status &= ~RTIT_STATUS_ERROR; + } + + if (status & RTIT_STATUS_STOPPED) { + status &= ~RTIT_STATUS_STOPPED; + + /* + * On systems that only do single-entry ToPA, hitting STOP + * means we are already losing data; need to let the decoder + * know. + */ + if (!pt_cap_get(PT_CAP_topa_multiple_entries) || + buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { + local_inc(&buf->lost); + advance++; + } + } + + /* + * Also on single-entry ToPA implementations, interrupt will come + * before the output reaches its output region's boundary. 
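A minimal sketch of the wrap-around arithmetic used by pt_update_head() above: the buffer spans a power-of-two number of bytes, old and new write positions are taken modulo that size, and if the new position is numerically smaller the head has wrapped, so one full buffer length is added before taking the difference. Sizes and offsets are made-up sample values.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t bytes_written(uint64_t old, uint64_t base, uint64_t buf_size)
{
	old  &= buf_size - 1;	/* buf_size must be a power of two */
	base &= buf_size - 1;

	if (base < old)		/* wrapped past the end of the buffer */
		base += buf_size;

	return base - old;
}

int main(void)
{
	uint64_t size = 16 * 4096;	/* 16 pages */

	assert(bytes_written(0x1000, 0x3000, size) == 0x2000);	/* no wrap */
	assert(bytes_written(0xf000, 0x0800, size) == 0x1800);	/* wrapped */
	printf("ok\n");
	return 0;
}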
+ */ + if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot && + pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) { + void *head = pt_buffer_region(buf); + + /* everything within this margin needs to be zeroed out */ + memset(head + buf->output_off, 0, + pt_buffer_region_size(buf) - + buf->output_off); + advance++; + } + + if (advance) + pt_buffer_advance(buf); + + wrmsrl(MSR_IA32_RTIT_STATUS, status); +} + +/** + * pt_read_offset() - translate registers into buffer pointers + * @buf: PT buffer. + * + * Set buffer's output pointers from MSR values. + */ +static void pt_read_offset(struct pt_buffer *buf) +{ + u64 offset, base_topa; + + rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa); + buf->cur = phys_to_virt(base_topa); + + rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset); + /* offset within current output region */ + buf->output_off = offset >> 32; + /* index of current output region within this table */ + buf->cur_idx = (offset & 0xffffff80) >> 7; +} + +/** + * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry + * @buf: PT buffer. + * @pg: Page offset in the buffer. + * + * When advancing to the next output region (ToPA entry), given a page offset + * into the buffer, we need to find the offset of the first page in the next + * region. + */ +static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg) +{ + struct topa_entry *te = buf->topa_index[pg]; + + /* one region */ + if (buf->first == buf->last && buf->first->last == 1) + return pg; + + do { + pg++; + pg &= buf->nr_pages - 1; + } while (buf->topa_index[pg] == te); + + return pg; +} + +/** + * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer + * @buf: PT buffer. + * @handle: Current output handle. + * + * Place INT and STOP marks to prevent overwriting old data that the consumer + * hasn't yet collected and waking up the consumer after a certain fraction of + * the buffer has filled up. Only needed and sensible for non-snapshot counters. + * + * This obviously relies on buf::head to figure out buffer markers, so it has + * to be called after pt_buffer_reset_offsets() and before the hardware tracing + * is enabled. 
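A self-contained sketch of the output-mask layout used by pt_config_buffer() and pt_read_offset() above: bits 6:0 are set, bits 31:7 hold the current ToPA table index, and bits 63:32 hold the offset within the current output region. The test values are arbitrary.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t pack_output_mask(uint32_t topa_idx, uint32_t output_off)
{
	return 0x7f | ((uint64_t)topa_idx << 7) | ((uint64_t)output_off << 32);
}

static void unpack_output_mask(uint64_t reg, uint32_t *topa_idx,
			       uint32_t *output_off)
{
	*output_off = reg >> 32;
	*topa_idx   = (reg & 0xffffff80) >> 7;
}

int main(void)
{
	uint32_t idx, off;
	uint64_t reg = pack_output_mask(5, 0x1a40);

	unpack_output_mask(reg, &idx, &off);
	assert(idx == 5 && off == 0x1a40);
	printf("reg=%#llx idx=%u off=%#x\n", (unsigned long long)reg, idx, off);
	return 0;
}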
+ */ +static int pt_buffer_reset_markers(struct pt_buffer *buf, + struct perf_output_handle *handle) + +{ + unsigned long head = local64_read(&buf->head); + unsigned long idx, npages, wakeup; + + /* can't stop in the middle of an output region */ + if (buf->output_off + handle->size + 1 < + sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) + return -EINVAL; + + + /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */ + if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + return 0; + + /* clear STOP and INT from current entry */ + buf->topa_index[buf->stop_pos]->stop = 0; + buf->topa_index[buf->intr_pos]->intr = 0; + + /* how many pages till the STOP marker */ + npages = handle->size >> PAGE_SHIFT; + + /* if it's on a page boundary, fill up one more page */ + if (!offset_in_page(head + handle->size + 1)) + npages++; + + idx = (head >> PAGE_SHIFT) + npages; + idx &= buf->nr_pages - 1; + buf->stop_pos = idx; + + wakeup = handle->wakeup >> PAGE_SHIFT; + + /* in the worst case, wake up the consumer one page before hard stop */ + idx = (head >> PAGE_SHIFT) + npages - 1; + if (idx > wakeup) + idx = wakeup; + + idx &= buf->nr_pages - 1; + buf->intr_pos = idx; + + buf->topa_index[buf->stop_pos]->stop = 1; + buf->topa_index[buf->intr_pos]->intr = 1; + + return 0; +} + +/** + * pt_buffer_setup_topa_index() - build topa_index[] table of regions + * @buf: PT buffer. + * + * topa_index[] references output regions indexed by offset into the + * buffer for purposes of quick reverse lookup. + */ +static void pt_buffer_setup_topa_index(struct pt_buffer *buf) +{ + struct topa *cur = buf->first, *prev = buf->last; + struct topa_entry *te_cur = TOPA_ENTRY(cur, 0), + *te_prev = TOPA_ENTRY(prev, prev->last - 1); + int pg = 0, idx = 0; + + while (pg < buf->nr_pages) { + int tidx; + + /* pages within one topa entry */ + for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++) + buf->topa_index[pg] = te_prev; + + te_prev = te_cur; + + if (idx == cur->last - 1) { + /* advance to next topa table */ + idx = 0; + cur = list_entry(cur->list.next, struct topa, list); + } else { + idx++; + } + te_cur = TOPA_ENTRY(cur, idx); + } + +} + +/** + * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head + * @buf: PT buffer. + * @head: Write pointer (aux_head) from AUX buffer. + * + * Find the ToPA table and entry corresponding to given @head and set buffer's + * "current" pointers accordingly. This is done after we have obtained the + * current aux_head position from a successful call to perf_aux_output_begin() + * to make sure the hardware is writing to the right place. + * + * This function modifies buf::{cur,cur_idx,output_off} that will be programmed + * into PT msrs when the tracing is enabled and buf::head and buf::data_size, + * which are used to determine INT and STOP markers' locations by a subsequent + * call to pt_buffer_reset_markers(). 
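A worked, user-space version of the STOP/INT placement arithmetic in pt_buffer_reset_markers() above. The page size, buffer geometry and handle values are made-up sample numbers; the point is only how the stop and interrupt page indices are derived from the current head, the usable size and the wakeup watermark.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

int main(void)
{
	unsigned long nr_pages = 16;			/* power of two */
	unsigned long head = 0x5000;			/* current aux_head */
	unsigned long size = 0x3000;			/* bytes the consumer left us */
	unsigned long wakeup = 0x6000 >> PAGE_SHIFT;	/* wakeup page */
	unsigned long npages, idx, stop_pos, intr_pos;

	npages = size >> PAGE_SHIFT;
	/* if the stop point sits exactly on a page boundary, use one more page */
	if (((head + size + 1) & (PAGE_SIZE - 1)) == 0)
		npages++;

	idx = (head >> PAGE_SHIFT) + npages;
	stop_pos = idx & (nr_pages - 1);

	/* wake the consumer at most one page before the hard stop */
	idx = (head >> PAGE_SHIFT) + npages - 1;
	if (idx > wakeup)
		idx = wakeup;
	intr_pos = idx & (nr_pages - 1);

	printf("stop_pos=%lu intr_pos=%lu\n", stop_pos, intr_pos);	/* 8 and 6 */
	return 0;
}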
+ */ +static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head) +{ + int pg; + + if (buf->snapshot) + head &= (buf->nr_pages << PAGE_SHIFT) - 1; + + pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1); + pg = pt_topa_next_entry(buf, pg); + + buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK); + buf->cur_idx = ((unsigned long)buf->topa_index[pg] - + (unsigned long)buf->cur) / sizeof(struct topa_entry); + buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1); + + local64_set(&buf->head, head); + local_set(&buf->data_size, 0); +} + +/** + * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer + * @buf: PT buffer. + */ +static void pt_buffer_fini_topa(struct pt_buffer *buf) +{ + struct topa *topa, *iter; + + list_for_each_entry_safe(topa, iter, &buf->tables, list) { + /* + * right now, this is in free_aux() path only, so + * no need to unlink this table from the list + */ + topa_free(topa); + } +} + +/** + * pt_buffer_init_topa() - initialize ToPA table for pt buffer + * @buf: PT buffer. + * @size: Total size of all regions within this ToPA. + * @gfp: Allocation flags. + */ +static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages, + gfp_t gfp) +{ + struct topa *topa; + int err; + + topa = topa_alloc(buf->cpu, gfp); + if (!topa) + return -ENOMEM; + + topa_insert_table(buf, topa); + + while (buf->nr_pages < nr_pages) { + err = topa_insert_pages(buf, gfp); + if (err) { + pt_buffer_fini_topa(buf); + return -ENOMEM; + } + } + + pt_buffer_setup_topa_index(buf); + + /* link last table to the first one, unless we're double buffering */ + if (pt_cap_get(PT_CAP_topa_multiple_entries)) { + TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT; + TOPA_ENTRY(buf->last, -1)->end = 1; + } + + pt_topa_dump(buf); + return 0; +} + +/** + * pt_buffer_setup_aux() - set up topa tables for a PT buffer + * @cpu: Cpu on which to allocate, -1 means current. + * @pages: Array of pointers to buffer pages passed from perf core. + * @nr_pages: Number of pages in the buffer. + * @snapshot: If this is a snapshot/overwrite counter. + * + * This is a pmu::setup_aux callback that sets up ToPA tables and all the + * bookkeeping for an AUX buffer. + * + * Return: Our private PT buffer structure. + */ +static void * +pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot) +{ + struct pt_buffer *buf; + int node, ret; + + if (!nr_pages) + return NULL; + + if (cpu == -1) + cpu = raw_smp_processor_id(); + node = cpu_to_node(cpu); + + buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]), + GFP_KERNEL, node); + if (!buf) + return NULL; + + buf->cpu = cpu; + buf->snapshot = snapshot; + buf->data_pages = pages; + + INIT_LIST_HEAD(&buf->tables); + + ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL); + if (ret) { + kfree(buf); + return NULL; + } + + return buf; +} + +/** + * pt_buffer_free_aux() - perf AUX deallocation path callback + * @data: PT buffer. + */ +static void pt_buffer_free_aux(void *data) +{ + struct pt_buffer *buf = data; + + pt_buffer_fini_topa(buf); + kfree(buf); +} + +/** + * pt_buffer_is_full() - check if the buffer is full + * @buf: PT buffer. + * @pt: Per-cpu pt handle. + * + * If the user hasn't read data from the output region that aux_head + * points to, the buffer is considered full: the user needs to read at + * least this region and update aux_tail to point past it. 
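A sketch of the pointer arithmetic in pt_buffer_reset_offsets() above: because every ToPA table occupies exactly one page, the owning table of any entry pointer is recovered by masking off the low page bits, and the entry index is the remaining byte offset divided by the entry size. The 8-byte entry and 4 KiB page here are illustrative stand-ins.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

struct topa_entry { uint64_t raw; };	/* simplified 8-byte entry */

int main(void)
{
	/* a page-aligned "table", as topa_alloc() would hand back */
	struct topa_entry *table = aligned_alloc(PAGE_SIZE, PAGE_SIZE);
	struct topa_entry *entry = &table[37];	/* pretend topa_index[pg] points here */
	struct topa_entry *owner;
	size_t idx;

	owner = (struct topa_entry *)((uintptr_t)entry & PAGE_MASK);
	idx = ((uintptr_t)entry - (uintptr_t)owner) / sizeof(*entry);

	assert(owner == table && idx == 37);
	printf("entry %zu of table %p\n", idx, (void *)owner);

	free(table);
	return 0;
}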
+ */ +static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt) +{ + if (buf->snapshot) + return false; + + if (local_read(&buf->data_size) >= pt->handle.size) + return true; + + return false; +} + +/** + * intel_pt_interrupt() - PT PMI handler + */ +void intel_pt_interrupt(void) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct pt_buffer *buf; + struct perf_event *event = pt->handle.event; + + /* + * There may be a dangling PT bit in the interrupt status register + * after PT has been disabled by pt_event_stop(). Make sure we don't + * do anything (particularly, re-enable) for this event here. + */ + if (!ACCESS_ONCE(pt->handle_nmi)) + return; + + pt_config_start(false); + + if (!event) + return; + + buf = perf_get_aux(&pt->handle); + if (!buf) + return; + + pt_read_offset(buf); + + pt_handle_status(pt); + + pt_update_head(pt); + + perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), + local_xchg(&buf->lost, 0)); + + if (!event->hw.state) { + int ret; + + buf = perf_aux_output_begin(&pt->handle, event); + if (!buf) { + event->hw.state = PERF_HES_STOPPED; + return; + } + + pt_buffer_reset_offsets(buf, pt->handle.head); + /* snapshot counters don't use PMI, so it's safe */ + ret = pt_buffer_reset_markers(buf, &pt->handle); + if (ret) { + perf_aux_output_end(&pt->handle, 0, true); + return; + } + + pt_config_buffer(buf->cur->table, buf->cur_idx, + buf->output_off); + pt_config(event); + } +} + +/* + * PMU callbacks + */ + +static void pt_event_start(struct perf_event *event, int mode) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct pt_buffer *buf = perf_get_aux(&pt->handle); + + if (!buf || pt_buffer_is_full(buf, pt)) { + event->hw.state = PERF_HES_STOPPED; + return; + } + + ACCESS_ONCE(pt->handle_nmi) = 1; + event->hw.state = 0; + + pt_config_buffer(buf->cur->table, buf->cur_idx, + buf->output_off); + pt_config(event); +} + +static void pt_event_stop(struct perf_event *event, int mode) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + + /* + * Protect against the PMI racing with disabling wrmsr, + * see comment in intel_pt_interrupt(). 
+ */ + ACCESS_ONCE(pt->handle_nmi) = 0; + pt_config_start(false); + + if (event->hw.state == PERF_HES_STOPPED) + return; + + event->hw.state = PERF_HES_STOPPED; + + if (mode & PERF_EF_UPDATE) { + struct pt_buffer *buf = perf_get_aux(&pt->handle); + + if (!buf) + return; + + if (WARN_ON_ONCE(pt->handle.event != event)) + return; + + pt_read_offset(buf); + + pt_handle_status(pt); + + pt_update_head(pt); + } +} + +static void pt_event_del(struct perf_event *event, int mode) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct pt_buffer *buf; + + pt_event_stop(event, PERF_EF_UPDATE); + + buf = perf_get_aux(&pt->handle); + + if (buf) { + if (buf->snapshot) + pt->handle.head = + local_xchg(&buf->data_size, + buf->nr_pages << PAGE_SHIFT); + perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), + local_xchg(&buf->lost, 0)); + } +} + +static int pt_event_add(struct perf_event *event, int mode) +{ + struct pt_buffer *buf; + struct pt *pt = this_cpu_ptr(&pt_ctx); + struct hw_perf_event *hwc = &event->hw; + int ret = -EBUSY; + + if (pt->handle.event) + goto fail; + + buf = perf_aux_output_begin(&pt->handle, event); + ret = -EINVAL; + if (!buf) + goto fail_stop; + + pt_buffer_reset_offsets(buf, pt->handle.head); + if (!buf->snapshot) { + ret = pt_buffer_reset_markers(buf, &pt->handle); + if (ret) + goto fail_end_stop; + } + + if (mode & PERF_EF_START) { + pt_event_start(event, 0); + ret = -EBUSY; + if (hwc->state == PERF_HES_STOPPED) + goto fail_end_stop; + } else { + hwc->state = PERF_HES_STOPPED; + } + + return 0; + +fail_end_stop: + perf_aux_output_end(&pt->handle, 0, true); +fail_stop: + hwc->state = PERF_HES_STOPPED; +fail: + return ret; +} + +static void pt_event_read(struct perf_event *event) +{ +} + +static void pt_event_destroy(struct perf_event *event) +{ + x86_del_exclusive(x86_lbr_exclusive_pt); +} + +static int pt_event_init(struct perf_event *event) +{ + if (event->attr.type != pt_pmu.pmu.type) + return -ENOENT; + + if (!pt_event_valid(event)) + return -EINVAL; + + if (x86_add_exclusive(x86_lbr_exclusive_pt)) + return -EBUSY; + + event->destroy = pt_event_destroy; + + return 0; +} + +void cpu_emergency_stop_pt(void) +{ + struct pt *pt = this_cpu_ptr(&pt_ctx); + + if (pt->handle.event) + pt_event_stop(pt->handle.event, PERF_EF_UPDATE); +} + +static __init int pt_init(void) +{ + int ret, cpu, prior_warn = 0; + + BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE); + + if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) + return -ENODEV; + + get_online_cpus(); + for_each_online_cpu(cpu) { + u64 ctl; + + ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl); + if (!ret && (ctl & RTIT_CTL_TRACEEN)) + prior_warn++; + } + put_online_cpus(); + + if (prior_warn) { + x86_add_exclusive(x86_lbr_exclusive_pt); + pr_warn("PT is enabled at boot time, doing nothing\n"); + + return -EBUSY; + } + + ret = pt_pmu_hw_init(); + if (ret) + return ret; + + if (!pt_cap_get(PT_CAP_topa_output)) { + pr_warn("ToPA output is not supported on this CPU\n"); + return -ENODEV; + } + + if (!pt_cap_get(PT_CAP_topa_multiple_entries)) + pt_pmu.pmu.capabilities = + PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF; + + pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE; + pt_pmu.pmu.attr_groups = pt_attr_groups; + pt_pmu.pmu.task_ctx_nr = perf_sw_context; + pt_pmu.pmu.event_init = pt_event_init; + pt_pmu.pmu.add = pt_event_add; + pt_pmu.pmu.del = pt_event_del; + pt_pmu.pmu.start = pt_event_start; + pt_pmu.pmu.stop = pt_event_stop; + pt_pmu.pmu.read = pt_event_read; + pt_pmu.pmu.setup_aux = 
pt_buffer_setup_aux; + pt_pmu.pmu.free_aux = pt_buffer_free_aux; + ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1); + + return ret; +} +arch_initcall(pt_init); diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h new file mode 100644 index 0000000..336878a --- /dev/null +++ b/arch/x86/events/intel/pt.h @@ -0,0 +1,116 @@ +/* + * Intel(R) Processor Trace PMU driver for perf + * Copyright (c) 2013-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * Intel PT is specified in the Intel Architecture Instruction Set Extensions + * Programming Reference: + * http://software.intel.com/en-us/intel-isa-extensions + */ + +#ifndef __INTEL_PT_H__ +#define __INTEL_PT_H__ + +/* + * Single-entry ToPA: when this close to region boundary, switch + * buffers to avoid losing data. + */ +#define TOPA_PMI_MARGIN 512 + +#define TOPA_SHIFT 12 + +static inline unsigned int sizes(unsigned int tsz) +{ + return 1 << (tsz + TOPA_SHIFT); +}; + +struct topa_entry { + u64 end : 1; + u64 rsvd0 : 1; + u64 intr : 1; + u64 rsvd1 : 1; + u64 stop : 1; + u64 rsvd2 : 1; + u64 size : 4; + u64 rsvd3 : 2; + u64 base : 36; + u64 rsvd4 : 16; +}; + +#define PT_CPUID_LEAVES 2 +#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ + +enum pt_capabilities { + PT_CAP_max_subleaf = 0, + PT_CAP_cr3_filtering, + PT_CAP_psb_cyc, + PT_CAP_mtc, + PT_CAP_topa_output, + PT_CAP_topa_multiple_entries, + PT_CAP_single_range_output, + PT_CAP_payloads_lip, + PT_CAP_mtc_periods, + PT_CAP_cycle_thresholds, + PT_CAP_psb_periods, +}; + +struct pt_pmu { + struct pmu pmu; + u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; +}; + +/** + * struct pt_buffer - buffer configuration; one buffer per task_struct or + * cpu, depending on perf event configuration + * @cpu: cpu for per-cpu allocation + * @tables: list of ToPA tables in this buffer + * @first: shorthand for first topa table + * @last: shorthand for last topa table + * @cur: current topa table + * @nr_pages: buffer size in pages + * @cur_idx: current output region's index within @cur table + * @output_off: offset within the current output region + * @data_size: running total of the amount of data in this buffer + * @lost: if data was lost/truncated + * @head: logical write offset inside the buffer + * @snapshot: if this is for a snapshot/overwrite counter + * @stop_pos: STOP topa entry in the buffer + * @intr_pos: INT topa entry in the buffer + * @data_pages: array of pages from perf + * @topa_index: table of topa entries indexed by page offset + */ +struct pt_buffer { + int cpu; + struct list_head tables; + struct topa *first, *last, *cur; + unsigned int cur_idx; + size_t output_off; + unsigned long nr_pages; + local_t data_size; + local_t lost; + local64_t head; + bool snapshot; + unsigned long stop_pos, intr_pos; + void **data_pages; + struct topa_entry *topa_index[0]; +}; + +/** + * struct pt - per-cpu pt context + * @handle: perf output handle + * @handle_nmi: do handle PT PMI on this cpu, there's an active event + */ +struct pt { + struct perf_output_handle handle; + int handle_nmi; +}; + +#endif /* __INTEL_PT_H__ */ diff --git 
a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index d52ed9b..595e3ff 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,7 +33,6 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ perf_event_intel_uncore_snb.o \ diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h deleted file mode 100644 index 336878a..0000000 --- a/arch/x86/kernel/cpu/intel_pt.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Intel(R) Processor Trace PMU driver for perf - * Copyright (c) 2013-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * Intel PT is specified in the Intel Architecture Instruction Set Extensions - * Programming Reference: - * http://software.intel.com/en-us/intel-isa-extensions - */ - -#ifndef __INTEL_PT_H__ -#define __INTEL_PT_H__ - -/* - * Single-entry ToPA: when this close to region boundary, switch - * buffers to avoid losing data. - */ -#define TOPA_PMI_MARGIN 512 - -#define TOPA_SHIFT 12 - -static inline unsigned int sizes(unsigned int tsz) -{ - return 1 << (tsz + TOPA_SHIFT); -}; - -struct topa_entry { - u64 end : 1; - u64 rsvd0 : 1; - u64 intr : 1; - u64 rsvd1 : 1; - u64 stop : 1; - u64 rsvd2 : 1; - u64 size : 4; - u64 rsvd3 : 2; - u64 base : 36; - u64 rsvd4 : 16; -}; - -#define PT_CPUID_LEAVES 2 -#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */ - -enum pt_capabilities { - PT_CAP_max_subleaf = 0, - PT_CAP_cr3_filtering, - PT_CAP_psb_cyc, - PT_CAP_mtc, - PT_CAP_topa_output, - PT_CAP_topa_multiple_entries, - PT_CAP_single_range_output, - PT_CAP_payloads_lip, - PT_CAP_mtc_periods, - PT_CAP_cycle_thresholds, - PT_CAP_psb_periods, -}; - -struct pt_pmu { - struct pmu pmu; - u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; -}; - -/** - * struct pt_buffer - buffer configuration; one buffer per task_struct or - * cpu, depending on perf event configuration - * @cpu: cpu for per-cpu allocation - * @tables: list of ToPA tables in this buffer - * @first: shorthand for first topa table - * @last: shorthand for last topa table - * @cur: current topa table - * @nr_pages: buffer size in pages - * @cur_idx: current output region's index within @cur table - * @output_off: offset within the current output region - * @data_size: running total of the amount of data in this buffer - * @lost: if data was lost/truncated - * @head: logical write offset inside the buffer - * @snapshot: if this is for a snapshot/overwrite counter - * @stop_pos: STOP topa entry in the buffer - * @intr_pos: INT topa entry in the buffer - * @data_pages: array of pages from perf - * @topa_index: table of topa entries indexed by page offset - */ -struct pt_buffer { - int cpu; - struct list_head tables; - struct topa *first, *last, *cur; - unsigned int cur_idx; - size_t output_off; - unsigned long nr_pages; - local_t data_size; - local_t lost; - local64_t 
head; - bool snapshot; - unsigned long stop_pos, intr_pos; - void **data_pages; - struct topa_entry *topa_index[0]; -}; - -/** - * struct pt - per-cpu pt context - * @handle: perf output handle - * @handle_nmi: do handle PT PMI on this cpu, there's an active event - */ -struct pt { - struct perf_output_handle handle; - int handle_nmi; -}; - -#endif /* __INTEL_PT_H__ */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c deleted file mode 100644 index c0bbd10..0000000 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ /dev/null @@ -1,1188 +0,0 @@ -/* - * Intel(R) Processor Trace PMU driver for perf - * Copyright (c) 2013-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * Intel PT is specified in the Intel Architecture Instruction Set Extensions - * Programming Reference: - * http://software.intel.com/en-us/intel-isa-extensions - */ - -#undef DEBUG - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include - -#include -#include -#include -#include - -#include "perf_event.h" -#include "intel_pt.h" - -static DEFINE_PER_CPU(struct pt, pt_ctx); - -static struct pt_pmu pt_pmu; - -enum cpuid_regs { - CR_EAX = 0, - CR_ECX, - CR_EDX, - CR_EBX -}; - -/* - * Capabilities of Intel PT hardware, such as number of address bits or - * supported output schemes, are cached and exported to userspace as "caps" - * attribute group of pt pmu device - * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store - * relevant bits together with intel_pt traces. - * - * These are necessary for both trace decoding (payloads_lip, contains address - * width encoded in IP-related packets), and event configuration (bitmasks with - * permitted values for certain bit fields). 
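A minimal sketch (not part of the patch) of the mask-and-shift idiom that pt_cap_get() uses for these cached capabilities: the value is whatever the mask covers, shifted down by the mask's lowest set bit (__ffs()). The register value below is made up; the psb_periods mask comes from the pt_caps[] table.

#include <stdio.h>
#include <stdint.h>

static uint32_t cap_get(uint32_t reg, uint32_t mask)
{
    unsigned int shift = __builtin_ctz(mask);   /* userspace stand-in for __ffs() */

    return (reg & mask) >> shift;
}

int main(void)
{
    uint32_t ebx = 0x00050000;                  /* made-up CPUID 0x14, subleaf 1, EBX */

    /* psb_periods lives in bits 31:16 of that register (see pt_caps[]) */
    printf("psb_periods bitmap: 0x%x\n", cap_get(ebx, 0xffff0000));
    return 0;
}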
- */ -#define PT_CAP(_n, _l, _r, _m) \ - [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l, \ - .reg = _r, .mask = _m } - -static struct pt_cap_desc { - const char *name; - u32 leaf; - u8 reg; - u32 mask; -} pt_caps[] = { - PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff), - PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)), - PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)), - PT_CAP(mtc, 0, CR_EBX, BIT(3)), - PT_CAP(topa_output, 0, CR_ECX, BIT(0)), - PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), - PT_CAP(single_range_output, 0, CR_ECX, BIT(2)), - PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)), - PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000), - PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff), - PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000), -}; - -static u32 pt_cap_get(enum pt_capabilities cap) -{ - struct pt_cap_desc *cd = &pt_caps[cap]; - u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg]; - unsigned int shift = __ffs(cd->mask); - - return (c & cd->mask) >> shift; -} - -static ssize_t pt_cap_show(struct device *cdev, - struct device_attribute *attr, - char *buf) -{ - struct dev_ext_attribute *ea = - container_of(attr, struct dev_ext_attribute, attr); - enum pt_capabilities cap = (long)ea->var; - - return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap)); -} - -static struct attribute_group pt_cap_group = { - .name = "caps", -}; - -PMU_FORMAT_ATTR(cyc, "config:1" ); -PMU_FORMAT_ATTR(mtc, "config:9" ); -PMU_FORMAT_ATTR(tsc, "config:10" ); -PMU_FORMAT_ATTR(noretcomp, "config:11" ); -PMU_FORMAT_ATTR(mtc_period, "config:14-17" ); -PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" ); -PMU_FORMAT_ATTR(psb_period, "config:24-27" ); - -static struct attribute *pt_formats_attr[] = { - &format_attr_cyc.attr, - &format_attr_mtc.attr, - &format_attr_tsc.attr, - &format_attr_noretcomp.attr, - &format_attr_mtc_period.attr, - &format_attr_cyc_thresh.attr, - &format_attr_psb_period.attr, - NULL, -}; - -static struct attribute_group pt_format_group = { - .name = "format", - .attrs = pt_formats_attr, -}; - -static const struct attribute_group *pt_attr_groups[] = { - &pt_cap_group, - &pt_format_group, - NULL, -}; - -static int __init pt_pmu_hw_init(void) -{ - struct dev_ext_attribute *de_attrs; - struct attribute **attrs; - size_t size; - int ret; - long i; - - attrs = NULL; - - for (i = 0; i < PT_CPUID_LEAVES; i++) { - cpuid_count(20, i, - &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]); - } - - ret = -ENOMEM; - size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1); - attrs = kzalloc(size, GFP_KERNEL); - if (!attrs) - goto fail; - - size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1); - de_attrs = kzalloc(size, GFP_KERNEL); - if (!de_attrs) - goto fail; - - for (i = 0; i < ARRAY_SIZE(pt_caps); i++) { - struct dev_ext_attribute *de_attr = de_attrs + i; - - de_attr->attr.attr.name = pt_caps[i].name; - - sysfs_attr_init(&de_attr->attr.attr); - - de_attr->attr.attr.mode = S_IRUGO; - de_attr->attr.show = pt_cap_show; - de_attr->var = (void *)i; - - attrs[i] = &de_attr->attr.attr; - } - - pt_cap_group.attrs = attrs; - - return 0; - -fail: - kfree(attrs); - - return ret; -} - -#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC | \ - RTIT_CTL_CYC_THRESH | \ - RTIT_CTL_PSB_FREQ) - -#define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \ - RTIT_CTL_MTC_RANGE) - -#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \ - RTIT_CTL_DISRETC | \ - RTIT_CTL_CYC_PSB | \ - RTIT_CTL_MTC) - -static bool pt_event_valid(struct perf_event 
*event) -{ - u64 config = event->attr.config; - u64 allowed, requested; - - if ((config & PT_CONFIG_MASK) != config) - return false; - - if (config & RTIT_CTL_CYC_PSB) { - if (!pt_cap_get(PT_CAP_psb_cyc)) - return false; - - allowed = pt_cap_get(PT_CAP_psb_periods); - requested = (config & RTIT_CTL_PSB_FREQ) >> - RTIT_CTL_PSB_FREQ_OFFSET; - if (requested && (!(allowed & BIT(requested)))) - return false; - - allowed = pt_cap_get(PT_CAP_cycle_thresholds); - requested = (config & RTIT_CTL_CYC_THRESH) >> - RTIT_CTL_CYC_THRESH_OFFSET; - if (requested && (!(allowed & BIT(requested)))) - return false; - } - - if (config & RTIT_CTL_MTC) { - /* - * In the unlikely case that CPUID lists valid mtc periods, - * but not the mtc capability, drop out here. - * - * Spec says that setting mtc period bits while mtc bit in - * CPUID is 0 will #GP, so better safe than sorry. - */ - if (!pt_cap_get(PT_CAP_mtc)) - return false; - - allowed = pt_cap_get(PT_CAP_mtc_periods); - if (!allowed) - return false; - - requested = (config & RTIT_CTL_MTC_RANGE) >> - RTIT_CTL_MTC_RANGE_OFFSET; - - if (!(allowed & BIT(requested))) - return false; - } - - return true; -} - -/* - * PT configuration helpers - * These all are cpu affine and operate on a local PT - */ - -static void pt_config(struct perf_event *event) -{ - u64 reg; - - if (!event->hw.itrace_started) { - event->hw.itrace_started = 1; - wrmsrl(MSR_IA32_RTIT_STATUS, 0); - } - - reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN; - - if (!event->attr.exclude_kernel) - reg |= RTIT_CTL_OS; - if (!event->attr.exclude_user) - reg |= RTIT_CTL_USR; - - reg |= (event->attr.config & PT_CONFIG_MASK); - - wrmsrl(MSR_IA32_RTIT_CTL, reg); -} - -static void pt_config_start(bool start) -{ - u64 ctl; - - rdmsrl(MSR_IA32_RTIT_CTL, ctl); - if (start) - ctl |= RTIT_CTL_TRACEEN; - else - ctl &= ~RTIT_CTL_TRACEEN; - wrmsrl(MSR_IA32_RTIT_CTL, ctl); - - /* - * A wrmsr that disables trace generation serializes other PT - * registers and causes all data packets to be written to memory, - * but a fence is required for the data to become globally visible. - * - * The below WMB, separating data store and aux_head store matches - * the consumer's RMB that separates aux_head load and data load. - */ - if (!start) - wmb(); -} - -static void pt_config_buffer(void *buf, unsigned int topa_idx, - unsigned int output_off) -{ - u64 reg; - - wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf)); - - reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32); - - wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg); -} - -/* - * Keep ToPA table-related metadata on the same page as the actual table, - * taking up a few words from the top - */ - -#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1) - -/** - * struct topa - page-sized ToPA table with metadata at the top - * @table: actual ToPA table entries, as understood by PT hardware - * @list: linkage to struct pt_buffer's list of tables - * @phys: physical address of this page - * @offset: offset of the first entry in this table in the buffer - * @size: total size of all entries in this table - * @last: index of the last initialized entry in this table - */ -struct topa { - struct topa_entry table[TENTS_PER_PAGE]; - struct list_head list; - u64 phys; - u64 offset; - size_t size; - int last; -}; - -/* make -1 stand for the last table entry */ -#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)]) - -/** - * topa_alloc() - allocate page-sized ToPA table - * @cpu: CPU on which to allocate. 
- * @gfp: Allocation flags. - * - * Return: On success, return the pointer to ToPA table page. - */ -static struct topa *topa_alloc(int cpu, gfp_t gfp) -{ - int node = cpu_to_node(cpu); - struct topa *topa; - struct page *p; - - p = alloc_pages_node(node, gfp | __GFP_ZERO, 0); - if (!p) - return NULL; - - topa = page_address(p); - topa->last = 0; - topa->phys = page_to_phys(p); - - /* - * In case of singe-entry ToPA, always put the self-referencing END - * link as the 2nd entry in the table - */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) { - TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT; - TOPA_ENTRY(topa, 1)->end = 1; - } - - return topa; -} - -/** - * topa_free() - free a page-sized ToPA table - * @topa: Table to deallocate. - */ -static void topa_free(struct topa *topa) -{ - free_page((unsigned long)topa); -} - -/** - * topa_insert_table() - insert a ToPA table into a buffer - * @buf: PT buffer that's being extended. - * @topa: New topa table to be inserted. - * - * If it's the first table in this buffer, set up buffer's pointers - * accordingly; otherwise, add a END=1 link entry to @topa to the current - * "last" table and adjust the last table pointer to @topa. - */ -static void topa_insert_table(struct pt_buffer *buf, struct topa *topa) -{ - struct topa *last = buf->last; - - list_add_tail(&topa->list, &buf->tables); - - if (!buf->first) { - buf->first = buf->last = buf->cur = topa; - return; - } - - topa->offset = last->offset + last->size; - buf->last = topa; - - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) - return; - - BUG_ON(last->last != TENTS_PER_PAGE - 1); - - TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT; - TOPA_ENTRY(last, -1)->end = 1; -} - -/** - * topa_table_full() - check if a ToPA table is filled up - * @topa: ToPA table. - */ -static bool topa_table_full(struct topa *topa) -{ - /* single-entry ToPA is a special case */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) - return !!topa->last; - - return topa->last == TENTS_PER_PAGE - 1; -} - -/** - * topa_insert_pages() - create a list of ToPA tables - * @buf: PT buffer being initialized. - * @gfp: Allocation flags. - * - * This initializes a list of ToPA tables with entries from - * the data_pages provided by rb_alloc_aux(). - * - * Return: 0 on success or error code. - */ -static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp) -{ - struct topa *topa = buf->last; - int order = 0; - struct page *p; - - p = virt_to_page(buf->data_pages[buf->nr_pages]); - if (PagePrivate(p)) - order = page_private(p); - - if (topa_table_full(topa)) { - topa = topa_alloc(buf->cpu, gfp); - if (!topa) - return -ENOMEM; - - topa_insert_table(buf, topa); - } - - TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT; - TOPA_ENTRY(topa, -1)->size = order; - if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) { - TOPA_ENTRY(topa, -1)->intr = 1; - TOPA_ENTRY(topa, -1)->stop = 1; - } - - topa->last++; - topa->size += sizes(order); - - buf->nr_pages += 1ul << order; - - return 0; -} - -/** - * pt_topa_dump() - print ToPA tables and their entries - * @buf: PT buffer. 
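A minimal sketch (not part of the patch) of what the 4-bit ToPA size field encodes: sizes(tsz) is 1 << (tsz + TOPA_SHIFT), so 0 means a 4KB output region, 1 means 8KB, and so on, which is why topa_insert_pages() can store a high-order page's allocation order directly into the entry.

#include <stdio.h>

#define TOPA_SHIFT 12

static unsigned long sizes(unsigned int tsz)
{
    return 1UL << (tsz + TOPA_SHIFT);
}

int main(void)
{
    unsigned int tsz;

    for (tsz = 0; tsz <= 4; tsz++)
        printf("size field %u -> %lu KiB output region\n", tsz, sizes(tsz) >> 10);
    return 0;
}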
- */ -static void pt_topa_dump(struct pt_buffer *buf) -{ - struct topa *topa; - - list_for_each_entry(topa, &buf->tables, list) { - int i; - - pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table, - topa->phys, topa->offset, topa->size); - for (i = 0; i < TENTS_PER_PAGE; i++) { - pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n", - &topa->table[i], - (unsigned long)topa->table[i].base << TOPA_SHIFT, - sizes(topa->table[i].size), - topa->table[i].end ? 'E' : ' ', - topa->table[i].intr ? 'I' : ' ', - topa->table[i].stop ? 'S' : ' ', - *(u64 *)&topa->table[i]); - if ((pt_cap_get(PT_CAP_topa_multiple_entries) && - topa->table[i].stop) || - topa->table[i].end) - break; - } - } -} - -/** - * pt_buffer_advance() - advance to the next output region - * @buf: PT buffer. - * - * Advance the current pointers in the buffer to the next ToPA entry. - */ -static void pt_buffer_advance(struct pt_buffer *buf) -{ - buf->output_off = 0; - buf->cur_idx++; - - if (buf->cur_idx == buf->cur->last) { - if (buf->cur == buf->last) - buf->cur = buf->first; - else - buf->cur = list_entry(buf->cur->list.next, struct topa, - list); - buf->cur_idx = 0; - } -} - -/** - * pt_update_head() - calculate current offsets and sizes - * @pt: Per-cpu pt context. - * - * Update buffer's current write pointer position and data size. - */ -static void pt_update_head(struct pt *pt) -{ - struct pt_buffer *buf = perf_get_aux(&pt->handle); - u64 topa_idx, base, old; - - /* offset of the first region in this table from the beginning of buf */ - base = buf->cur->offset + buf->output_off; - - /* offset of the current output region within this table */ - for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++) - base += sizes(buf->cur->table[topa_idx].size); - - if (buf->snapshot) { - local_set(&buf->data_size, base); - } else { - old = (local64_xchg(&buf->head, base) & - ((buf->nr_pages << PAGE_SHIFT) - 1)); - if (base < old) - base += buf->nr_pages << PAGE_SHIFT; - - local_add(base - old, &buf->data_size); - } -} - -/** - * pt_buffer_region() - obtain current output region's address - * @buf: PT buffer. - */ -static void *pt_buffer_region(struct pt_buffer *buf) -{ - return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT); -} - -/** - * pt_buffer_region_size() - obtain current output region's size - * @buf: PT buffer. - */ -static size_t pt_buffer_region_size(struct pt_buffer *buf) -{ - return sizes(buf->cur->table[buf->cur_idx].size); -} - -/** - * pt_handle_status() - take care of possible status conditions - * @pt: Per-cpu pt context. - */ -static void pt_handle_status(struct pt *pt) -{ - struct pt_buffer *buf = perf_get_aux(&pt->handle); - int advance = 0; - u64 status; - - rdmsrl(MSR_IA32_RTIT_STATUS, status); - - if (status & RTIT_STATUS_ERROR) { - pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n"); - pt_topa_dump(buf); - status &= ~RTIT_STATUS_ERROR; - } - - if (status & RTIT_STATUS_STOPPED) { - status &= ~RTIT_STATUS_STOPPED; - - /* - * On systems that only do single-entry ToPA, hitting STOP - * means we are already losing data; need to let the decoder - * know. - */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries) || - buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) { - local_inc(&buf->lost); - advance++; - } - } - - /* - * Also on single-entry ToPA implementations, interrupt will come - * before the output reaches its output region's boundary. 
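A minimal sketch (not part of the patch) of the wrap-around delta computed in pt_update_head() above: the old head is taken modulo the buffer size and, if the new position appears smaller, one full buffer length is added back before subtracting. The sizes and offsets below are invented.

#include <stdio.h>

int main(void)
{
    unsigned long bufsize = 8UL << 12;  /* 8 pages of 4KB */
    unsigned long old_head = 0x7f00;    /* previous head, already reduced mod bufsize */
    unsigned long new_head = 0x0140;    /* hardware wrapped past the end of the buffer */

    unsigned long base = new_head;
    if (base < old_head)
        base += bufsize;                /* undo the wrap before subtracting */

    printf("newly written bytes: 0x%lx\n", base - old_head);
    return 0;
}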
- */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot && - pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) { - void *head = pt_buffer_region(buf); - - /* everything within this margin needs to be zeroed out */ - memset(head + buf->output_off, 0, - pt_buffer_region_size(buf) - - buf->output_off); - advance++; - } - - if (advance) - pt_buffer_advance(buf); - - wrmsrl(MSR_IA32_RTIT_STATUS, status); -} - -/** - * pt_read_offset() - translate registers into buffer pointers - * @buf: PT buffer. - * - * Set buffer's output pointers from MSR values. - */ -static void pt_read_offset(struct pt_buffer *buf) -{ - u64 offset, base_topa; - - rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa); - buf->cur = phys_to_virt(base_topa); - - rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset); - /* offset within current output region */ - buf->output_off = offset >> 32; - /* index of current output region within this table */ - buf->cur_idx = (offset & 0xffffff80) >> 7; -} - -/** - * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry - * @buf: PT buffer. - * @pg: Page offset in the buffer. - * - * When advancing to the next output region (ToPA entry), given a page offset - * into the buffer, we need to find the offset of the first page in the next - * region. - */ -static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg) -{ - struct topa_entry *te = buf->topa_index[pg]; - - /* one region */ - if (buf->first == buf->last && buf->first->last == 1) - return pg; - - do { - pg++; - pg &= buf->nr_pages - 1; - } while (buf->topa_index[pg] == te); - - return pg; -} - -/** - * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer - * @buf: PT buffer. - * @handle: Current output handle. - * - * Place INT and STOP marks to prevent overwriting old data that the consumer - * hasn't yet collected and waking up the consumer after a certain fraction of - * the buffer has filled up. Only needed and sensible for non-snapshot counters. - * - * This obviously relies on buf::head to figure out buffer markers, so it has - * to be called after pt_buffer_reset_offsets() and before the hardware tracing - * is enabled. 
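A minimal sketch (not part of the patch) of the MSR_IA32_RTIT_OUTPUT_MASK layout that pt_config_buffer() writes and pt_read_offset() parses above: bits 6:0 are set to 0x7f, bits 31:7 carry the ToPA entry index and bits 63:32 carry the byte offset within the current output region. The index and offset below are arbitrary.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    unsigned int topa_idx = 5;          /* arbitrary ToPA entry index */
    uint64_t output_off = 0x1a40;       /* arbitrary offset in the output region */

    /* what pt_config_buffer() writes into MSR_IA32_RTIT_OUTPUT_MASK */
    uint64_t reg = 0x7f | ((uint64_t)topa_idx << 7) | (output_off << 32);

    /* what pt_read_offset() recovers from it */
    printf("idx=%llu off=0x%llx\n",
           (unsigned long long)((reg & 0xffffff80) >> 7),
           (unsigned long long)(reg >> 32));
    return 0;
}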
- */ -static int pt_buffer_reset_markers(struct pt_buffer *buf, - struct perf_output_handle *handle) - -{ - unsigned long head = local64_read(&buf->head); - unsigned long idx, npages, wakeup; - - /* can't stop in the middle of an output region */ - if (buf->output_off + handle->size + 1 < - sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) - return -EINVAL; - - - /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */ - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) - return 0; - - /* clear STOP and INT from current entry */ - buf->topa_index[buf->stop_pos]->stop = 0; - buf->topa_index[buf->intr_pos]->intr = 0; - - /* how many pages till the STOP marker */ - npages = handle->size >> PAGE_SHIFT; - - /* if it's on a page boundary, fill up one more page */ - if (!offset_in_page(head + handle->size + 1)) - npages++; - - idx = (head >> PAGE_SHIFT) + npages; - idx &= buf->nr_pages - 1; - buf->stop_pos = idx; - - wakeup = handle->wakeup >> PAGE_SHIFT; - - /* in the worst case, wake up the consumer one page before hard stop */ - idx = (head >> PAGE_SHIFT) + npages - 1; - if (idx > wakeup) - idx = wakeup; - - idx &= buf->nr_pages - 1; - buf->intr_pos = idx; - - buf->topa_index[buf->stop_pos]->stop = 1; - buf->topa_index[buf->intr_pos]->intr = 1; - - return 0; -} - -/** - * pt_buffer_setup_topa_index() - build topa_index[] table of regions - * @buf: PT buffer. - * - * topa_index[] references output regions indexed by offset into the - * buffer for purposes of quick reverse lookup. - */ -static void pt_buffer_setup_topa_index(struct pt_buffer *buf) -{ - struct topa *cur = buf->first, *prev = buf->last; - struct topa_entry *te_cur = TOPA_ENTRY(cur, 0), - *te_prev = TOPA_ENTRY(prev, prev->last - 1); - int pg = 0, idx = 0; - - while (pg < buf->nr_pages) { - int tidx; - - /* pages within one topa entry */ - for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++) - buf->topa_index[pg] = te_prev; - - te_prev = te_cur; - - if (idx == cur->last - 1) { - /* advance to next topa table */ - idx = 0; - cur = list_entry(cur->list.next, struct topa, list); - } else { - idx++; - } - te_cur = TOPA_ENTRY(cur, idx); - } - -} - -/** - * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head - * @buf: PT buffer. - * @head: Write pointer (aux_head) from AUX buffer. - * - * Find the ToPA table and entry corresponding to given @head and set buffer's - * "current" pointers accordingly. This is done after we have obtained the - * current aux_head position from a successful call to perf_aux_output_begin() - * to make sure the hardware is writing to the right place. - * - * This function modifies buf::{cur,cur_idx,output_off} that will be programmed - * into PT msrs when the tracing is enabled and buf::head and buf::data_size, - * which are used to determine INT and STOP markers' locations by a subsequent - * call to pt_buffer_reset_markers(). 
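A minimal sketch (not part of the patch) of how pt_buffer_reset_markers() above places the STOP and INT pages from aux_head, the handle size and the wakeup watermark. All the sizes below are invented; the arithmetic mirrors the function.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
    unsigned long nr_pages = 16;                    /* power of two */
    unsigned long head = 0x3200;                    /* current aux_head */
    unsigned long size = 0x6000;                    /* bytes handed out by perf */
    unsigned long wakeup = 0x5000 >> PAGE_SHIFT;    /* wakeup watermark, in pages */

    unsigned long npages = size >> PAGE_SHIFT;
    if (((head + size + 1) & (PAGE_SIZE - 1)) == 0)
        npages++;                                   /* ends exactly on a page boundary */

    unsigned long stop = ((head >> PAGE_SHIFT) + npages) & (nr_pages - 1);

    unsigned long intr = (head >> PAGE_SHIFT) + npages - 1;
    if (intr > wakeup)
        intr = wakeup;                              /* wake up no later than the watermark */
    intr &= nr_pages - 1;

    printf("STOP on page %lu, INT on page %lu\n", stop, intr);
    return 0;
}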
- */ -static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head) -{ - int pg; - - if (buf->snapshot) - head &= (buf->nr_pages << PAGE_SHIFT) - 1; - - pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1); - pg = pt_topa_next_entry(buf, pg); - - buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK); - buf->cur_idx = ((unsigned long)buf->topa_index[pg] - - (unsigned long)buf->cur) / sizeof(struct topa_entry); - buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1); - - local64_set(&buf->head, head); - local_set(&buf->data_size, 0); -} - -/** - * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer - * @buf: PT buffer. - */ -static void pt_buffer_fini_topa(struct pt_buffer *buf) -{ - struct topa *topa, *iter; - - list_for_each_entry_safe(topa, iter, &buf->tables, list) { - /* - * right now, this is in free_aux() path only, so - * no need to unlink this table from the list - */ - topa_free(topa); - } -} - -/** - * pt_buffer_init_topa() - initialize ToPA table for pt buffer - * @buf: PT buffer. - * @size: Total size of all regions within this ToPA. - * @gfp: Allocation flags. - */ -static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages, - gfp_t gfp) -{ - struct topa *topa; - int err; - - topa = topa_alloc(buf->cpu, gfp); - if (!topa) - return -ENOMEM; - - topa_insert_table(buf, topa); - - while (buf->nr_pages < nr_pages) { - err = topa_insert_pages(buf, gfp); - if (err) { - pt_buffer_fini_topa(buf); - return -ENOMEM; - } - } - - pt_buffer_setup_topa_index(buf); - - /* link last table to the first one, unless we're double buffering */ - if (pt_cap_get(PT_CAP_topa_multiple_entries)) { - TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT; - TOPA_ENTRY(buf->last, -1)->end = 1; - } - - pt_topa_dump(buf); - return 0; -} - -/** - * pt_buffer_setup_aux() - set up topa tables for a PT buffer - * @cpu: Cpu on which to allocate, -1 means current. - * @pages: Array of pointers to buffer pages passed from perf core. - * @nr_pages: Number of pages in the buffer. - * @snapshot: If this is a snapshot/overwrite counter. - * - * This is a pmu::setup_aux callback that sets up ToPA tables and all the - * bookkeeping for an AUX buffer. - * - * Return: Our private PT buffer structure. - */ -static void * -pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot) -{ - struct pt_buffer *buf; - int node, ret; - - if (!nr_pages) - return NULL; - - if (cpu == -1) - cpu = raw_smp_processor_id(); - node = cpu_to_node(cpu); - - buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]), - GFP_KERNEL, node); - if (!buf) - return NULL; - - buf->cpu = cpu; - buf->snapshot = snapshot; - buf->data_pages = pages; - - INIT_LIST_HEAD(&buf->tables); - - ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL); - if (ret) { - kfree(buf); - return NULL; - } - - return buf; -} - -/** - * pt_buffer_free_aux() - perf AUX deallocation path callback - * @data: PT buffer. - */ -static void pt_buffer_free_aux(void *data) -{ - struct pt_buffer *buf = data; - - pt_buffer_fini_topa(buf); - kfree(buf); -} - -/** - * pt_buffer_is_full() - check if the buffer is full - * @buf: PT buffer. - * @pt: Per-cpu pt handle. - * - * If the user hasn't read data from the output region that aux_head - * points to, the buffer is considered full: the user needs to read at - * least this region and update aux_tail to point past it. 
- */ -static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt) -{ - if (buf->snapshot) - return false; - - if (local_read(&buf->data_size) >= pt->handle.size) - return true; - - return false; -} - -/** - * intel_pt_interrupt() - PT PMI handler - */ -void intel_pt_interrupt(void) -{ - struct pt *pt = this_cpu_ptr(&pt_ctx); - struct pt_buffer *buf; - struct perf_event *event = pt->handle.event; - - /* - * There may be a dangling PT bit in the interrupt status register - * after PT has been disabled by pt_event_stop(). Make sure we don't - * do anything (particularly, re-enable) for this event here. - */ - if (!ACCESS_ONCE(pt->handle_nmi)) - return; - - pt_config_start(false); - - if (!event) - return; - - buf = perf_get_aux(&pt->handle); - if (!buf) - return; - - pt_read_offset(buf); - - pt_handle_status(pt); - - pt_update_head(pt); - - perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), - local_xchg(&buf->lost, 0)); - - if (!event->hw.state) { - int ret; - - buf = perf_aux_output_begin(&pt->handle, event); - if (!buf) { - event->hw.state = PERF_HES_STOPPED; - return; - } - - pt_buffer_reset_offsets(buf, pt->handle.head); - /* snapshot counters don't use PMI, so it's safe */ - ret = pt_buffer_reset_markers(buf, &pt->handle); - if (ret) { - perf_aux_output_end(&pt->handle, 0, true); - return; - } - - pt_config_buffer(buf->cur->table, buf->cur_idx, - buf->output_off); - pt_config(event); - } -} - -/* - * PMU callbacks - */ - -static void pt_event_start(struct perf_event *event, int mode) -{ - struct pt *pt = this_cpu_ptr(&pt_ctx); - struct pt_buffer *buf = perf_get_aux(&pt->handle); - - if (!buf || pt_buffer_is_full(buf, pt)) { - event->hw.state = PERF_HES_STOPPED; - return; - } - - ACCESS_ONCE(pt->handle_nmi) = 1; - event->hw.state = 0; - - pt_config_buffer(buf->cur->table, buf->cur_idx, - buf->output_off); - pt_config(event); -} - -static void pt_event_stop(struct perf_event *event, int mode) -{ - struct pt *pt = this_cpu_ptr(&pt_ctx); - - /* - * Protect against the PMI racing with disabling wrmsr, - * see comment in intel_pt_interrupt(). 
- */ - ACCESS_ONCE(pt->handle_nmi) = 0; - pt_config_start(false); - - if (event->hw.state == PERF_HES_STOPPED) - return; - - event->hw.state = PERF_HES_STOPPED; - - if (mode & PERF_EF_UPDATE) { - struct pt_buffer *buf = perf_get_aux(&pt->handle); - - if (!buf) - return; - - if (WARN_ON_ONCE(pt->handle.event != event)) - return; - - pt_read_offset(buf); - - pt_handle_status(pt); - - pt_update_head(pt); - } -} - -static void pt_event_del(struct perf_event *event, int mode) -{ - struct pt *pt = this_cpu_ptr(&pt_ctx); - struct pt_buffer *buf; - - pt_event_stop(event, PERF_EF_UPDATE); - - buf = perf_get_aux(&pt->handle); - - if (buf) { - if (buf->snapshot) - pt->handle.head = - local_xchg(&buf->data_size, - buf->nr_pages << PAGE_SHIFT); - perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0), - local_xchg(&buf->lost, 0)); - } -} - -static int pt_event_add(struct perf_event *event, int mode) -{ - struct pt_buffer *buf; - struct pt *pt = this_cpu_ptr(&pt_ctx); - struct hw_perf_event *hwc = &event->hw; - int ret = -EBUSY; - - if (pt->handle.event) - goto fail; - - buf = perf_aux_output_begin(&pt->handle, event); - ret = -EINVAL; - if (!buf) - goto fail_stop; - - pt_buffer_reset_offsets(buf, pt->handle.head); - if (!buf->snapshot) { - ret = pt_buffer_reset_markers(buf, &pt->handle); - if (ret) - goto fail_end_stop; - } - - if (mode & PERF_EF_START) { - pt_event_start(event, 0); - ret = -EBUSY; - if (hwc->state == PERF_HES_STOPPED) - goto fail_end_stop; - } else { - hwc->state = PERF_HES_STOPPED; - } - - return 0; - -fail_end_stop: - perf_aux_output_end(&pt->handle, 0, true); -fail_stop: - hwc->state = PERF_HES_STOPPED; -fail: - return ret; -} - -static void pt_event_read(struct perf_event *event) -{ -} - -static void pt_event_destroy(struct perf_event *event) -{ - x86_del_exclusive(x86_lbr_exclusive_pt); -} - -static int pt_event_init(struct perf_event *event) -{ - if (event->attr.type != pt_pmu.pmu.type) - return -ENOENT; - - if (!pt_event_valid(event)) - return -EINVAL; - - if (x86_add_exclusive(x86_lbr_exclusive_pt)) - return -EBUSY; - - event->destroy = pt_event_destroy; - - return 0; -} - -void cpu_emergency_stop_pt(void) -{ - struct pt *pt = this_cpu_ptr(&pt_ctx); - - if (pt->handle.event) - pt_event_stop(pt->handle.event, PERF_EF_UPDATE); -} - -static __init int pt_init(void) -{ - int ret, cpu, prior_warn = 0; - - BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE); - - if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) - return -ENODEV; - - get_online_cpus(); - for_each_online_cpu(cpu) { - u64 ctl; - - ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl); - if (!ret && (ctl & RTIT_CTL_TRACEEN)) - prior_warn++; - } - put_online_cpus(); - - if (prior_warn) { - x86_add_exclusive(x86_lbr_exclusive_pt); - pr_warn("PT is enabled at boot time, doing nothing\n"); - - return -EBUSY; - } - - ret = pt_pmu_hw_init(); - if (ret) - return ret; - - if (!pt_cap_get(PT_CAP_topa_output)) { - pr_warn("ToPA output is not supported on this CPU\n"); - return -ENODEV; - } - - if (!pt_cap_get(PT_CAP_topa_multiple_entries)) - pt_pmu.pmu.capabilities = - PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF; - - pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE; - pt_pmu.pmu.attr_groups = pt_attr_groups; - pt_pmu.pmu.task_ctx_nr = perf_sw_context; - pt_pmu.pmu.event_init = pt_event_init; - pt_pmu.pmu.add = pt_event_add; - pt_pmu.pmu.del = pt_event_del; - pt_pmu.pmu.start = pt_event_start; - pt_pmu.pmu.stop = pt_event_stop; - pt_pmu.pmu.read = pt_event_read; - pt_pmu.pmu.setup_aux = 
pt_buffer_setup_aux; - pt_pmu.pmu.free_aux = pt_buffer_free_aux; - ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1); - - return ret; -} -arch_initcall(pt_init); -- cgit v0.10.2 From 609d809f832ddda20f03029c865dd052596ea394 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:14 +0100 Subject: perf/x86: Move perf_event_intel_rapl.c ....... => x86/events/intel/rapl.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-9-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 2a2f55d..f68232c 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/lbr.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/pt.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/pt.o intel/rapl.o diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c new file mode 100644 index 0000000..9541f50 --- /dev/null +++ b/arch/x86/events/intel/rapl.c @@ -0,0 +1,783 @@ +/* + * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters + * Copyright (C) 2013 Google, Inc., Stephane Eranian + * + * Intel RAPL interface is specified in the IA-32 Manual Vol3b + * section 14.7.1 (September 2013) + * + * RAPL provides more controls than just reporting energy consumption + * however here we only expose the 3 energy consumption free running + * counters (pp0, pkg, dram). + * + * Each of those counters increments in a power unit defined by the + * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules + * but it can vary. + * + * Counter to rapl events mappings: + * + * pp0 counter: consumption of all physical cores (power plane 0) + * event: rapl_energy_cores + * perf code: 0x1 + * + * pkg counter: consumption of the whole processor package + * event: rapl_energy_pkg + * perf code: 0x2 + * + * dram counter: consumption of the dram domain (servers only) + * event: rapl_energy_dram + * perf code: 0x3 + * + * dram counter: consumption of the builtin-gpu domain (client only) + * event: rapl_energy_gpu + * perf code: 0x4 + * + * We manage those counters as free running (read-only). They may be + * use simultaneously by other tools, such as turbostat. + * + * The events only support system-wide mode counting. There is no + * sampling support because it does not make sense and is not + * supported by the RAPL hardware. + * + * Because we want to avoid floating-point operations in the kernel, + * the events are all reported in fixed point arithmetic (32.32). + * Tools must adjust the counts to convert them to Watts using + * the duration of the measurement. 
Tools may use a function such as + * ldexp(raw_count, -32); + */ +#include +#include +#include +#include +#include "../../kernel/cpu/perf_event.h" + +/* + * RAPL energy status counters + */ +#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */ +#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */ +#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */ +#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ +#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ +#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ +#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ +#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ + +#define NR_RAPL_DOMAINS 0x4 +static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { + "pp0-core", + "package", + "dram", + "pp1-gpu", +}; + +/* Clients have PP0, PKG */ +#define RAPL_IDX_CLN (1<config + * any other bit is reserved + */ +#define RAPL_EVENT_MASK 0xFFULL + +#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __rapl_##_var##_show, NULL) + +#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ + +#define RAPL_EVENT_ATTR_STR(_name, v, str) \ +static struct perf_pmu_events_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ + .id = 0, \ + .event_str = str, \ +}; + +struct rapl_pmu { + spinlock_t lock; + int n_active; /* number of active events */ + struct list_head active_list; + struct pmu *pmu; /* pointer to rapl_pmu_class */ + ktime_t timer_interval; /* in ktime_t unit */ + struct hrtimer hrtimer; +}; + +static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */ +static struct pmu rapl_pmu_class; +static cpumask_t rapl_cpu_mask; +static int rapl_cntr_mask; + +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); + +static struct x86_pmu_quirk *rapl_quirks; +static inline u64 rapl_read_counter(struct perf_event *event) +{ + u64 raw; + rdmsrl(event->hw.event_base, raw); + return raw; +} + +#define rapl_add_quirk(func_) \ +do { \ + static struct x86_pmu_quirk __quirk __initdata = { \ + .func = func_, \ + }; \ + __quirk.next = rapl_quirks; \ + rapl_quirks = &__quirk; \ +} while (0) + +static inline u64 rapl_scale(u64 v, int cfg) +{ + if (cfg > NR_RAPL_DOMAINS) { + pr_warn("invalid domain %d, failed to scale data\n", cfg); + return v; + } + /* + * scale delta to smallest unit (1/2^32) + * users must then scale back: count * 1/(1e9*2^32) to get Joules + * or use ldexp(count, -32). + * Watts = Joules/Time delta + */ + return v << (32 - rapl_hw_unit[cfg - 1]); +} + +static u64 rapl_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev_raw_count, new_raw_count; + s64 delta, sdelta; + int shift = RAPL_CNTR_WIDTH; + +again: + prev_raw_count = local64_read(&hwc->prev_count); + rdmsrl(event->hw.event_base, new_raw_count); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) { + cpu_relax(); + goto again; + } + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. 
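A minimal userspace sketch (not part of the patch) of the 32.32 fixed-point convention described in the header comment: rapl_scale() shifts a raw delta, counted in 1/2^hw_unit Joules, up to 1/2^32 Joule units, and a tool converts it back with ldexp() (the 2.3283064365386962890625e-10 scale exported via sysfs below is exactly 2^-32). The hardware unit, raw delta and interval are invented.

#include <stdio.h>
#include <stdint.h>
#include <math.h>

int main(void)
{
    int hw_unit = 16;               /* e.g. 1/2^16 J per count, as on SandyBridge */
    uint64_t raw_delta = 655360;    /* made-up counter delta read from the MSR */
    double seconds = 2.0;           /* made-up measurement interval */

    /* kernel side: rescale to 1/2^32 J, like rapl_scale() */
    uint64_t scaled = raw_delta << (32 - hw_unit);

    /* tool side: back to floating point, as the header comment suggests */
    double joules = ldexp((double)scaled, -32);

    printf("%.3f J over %.1f s -> %.3f W\n", joules, seconds, joules / seconds);
    return 0;
}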
+ */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + sdelta = rapl_scale(delta, event->hw.config); + + local64_add(sdelta, &event->count); + + return new_raw_count; +} + +static void rapl_start_hrtimer(struct rapl_pmu *pmu) +{ + hrtimer_start(&pmu->hrtimer, pmu->timer_interval, + HRTIMER_MODE_REL_PINNED); +} + +static void rapl_stop_hrtimer(struct rapl_pmu *pmu) +{ + hrtimer_cancel(&pmu->hrtimer); +} + +static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) +{ + struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + struct perf_event *event; + unsigned long flags; + + if (!pmu->n_active) + return HRTIMER_NORESTART; + + spin_lock_irqsave(&pmu->lock, flags); + + list_for_each_entry(event, &pmu->active_list, active_entry) { + rapl_event_update(event); + } + + spin_unlock_irqrestore(&pmu->lock, flags); + + hrtimer_forward_now(hrtimer, pmu->timer_interval); + + return HRTIMER_RESTART; +} + +static void rapl_hrtimer_init(struct rapl_pmu *pmu) +{ + struct hrtimer *hr = &pmu->hrtimer; + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hr->function = rapl_hrtimer_handle; +} + +static void __rapl_pmu_event_start(struct rapl_pmu *pmu, + struct perf_event *event) +{ + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + + list_add_tail(&event->active_entry, &pmu->active_list); + + local64_set(&event->hw.prev_count, rapl_read_counter(event)); + + pmu->n_active++; + if (pmu->n_active == 1) + rapl_start_hrtimer(pmu); +} + +static void rapl_pmu_event_start(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + __rapl_pmu_event_start(pmu, event); + spin_unlock_irqrestore(&pmu->lock, flags); +} + +static void rapl_pmu_event_stop(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + + /* mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { + WARN_ON_ONCE(pmu->n_active <= 0); + pmu->n_active--; + if (pmu->n_active == 0) + rapl_stop_hrtimer(pmu); + + list_del(&event->active_entry); + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + /* check if update of sw counter is necessary */ + if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + rapl_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } + + spin_unlock_irqrestore(&pmu->lock, flags); +} + +static int rapl_pmu_event_add(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) + __rapl_pmu_event_start(pmu, event); + + spin_unlock_irqrestore(&pmu->lock, flags); + + return 0; +} + +static void rapl_pmu_event_del(struct perf_event *event, int flags) +{ + rapl_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int rapl_pmu_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config & RAPL_EVENT_MASK; + int bit, msr, ret = 0; + + /* only look at RAPL events */ + if (event->attr.type != rapl_pmu_class.type) + return -ENOENT; + + /* check only supported bits are set */ + if (event->attr.config & 
~RAPL_EVENT_MASK) + return -EINVAL; + + /* + * check event is known (determines counter) + */ + switch (cfg) { + case INTEL_RAPL_PP0: + bit = RAPL_IDX_PP0_NRG_STAT; + msr = MSR_PP0_ENERGY_STATUS; + break; + case INTEL_RAPL_PKG: + bit = RAPL_IDX_PKG_NRG_STAT; + msr = MSR_PKG_ENERGY_STATUS; + break; + case INTEL_RAPL_RAM: + bit = RAPL_IDX_RAM_NRG_STAT; + msr = MSR_DRAM_ENERGY_STATUS; + break; + case INTEL_RAPL_PP1: + bit = RAPL_IDX_PP1_NRG_STAT; + msr = MSR_PP1_ENERGY_STATUS; + break; + default: + return -EINVAL; + } + /* check event supported */ + if (!(rapl_cntr_mask & (1 << bit))) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + /* must be done before validate_group */ + event->hw.event_base = msr; + event->hw.config = cfg; + event->hw.idx = bit; + + return ret; +} + +static void rapl_pmu_event_read(struct perf_event *event) +{ + rapl_event_update(event); +} + +static ssize_t rapl_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask); +} + +static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); + +static struct attribute *rapl_pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group rapl_pmu_attr_group = { + .attrs = rapl_pmu_attrs, +}; + +RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); +RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); +RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); +RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); + +RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); +RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); + +/* + * we compute in 0.23 nJ increments regardless of MSR + */ +RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); +RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); + +static struct attribute *rapl_events_srv_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + +static struct attribute *rapl_events_cln_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_gpu), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_gpu_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_gpu_scale), + NULL, +}; + +static struct attribute *rapl_events_hsw_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_gpu), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_gpu_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_gpu_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + +static struct attribute 
*rapl_events_knl_attr[] = { + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + +static struct attribute_group rapl_pmu_events_group = { + .name = "events", + .attrs = NULL, /* patched at runtime */ +}; + +DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); +static struct attribute *rapl_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group rapl_pmu_format_group = { + .name = "format", + .attrs = rapl_formats_attr, +}; + +const struct attribute_group *rapl_attr_groups[] = { + &rapl_pmu_attr_group, + &rapl_pmu_format_group, + &rapl_pmu_events_group, + NULL, +}; + +static struct pmu rapl_pmu_class = { + .attr_groups = rapl_attr_groups, + .task_ctx_nr = perf_invalid_context, /* system-wide only */ + .event_init = rapl_pmu_event_init, + .add = rapl_pmu_event_add, /* must have */ + .del = rapl_pmu_event_del, /* must have */ + .start = rapl_pmu_event_start, + .stop = rapl_pmu_event_stop, + .read = rapl_pmu_event_read, +}; + +static void rapl_cpu_exit(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + int i, phys_id = topology_physical_package_id(cpu); + int target = -1; + + /* find a new cpu on same package */ + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (phys_id == topology_physical_package_id(i)) { + target = i; + break; + } + } + /* + * clear cpu from cpumask + * if was set in cpumask and still some cpu on package, + * then move to new cpu + */ + if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &rapl_cpu_mask); + + WARN_ON(cpumask_empty(&rapl_cpu_mask)); + /* + * migrate events and context to new cpu + */ + if (target >= 0) + perf_pmu_migrate_context(pmu->pmu, cpu, target); + + /* cancel overflow polling timer for CPU */ + rapl_stop_hrtimer(pmu); +} + +static void rapl_cpu_init(int cpu) +{ + int i, phys_id = topology_physical_package_id(cpu); + + /* check if phys_is is already covered */ + for_each_cpu(i, &rapl_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) + return; + } + /* was not found, so add it */ + cpumask_set_cpu(cpu, &rapl_cpu_mask); +} + +static __init void rapl_hsw_server_quirk(void) +{ + /* + * DRAM domain on HSW server has fixed energy unit which can be + * different than the unit from power unit MSR. + * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 + * of 2. Datasheet, September 2014, Reference Number: 330784-001 " + */ + rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; +} + +static int rapl_cpu_prepare(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + int phys_id = topology_physical_package_id(cpu); + u64 ms; + + if (pmu) + return 0; + + if (phys_id < 0) + return -1; + + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); + if (!pmu) + return -1; + spin_lock_init(&pmu->lock); + + INIT_LIST_HEAD(&pmu->active_list); + + pmu->pmu = &rapl_pmu_class; + + /* + * use reference of 200W for scaling the timeout + * to avoid missing counter overflows. 
+ * 200W = 200 Joules/sec + * divide interval by 2 to avoid lockstep (2 * 100) + * if hw unit is 32, then we use 2 ms 1/200/2 + */ + if (rapl_hw_unit[0] < 32) + ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1)); + else + ms = 2; + + pmu->timer_interval = ms_to_ktime(ms); + + rapl_hrtimer_init(pmu); + + /* set RAPL pmu for this cpu for now */ + per_cpu(rapl_pmu, cpu) = pmu; + per_cpu(rapl_pmu_to_free, cpu) = NULL; + + return 0; +} + +static void rapl_cpu_kfree(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu); + + kfree(pmu); + + per_cpu(rapl_pmu_to_free, cpu) = NULL; +} + +static int rapl_cpu_dying(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + + if (!pmu) + return 0; + + per_cpu(rapl_pmu, cpu) = NULL; + + per_cpu(rapl_pmu_to_free, cpu) = pmu; + + return 0; +} + +static int rapl_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + rapl_cpu_prepare(cpu); + break; + case CPU_STARTING: + rapl_cpu_init(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + rapl_cpu_dying(cpu); + break; + case CPU_ONLINE: + case CPU_DEAD: + rapl_cpu_kfree(cpu); + break; + case CPU_DOWN_PREPARE: + rapl_cpu_exit(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static int rapl_check_hw_unit(void) +{ + u64 msr_rapl_power_unit_bits; + int i; + + /* protect rdmsrl() to handle virtualization */ + if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) + return -1; + for (i = 0; i < NR_RAPL_DOMAINS; i++) + rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + + return 0; +} + +static const struct x86_cpu_id rapl_cpu_match[] = { + [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, + [1] = {}, +}; + +static int __init rapl_pmu_init(void) +{ + struct rapl_pmu *pmu; + int cpu, ret; + struct x86_pmu_quirk *quirk; + int i; + + /* + * check for Intel processor family 6 + */ + if (!x86_match_cpu(rapl_cpu_match)) + return 0; + + /* check supported CPU */ + switch (boot_cpu_data.x86_model) { + case 42: /* Sandy Bridge */ + case 58: /* Ivy Bridge */ + rapl_cntr_mask = RAPL_IDX_CLN; + rapl_pmu_events_group.attrs = rapl_events_cln_attr; + break; + case 63: /* Haswell-Server */ + rapl_add_quirk(rapl_hsw_server_quirk); + rapl_cntr_mask = RAPL_IDX_SRV; + rapl_pmu_events_group.attrs = rapl_events_srv_attr; + break; + case 60: /* Haswell */ + case 69: /* Haswell-Celeron */ + case 61: /* Broadwell */ + rapl_cntr_mask = RAPL_IDX_HSW; + rapl_pmu_events_group.attrs = rapl_events_hsw_attr; + break; + case 45: /* Sandy Bridge-EP */ + case 62: /* IvyTown */ + rapl_cntr_mask = RAPL_IDX_SRV; + rapl_pmu_events_group.attrs = rapl_events_srv_attr; + break; + case 87: /* Knights Landing */ + rapl_add_quirk(rapl_hsw_server_quirk); + rapl_cntr_mask = RAPL_IDX_KNL; + rapl_pmu_events_group.attrs = rapl_events_knl_attr; + + default: + /* unsupported */ + return 0; + } + ret = rapl_check_hw_unit(); + if (ret) + return ret; + + /* run cpu model quirks */ + for (quirk = rapl_quirks; quirk; quirk = quirk->next) + quirk->func(); + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) { + ret = rapl_cpu_prepare(cpu); + if (ret) + goto out; + rapl_cpu_init(cpu); + } + + __perf_cpu_notifier(rapl_cpu_notifier); + + ret = perf_pmu_register(&rapl_pmu_class, "power", -1); + if (WARN_ON(ret)) { + pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); + cpu_notifier_register_done(); + return -1; + } + + pmu = 
__this_cpu_read(rapl_pmu); + + pr_info("RAPL PMU detected," + " API unit is 2^-32 Joules," + " %d fixed counters" + " %llu ms ovfl timer\n", + hweight32(rapl_cntr_mask), + ktime_to_ms(pmu->timer_interval)); + for (i = 0; i < NR_RAPL_DOMAINS; i++) { + if (rapl_cntr_mask & (1 << i)) { + pr_info("hw unit of domain %s 2^-%d Joules\n", + rapl_domain_names[i], rapl_hw_unit[i]); + } + } +out: + cpu_notifier_register_done(); + + return 0; +} +device_initcall(rapl_pmu_init); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 595e3ff..018451a 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -32,7 +32,6 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ perf_event_intel_uncore_snb.o \ diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c deleted file mode 100644 index 24a351a..0000000 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ /dev/null @@ -1,783 +0,0 @@ -/* - * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters - * Copyright (C) 2013 Google, Inc., Stephane Eranian - * - * Intel RAPL interface is specified in the IA-32 Manual Vol3b - * section 14.7.1 (September 2013) - * - * RAPL provides more controls than just reporting energy consumption - * however here we only expose the 3 energy consumption free running - * counters (pp0, pkg, dram). - * - * Each of those counters increments in a power unit defined by the - * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules - * but it can vary. - * - * Counter to rapl events mappings: - * - * pp0 counter: consumption of all physical cores (power plane 0) - * event: rapl_energy_cores - * perf code: 0x1 - * - * pkg counter: consumption of the whole processor package - * event: rapl_energy_pkg - * perf code: 0x2 - * - * dram counter: consumption of the dram domain (servers only) - * event: rapl_energy_dram - * perf code: 0x3 - * - * dram counter: consumption of the builtin-gpu domain (client only) - * event: rapl_energy_gpu - * perf code: 0x4 - * - * We manage those counters as free running (read-only). They may be - * use simultaneously by other tools, such as turbostat. - * - * The events only support system-wide mode counting. There is no - * sampling support because it does not make sense and is not - * supported by the RAPL hardware. - * - * Because we want to avoid floating-point operations in the kernel, - * the events are all reported in fixed point arithmetic (32.32). - * Tools must adjust the counts to convert them to Watts using - * the duration of the measurement. 
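Concretely, a user-space consumer that has read a delta of N counts over t seconds applies the 32.32 fixed-point scale like this (an illustrative sketch; the helper names are invented and assume the 1/2^32 J unit, i.e. 2.3283064365386962890625e-10 J, that the driver normalizes all domains to):

#include <math.h>

/* One normalized count is 1/2^32 J. */
static double rapl_counts_to_joules(unsigned long long delta)
{
	return ldexp((double)delta, -32);	/* delta * 2^-32 */
}

static double rapl_counts_to_watts(unsigned long long delta, double seconds)
{
	return rapl_counts_to_joules(delta) / seconds;
}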
Tools may use a function such as - * ldexp(raw_count, -32); - */ -#include -#include -#include -#include -#include "perf_event.h" - -/* - * RAPL energy status counters - */ -#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */ -#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */ -#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */ -#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ -#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ -#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ -#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ -#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ - -#define NR_RAPL_DOMAINS 0x4 -static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { - "pp0-core", - "package", - "dram", - "pp1-gpu", -}; - -/* Clients have PP0, PKG */ -#define RAPL_IDX_CLN (1<config - * any other bit is reserved - */ -#define RAPL_EVENT_MASK 0xFFULL - -#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \ -static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, \ - char *page) \ -{ \ - BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ - return sprintf(page, _format "\n"); \ -} \ -static struct kobj_attribute format_attr_##_var = \ - __ATTR(_name, 0444, __rapl_##_var##_show, NULL) - -#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ - -#define RAPL_EVENT_ATTR_STR(_name, v, str) \ -static struct perf_pmu_events_attr event_attr_##v = { \ - .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ - .id = 0, \ - .event_str = str, \ -}; - -struct rapl_pmu { - spinlock_t lock; - int n_active; /* number of active events */ - struct list_head active_list; - struct pmu *pmu; /* pointer to rapl_pmu_class */ - ktime_t timer_interval; /* in ktime_t unit */ - struct hrtimer hrtimer; -}; - -static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */ -static struct pmu rapl_pmu_class; -static cpumask_t rapl_cpu_mask; -static int rapl_cntr_mask; - -static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); -static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); - -static struct x86_pmu_quirk *rapl_quirks; -static inline u64 rapl_read_counter(struct perf_event *event) -{ - u64 raw; - rdmsrl(event->hw.event_base, raw); - return raw; -} - -#define rapl_add_quirk(func_) \ -do { \ - static struct x86_pmu_quirk __quirk __initdata = { \ - .func = func_, \ - }; \ - __quirk.next = rapl_quirks; \ - rapl_quirks = &__quirk; \ -} while (0) - -static inline u64 rapl_scale(u64 v, int cfg) -{ - if (cfg > NR_RAPL_DOMAINS) { - pr_warn("invalid domain %d, failed to scale data\n", cfg); - return v; - } - /* - * scale delta to smallest unit (1/2^32) - * users must then scale back: count * 1/(1e9*2^32) to get Joules - * or use ldexp(count, -32). - * Watts = Joules/Time delta - */ - return v << (32 - rapl_hw_unit[cfg - 1]); -} - -static u64 rapl_event_update(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 prev_raw_count, new_raw_count; - s64 delta, sdelta; - int shift = RAPL_CNTR_WIDTH; - -again: - prev_raw_count = local64_read(&hwc->prev_count); - rdmsrl(event->hw.event_base, new_raw_count); - - if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) { - cpu_relax(); - goto again; - } - - /* - * Now we have the new raw value and have updated the prev - * timestamp already. We can now calculate the elapsed delta - * (event-)time and add that to the generic event. - * - * Careful, not all hw sign-extends above the physical width - * of the count. 
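The shift dance in the lines that follow is worth spelling out once (an illustrative sketch, not the patch's code): shifting both samples up by 64 minus the counter width, and the difference back down, discards everything above the counter width, so a wrapped 32-bit MSR still yields the correct positive delta. The uncore code later in this series uses the same trick with a shift derived from its wider counters.

/* counter_delta(0xfffffff0, 0x10, 32) == 0x20 despite the wrap */
static long long counter_delta(unsigned long long prev, unsigned long long cur,
			       int width)
{
	int shift = 64 - width;
	long long delta = (cur << shift) - (prev << shift);

	return delta >> shift;
}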
- */ - delta = (new_raw_count << shift) - (prev_raw_count << shift); - delta >>= shift; - - sdelta = rapl_scale(delta, event->hw.config); - - local64_add(sdelta, &event->count); - - return new_raw_count; -} - -static void rapl_start_hrtimer(struct rapl_pmu *pmu) -{ - hrtimer_start(&pmu->hrtimer, pmu->timer_interval, - HRTIMER_MODE_REL_PINNED); -} - -static void rapl_stop_hrtimer(struct rapl_pmu *pmu) -{ - hrtimer_cancel(&pmu->hrtimer); -} - -static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) -{ - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); - struct perf_event *event; - unsigned long flags; - - if (!pmu->n_active) - return HRTIMER_NORESTART; - - spin_lock_irqsave(&pmu->lock, flags); - - list_for_each_entry(event, &pmu->active_list, active_entry) { - rapl_event_update(event); - } - - spin_unlock_irqrestore(&pmu->lock, flags); - - hrtimer_forward_now(hrtimer, pmu->timer_interval); - - return HRTIMER_RESTART; -} - -static void rapl_hrtimer_init(struct rapl_pmu *pmu) -{ - struct hrtimer *hr = &pmu->hrtimer; - - hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hr->function = rapl_hrtimer_handle; -} - -static void __rapl_pmu_event_start(struct rapl_pmu *pmu, - struct perf_event *event) -{ - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) - return; - - event->hw.state = 0; - - list_add_tail(&event->active_entry, &pmu->active_list); - - local64_set(&event->hw.prev_count, rapl_read_counter(event)); - - pmu->n_active++; - if (pmu->n_active == 1) - rapl_start_hrtimer(pmu); -} - -static void rapl_pmu_event_start(struct perf_event *event, int mode) -{ - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); - unsigned long flags; - - spin_lock_irqsave(&pmu->lock, flags); - __rapl_pmu_event_start(pmu, event); - spin_unlock_irqrestore(&pmu->lock, flags); -} - -static void rapl_pmu_event_stop(struct perf_event *event, int mode) -{ - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); - struct hw_perf_event *hwc = &event->hw; - unsigned long flags; - - spin_lock_irqsave(&pmu->lock, flags); - - /* mark event as deactivated and stopped */ - if (!(hwc->state & PERF_HES_STOPPED)) { - WARN_ON_ONCE(pmu->n_active <= 0); - pmu->n_active--; - if (pmu->n_active == 0) - rapl_stop_hrtimer(pmu); - - list_del(&event->active_entry); - - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - } - - /* check if update of sw counter is necessary */ - if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - rapl_event_update(event); - hwc->state |= PERF_HES_UPTODATE; - } - - spin_unlock_irqrestore(&pmu->lock, flags); -} - -static int rapl_pmu_event_add(struct perf_event *event, int mode) -{ - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); - struct hw_perf_event *hwc = &event->hw; - unsigned long flags; - - spin_lock_irqsave(&pmu->lock, flags); - - hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - - if (mode & PERF_EF_START) - __rapl_pmu_event_start(pmu, event); - - spin_unlock_irqrestore(&pmu->lock, flags); - - return 0; -} - -static void rapl_pmu_event_del(struct perf_event *event, int flags) -{ - rapl_pmu_event_stop(event, PERF_EF_UPDATE); -} - -static int rapl_pmu_event_init(struct perf_event *event) -{ - u64 cfg = event->attr.config & RAPL_EVENT_MASK; - int bit, msr, ret = 0; - - /* only look at RAPL events */ - if (event->attr.type != rapl_pmu_class.type) - return -ENOENT; - - /* check only supported bits are set */ - if (event->attr.config & 
~RAPL_EVENT_MASK) - return -EINVAL; - - /* - * check event is known (determines counter) - */ - switch (cfg) { - case INTEL_RAPL_PP0: - bit = RAPL_IDX_PP0_NRG_STAT; - msr = MSR_PP0_ENERGY_STATUS; - break; - case INTEL_RAPL_PKG: - bit = RAPL_IDX_PKG_NRG_STAT; - msr = MSR_PKG_ENERGY_STATUS; - break; - case INTEL_RAPL_RAM: - bit = RAPL_IDX_RAM_NRG_STAT; - msr = MSR_DRAM_ENERGY_STATUS; - break; - case INTEL_RAPL_PP1: - bit = RAPL_IDX_PP1_NRG_STAT; - msr = MSR_PP1_ENERGY_STATUS; - break; - default: - return -EINVAL; - } - /* check event supported */ - if (!(rapl_cntr_mask & (1 << bit))) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - /* must be done before validate_group */ - event->hw.event_base = msr; - event->hw.config = cfg; - event->hw.idx = bit; - - return ret; -} - -static void rapl_pmu_event_read(struct perf_event *event) -{ - rapl_event_update(event); -} - -static ssize_t rapl_get_attr_cpumask(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask); -} - -static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); - -static struct attribute *rapl_pmu_attrs[] = { - &dev_attr_cpumask.attr, - NULL, -}; - -static struct attribute_group rapl_pmu_attr_group = { - .attrs = rapl_pmu_attrs, -}; - -RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); -RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); -RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); -RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); - -RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); -RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); -RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); -RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); - -/* - * we compute in 0.23 nJ increments regardless of MSR - */ -RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); -RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); -RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); -RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); - -static struct attribute *rapl_events_srv_attr[] = { - EVENT_PTR(rapl_cores), - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_ram), - - EVENT_PTR(rapl_cores_unit), - EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_ram_unit), - - EVENT_PTR(rapl_cores_scale), - EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_ram_scale), - NULL, -}; - -static struct attribute *rapl_events_cln_attr[] = { - EVENT_PTR(rapl_cores), - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_gpu), - - EVENT_PTR(rapl_cores_unit), - EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_gpu_unit), - - EVENT_PTR(rapl_cores_scale), - EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_gpu_scale), - NULL, -}; - -static struct attribute *rapl_events_hsw_attr[] = { - EVENT_PTR(rapl_cores), - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_gpu), - EVENT_PTR(rapl_ram), - - EVENT_PTR(rapl_cores_unit), - EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_gpu_unit), - EVENT_PTR(rapl_ram_unit), - - EVENT_PTR(rapl_cores_scale), - EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_gpu_scale), - EVENT_PTR(rapl_ram_scale), - NULL, -}; - -static struct attribute 
*rapl_events_knl_attr[] = { - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_ram), - - EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_ram_unit), - - EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_ram_scale), - NULL, -}; - -static struct attribute_group rapl_pmu_events_group = { - .name = "events", - .attrs = NULL, /* patched at runtime */ -}; - -DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); -static struct attribute *rapl_formats_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group rapl_pmu_format_group = { - .name = "format", - .attrs = rapl_formats_attr, -}; - -const struct attribute_group *rapl_attr_groups[] = { - &rapl_pmu_attr_group, - &rapl_pmu_format_group, - &rapl_pmu_events_group, - NULL, -}; - -static struct pmu rapl_pmu_class = { - .attr_groups = rapl_attr_groups, - .task_ctx_nr = perf_invalid_context, /* system-wide only */ - .event_init = rapl_pmu_event_init, - .add = rapl_pmu_event_add, /* must have */ - .del = rapl_pmu_event_del, /* must have */ - .start = rapl_pmu_event_start, - .stop = rapl_pmu_event_stop, - .read = rapl_pmu_event_read, -}; - -static void rapl_cpu_exit(int cpu) -{ - struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); - int i, phys_id = topology_physical_package_id(cpu); - int target = -1; - - /* find a new cpu on same package */ - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (phys_id == topology_physical_package_id(i)) { - target = i; - break; - } - } - /* - * clear cpu from cpumask - * if was set in cpumask and still some cpu on package, - * then move to new cpu - */ - if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0) - cpumask_set_cpu(target, &rapl_cpu_mask); - - WARN_ON(cpumask_empty(&rapl_cpu_mask)); - /* - * migrate events and context to new cpu - */ - if (target >= 0) - perf_pmu_migrate_context(pmu->pmu, cpu, target); - - /* cancel overflow polling timer for CPU */ - rapl_stop_hrtimer(pmu); -} - -static void rapl_cpu_init(int cpu) -{ - int i, phys_id = topology_physical_package_id(cpu); - - /* check if phys_is is already covered */ - for_each_cpu(i, &rapl_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) - return; - } - /* was not found, so add it */ - cpumask_set_cpu(cpu, &rapl_cpu_mask); -} - -static __init void rapl_hsw_server_quirk(void) -{ - /* - * DRAM domain on HSW server has fixed energy unit which can be - * different than the unit from power unit MSR. - * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 - * of 2. Datasheet, September 2014, Reference Number: 330784-001 " - */ - rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; -} - -static int rapl_cpu_prepare(int cpu) -{ - struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); - int phys_id = topology_physical_package_id(cpu); - u64 ms; - - if (pmu) - return 0; - - if (phys_id < 0) - return -1; - - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); - if (!pmu) - return -1; - spin_lock_init(&pmu->lock); - - INIT_LIST_HEAD(&pmu->active_list); - - pmu->pmu = &rapl_pmu_class; - - /* - * use reference of 200W for scaling the timeout - * to avoid missing counter overflows. 
- * 200W = 200 Joules/sec - * divide interval by 2 to avoid lockstep (2 * 100) - * if hw unit is 32, then we use 2 ms 1/200/2 - */ - if (rapl_hw_unit[0] < 32) - ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1)); - else - ms = 2; - - pmu->timer_interval = ms_to_ktime(ms); - - rapl_hrtimer_init(pmu); - - /* set RAPL pmu for this cpu for now */ - per_cpu(rapl_pmu, cpu) = pmu; - per_cpu(rapl_pmu_to_free, cpu) = NULL; - - return 0; -} - -static void rapl_cpu_kfree(int cpu) -{ - struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu); - - kfree(pmu); - - per_cpu(rapl_pmu_to_free, cpu) = NULL; -} - -static int rapl_cpu_dying(int cpu) -{ - struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); - - if (!pmu) - return 0; - - per_cpu(rapl_pmu, cpu) = NULL; - - per_cpu(rapl_pmu_to_free, cpu) = pmu; - - return 0; -} - -static int rapl_cpu_notifier(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - rapl_cpu_prepare(cpu); - break; - case CPU_STARTING: - rapl_cpu_init(cpu); - break; - case CPU_UP_CANCELED: - case CPU_DYING: - rapl_cpu_dying(cpu); - break; - case CPU_ONLINE: - case CPU_DEAD: - rapl_cpu_kfree(cpu); - break; - case CPU_DOWN_PREPARE: - rapl_cpu_exit(cpu); - break; - default: - break; - } - - return NOTIFY_OK; -} - -static int rapl_check_hw_unit(void) -{ - u64 msr_rapl_power_unit_bits; - int i; - - /* protect rdmsrl() to handle virtualization */ - if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) - return -1; - for (i = 0; i < NR_RAPL_DOMAINS; i++) - rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; - - return 0; -} - -static const struct x86_cpu_id rapl_cpu_match[] = { - [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, - [1] = {}, -}; - -static int __init rapl_pmu_init(void) -{ - struct rapl_pmu *pmu; - int cpu, ret; - struct x86_pmu_quirk *quirk; - int i; - - /* - * check for Intel processor family 6 - */ - if (!x86_match_cpu(rapl_cpu_match)) - return 0; - - /* check supported CPU */ - switch (boot_cpu_data.x86_model) { - case 42: /* Sandy Bridge */ - case 58: /* Ivy Bridge */ - rapl_cntr_mask = RAPL_IDX_CLN; - rapl_pmu_events_group.attrs = rapl_events_cln_attr; - break; - case 63: /* Haswell-Server */ - rapl_add_quirk(rapl_hsw_server_quirk); - rapl_cntr_mask = RAPL_IDX_SRV; - rapl_pmu_events_group.attrs = rapl_events_srv_attr; - break; - case 60: /* Haswell */ - case 69: /* Haswell-Celeron */ - case 61: /* Broadwell */ - rapl_cntr_mask = RAPL_IDX_HSW; - rapl_pmu_events_group.attrs = rapl_events_hsw_attr; - break; - case 45: /* Sandy Bridge-EP */ - case 62: /* IvyTown */ - rapl_cntr_mask = RAPL_IDX_SRV; - rapl_pmu_events_group.attrs = rapl_events_srv_attr; - break; - case 87: /* Knights Landing */ - rapl_add_quirk(rapl_hsw_server_quirk); - rapl_cntr_mask = RAPL_IDX_KNL; - rapl_pmu_events_group.attrs = rapl_events_knl_attr; - - default: - /* unsupported */ - return 0; - } - ret = rapl_check_hw_unit(); - if (ret) - return ret; - - /* run cpu model quirks */ - for (quirk = rapl_quirks; quirk; quirk = quirk->next) - quirk->func(); - cpu_notifier_register_begin(); - - for_each_online_cpu(cpu) { - ret = rapl_cpu_prepare(cpu); - if (ret) - goto out; - rapl_cpu_init(cpu); - } - - __perf_cpu_notifier(rapl_cpu_notifier); - - ret = perf_pmu_register(&rapl_pmu_class, "power", -1); - if (WARN_ON(ret)) { - pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); - cpu_notifier_register_done(); - return -1; - } - - pmu = 
__this_cpu_read(rapl_pmu); - - pr_info("RAPL PMU detected," - " API unit is 2^-32 Joules," - " %d fixed counters" - " %llu ms ovfl timer\n", - hweight32(rapl_cntr_mask), - ktime_to_ms(pmu->timer_interval)); - for (i = 0; i < NR_RAPL_DOMAINS; i++) { - if (rapl_cntr_mask & (1 << i)) { - pr_info("hw unit of domain %s 2^-%d Joules\n", - rapl_domain_names[i], rapl_hw_unit[i]); - } - } -out: - cpu_notifier_register_done(); - - return 0; -} -device_initcall(rapl_pmu_init); -- cgit v0.10.2 From 6bcb2db547be8263a98ae9413127df9385b38763 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:15 +0100 Subject: perf/x86: Move perf_event_intel_uncore.[ch] .. => x86/events/intel/uncore.[ch] Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-10-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index f68232c..aae3e53 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -8,3 +8,4 @@ endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/lbr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/pt.o intel/rapl.o +obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c new file mode 100644 index 0000000..91a18d6 --- /dev/null +++ b/arch/x86/events/intel/uncore.c @@ -0,0 +1,1401 @@ +#include "uncore.h" + +static struct intel_uncore_type *empty_uncore[] = { NULL, }; +struct intel_uncore_type **uncore_msr_uncores = empty_uncore; +struct intel_uncore_type **uncore_pci_uncores = empty_uncore; + +static bool pcidrv_registered; +struct pci_driver *uncore_pci_driver; +/* pci bus to socket mapping */ +DEFINE_RAW_SPINLOCK(pci2phy_map_lock); +struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); +struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; + +static DEFINE_RAW_SPINLOCK(uncore_box_lock); +/* mask of cpus that collect uncore events */ +static cpumask_t uncore_cpu_mask; + +/* constraint for the fixed counter */ +static struct event_constraint uncore_constraint_fixed = + EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL); +struct event_constraint uncore_constraint_empty = + EVENT_CONSTRAINT(0, 0, 0); + +int uncore_pcibus_to_physid(struct pci_bus *bus) +{ + struct pci2phy_map *map; + int phys_id = -1; + + raw_spin_lock(&pci2phy_map_lock); + list_for_each_entry(map, &pci2phy_map_head, list) { + if (map->segment == pci_domain_nr(bus)) { + phys_id = map->pbus_to_physid[bus->number]; + break; + } + } + raw_spin_unlock(&pci2phy_map_lock); + + return phys_id; +} + +struct pci2phy_map *__find_pci2phy_map(int segment) +{ + struct pci2phy_map *map, *alloc = NULL; + int i; + + lockdep_assert_held(&pci2phy_map_lock); + +lookup: + list_for_each_entry(map, &pci2phy_map_head, list) { + if (map->segment == segment) + goto end; + } + + if (!alloc) { + raw_spin_unlock(&pci2phy_map_lock); + alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL); + raw_spin_lock(&pci2phy_map_lock); + + if (!alloc) + return NULL; + + goto lookup; + } + + map = alloc; + alloc = NULL; + map->segment = segment; + for (i = 0; i < 256; i++) + map->pbus_to_physid[i] = -1; + list_add_tail(&map->list, &pci2phy_map_head); + +end: + kfree(alloc); + return map; +} + +ssize_t 
uncore_event_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uncore_event_desc *event = + container_of(attr, struct uncore_event_desc, attr); + return sprintf(buf, "%s", event->config); +} + +struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct intel_uncore_pmu, pmu); +} + +struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) +{ + struct intel_uncore_box *box; + + box = *per_cpu_ptr(pmu->box, cpu); + if (box) + return box; + + raw_spin_lock(&uncore_box_lock); + /* Recheck in lock to handle races. */ + if (*per_cpu_ptr(pmu->box, cpu)) + goto out; + list_for_each_entry(box, &pmu->box_list, list) { + if (box->phys_id == topology_physical_package_id(cpu)) { + atomic_inc(&box->refcnt); + *per_cpu_ptr(pmu->box, cpu) = box; + break; + } + } +out: + raw_spin_unlock(&uncore_box_lock); + + return *per_cpu_ptr(pmu->box, cpu); +} + +struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) +{ + /* + * perf core schedules event on the basis of cpu, uncore events are + * collected by one of the cpus inside a physical package. + */ + return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); +} + +u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + u64 count; + + rdmsrl(event->hw.event_base, count); + + return count; +} + +/* + * generic get constraint function for shared match/mask registers. + */ +struct event_constraint * +uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + unsigned long flags; + bool ok = false; + + /* + * reg->alloc can be set due to existing state, so for fake box we + * need to ignore this, otherwise we might fail to allocate proper + * fake state for this extra reg constraint. + */ + if (reg1->idx == EXTRA_REG_NONE || + (!uncore_box_is_fake(box) && reg1->alloc)) + return NULL; + + er = &box->shared_regs[reg1->idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || + (er->config1 == reg1->config && er->config2 == reg2->config)) { + atomic_inc(&er->ref); + er->config1 = reg1->config; + er->config2 = reg2->config; + ok = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (ok) { + if (!uncore_box_is_fake(box)) + reg1->alloc = 1; + return NULL; + } + + return &uncore_constraint_empty; +} + +void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + + /* + * Only put constraint if extra reg was actually allocated. Also + * takes care of event which do not use an extra shared reg. + * + * Also, if this is a fake box we shouldn't touch any event state + * (reg->alloc) and we don't care about leaving inconsistent box + * state either since it will be thrown out. 
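Stripped of the perf plumbing, the get/put pair around this comment implements a "claim the shared register if it is free or already programmed with the same value" protocol; a minimal sketch with invented names (not the patch's code):

struct shared_reg {
	raw_spinlock_t lock;
	atomic_t ref;
	u64 config;
};

/* Returns true if @er is now programmed with @config on the caller's behalf. */
static bool shared_reg_get(struct shared_reg *er, u64 config)
{
	unsigned long flags;
	bool ok = false;

	raw_spin_lock_irqsave(&er->lock, flags);
	if (!atomic_read(&er->ref) || er->config == config) {
		atomic_inc(&er->ref);
		er->config = config;
		ok = true;
	}
	raw_spin_unlock_irqrestore(&er->lock, flags);
	return ok;
}

static void shared_reg_put(struct shared_reg *er)
{
	atomic_dec(&er->ref);
}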
+ */ + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + er = &box->shared_regs[reg1->idx]; + atomic_dec(&er->ref); + reg1->alloc = 0; +} + +u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + u64 config; + + er = &box->shared_regs[idx]; + + raw_spin_lock_irqsave(&er->lock, flags); + config = er->config; + raw_spin_unlock_irqrestore(&er->lock, flags); + + return config; +} + +static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = idx; + hwc->last_tag = ++box->tags[idx]; + + if (hwc->idx == UNCORE_PMC_IDX_FIXED) { + hwc->event_base = uncore_fixed_ctr(box); + hwc->config_base = uncore_fixed_ctl(box); + return; + } + + hwc->config_base = uncore_event_ctl(box, hwc->idx); + hwc->event_base = uncore_perf_ctr(box, hwc->idx); +} + +void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event) +{ + u64 prev_count, new_count, delta; + int shift; + + if (event->hw.idx >= UNCORE_PMC_IDX_FIXED) + shift = 64 - uncore_fixed_ctr_bits(box); + else + shift = 64 - uncore_perf_ctr_bits(box); + + /* the hrtimer might modify the previous event value */ +again: + prev_count = local64_read(&event->hw.prev_count); + new_count = uncore_read_counter(box, event); + if (local64_xchg(&event->hw.prev_count, new_count) != prev_count) + goto again; + + delta = (new_count << shift) - (prev_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); +} + +/* + * The overflow interrupt is unavailable for SandyBridge-EP, is broken + * for SandyBridge. So we use hrtimer to periodically poll the counter + * to avoid overflow. + */ +static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) +{ + struct intel_uncore_box *box; + struct perf_event *event; + unsigned long flags; + int bit; + + box = container_of(hrtimer, struct intel_uncore_box, hrtimer); + if (!box->n_active || box->cpu != smp_processor_id()) + return HRTIMER_NORESTART; + /* + * disable local interrupt to prevent uncore_pmu_event_start/stop + * to interrupt the update process + */ + local_irq_save(flags); + + /* + * handle boxes with an active event list as opposed to active + * counters + */ + list_for_each_entry(event, &box->active_list, active_entry) { + uncore_perf_event_update(box, event); + } + + for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX) + uncore_perf_event_update(box, box->events[bit]); + + local_irq_restore(flags); + + hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration)); + return HRTIMER_RESTART; +} + +void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration), + HRTIMER_MODE_REL_PINNED); +} + +void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_cancel(&box->hrtimer); +} + +static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + box->hrtimer.function = uncore_pmu_hrtimer; +} + +static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node) +{ + struct intel_uncore_box *box; + int i, size; + + size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg); + + box = kzalloc_node(size, GFP_KERNEL, node); + if (!box) + return NULL; + + for (i = 0; i < type->num_shared_regs; i++) + raw_spin_lock_init(&box->shared_regs[i].lock); + + uncore_pmu_init_hrtimer(box); + 
atomic_set(&box->refcnt, 1); + box->cpu = -1; + box->phys_id = -1; + + /* set default hrtimer timeout */ + box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL; + + INIT_LIST_HEAD(&box->active_list); + + return box; +} + +/* + * Using uncore_pmu_event_init pmu event_init callback + * as a detection point for uncore events. + */ +static int uncore_pmu_event_init(struct perf_event *event); + +static bool is_uncore_event(struct perf_event *event) +{ + return event->pmu->event_init == uncore_pmu_event_init; +} + +static int +uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp) +{ + struct perf_event *event; + int n, max_count; + + max_count = box->pmu->type->num_counters; + if (box->pmu->type->fixed_ctl) + max_count++; + + if (box->n_events >= max_count) + return -EINVAL; + + n = box->n_events; + + if (is_uncore_event(leader)) { + box->event_list[n] = leader; + n++; + } + + if (!dogrp) + return n; + + list_for_each_entry(event, &leader->sibling_list, group_entry) { + if (!is_uncore_event(event) || + event->state <= PERF_EVENT_STATE_OFF) + continue; + + if (n >= max_count) + return -EINVAL; + + box->event_list[n] = event; + n++; + } + return n; +} + +static struct event_constraint * +uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_type *type = box->pmu->type; + struct event_constraint *c; + + if (type->ops->get_constraint) { + c = type->ops->get_constraint(box, event); + if (c) + return c; + } + + if (event->attr.config == UNCORE_FIXED_EVENT) + return &uncore_constraint_fixed; + + if (type->constraints) { + for_each_event_constraint(c, type->constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &type->unconstrainted; +} + +static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + if (box->pmu->type->ops->put_constraint) + box->pmu->type->ops->put_constraint(box, event); +} + +static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n) +{ + unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; + struct event_constraint *c; + int i, wmin, wmax, ret = 0; + struct hw_perf_event *hwc; + + bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); + + for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { + c = uncore_get_event_constraint(box, box->event_list[i]); + box->event_constraint[i] = c; + wmin = min(wmin, c->weight); + wmax = max(wmax, c->weight); + } + + /* fastpath, try to reuse previous register */ + for (i = 0; i < n; i++) { + hwc = &box->event_list[i]->hw; + c = box->event_constraint[i]; + + /* never assigned */ + if (hwc->idx == -1) + break; + + /* constraint still honored */ + if (!test_bit(hwc->idx, c->idxmsk)) + break; + + /* not already used */ + if (test_bit(hwc->idx, used_mask)) + break; + + __set_bit(hwc->idx, used_mask); + if (assign) + assign[i] = hwc->idx; + } + /* slow path */ + if (i != n) + ret = perf_assign_events(box->event_constraint, n, + wmin, wmax, n, assign); + + if (!assign || ret) { + for (i = 0; i < n; i++) + uncore_put_event_constraint(box, box->event_list[i]); + } + return ret ? 
-EINVAL : 0; +} + +static void uncore_pmu_event_start(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int idx = event->hw.idx; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX)) + return; + + event->hw.state = 0; + box->events[idx] = event; + box->n_active++; + __set_bit(idx, box->active_mask); + + local64_set(&event->hw.prev_count, uncore_read_counter(box, event)); + uncore_enable_event(box, event); + + if (box->n_active == 1) { + uncore_enable_box(box); + uncore_pmu_start_hrtimer(box); + } +} + +static void uncore_pmu_event_stop(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (__test_and_clear_bit(hwc->idx, box->active_mask)) { + uncore_disable_event(box, event); + box->n_active--; + box->events[hwc->idx] = NULL; + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + if (box->n_active == 0) { + uncore_disable_box(box); + uncore_pmu_cancel_hrtimer(box); + } + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + uncore_perf_event_update(box, event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int uncore_pmu_event_add(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + int assign[UNCORE_PMC_IDX_MAX]; + int i, n, ret; + + if (!box) + return -ENODEV; + + ret = n = uncore_collect_events(box, event, false); + if (ret < 0) + return ret; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + ret = uncore_assign_events(box, assign, n); + if (ret) + return ret; + + /* save events moving to new counters */ + for (i = 0; i < box->n_events; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx == assign[i] && + hwc->last_tag == box->tags[assign[i]]) + continue; + /* + * Ensure we don't accidentally enable a stopped + * counter simply because we rescheduled. + */ + if (hwc->state & PERF_HES_STOPPED) + hwc->state |= PERF_HES_ARCH; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + } + + /* reprogram moved events into new counters */ + for (i = 0; i < n; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx != assign[i] || + hwc->last_tag != box->tags[assign[i]]) + uncore_assign_hw_event(box, event, assign[i]); + else if (i < box->n_events) + continue; + + if (hwc->state & PERF_HES_ARCH) + continue; + + uncore_pmu_event_start(event, 0); + } + box->n_events = n; + + return 0; +} + +static void uncore_pmu_event_del(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int i; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < box->n_events; i++) { + if (event == box->event_list[i]) { + uncore_put_event_constraint(box, event); + + while (++i < box->n_events) + box->event_list[i - 1] = box->event_list[i]; + + --box->n_events; + break; + } + } + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; +} + +void uncore_pmu_event_read(struct perf_event *event) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + uncore_perf_event_update(box, event); +} + +/* + * validation ensures the group can be loaded onto the + * PMU if it was the only group available. 
+ */ +static int uncore_validate_group(struct intel_uncore_pmu *pmu, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct intel_uncore_box *fake_box; + int ret = -EINVAL, n; + + fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE); + if (!fake_box) + return -ENOMEM; + + fake_box->pmu = pmu; + /* + * the event is not yet connected with its + * siblings therefore we must first collect + * existing siblings, then add the new event + * before we can simulate the scheduling + */ + n = uncore_collect_events(fake_box, leader, true); + if (n < 0) + goto out; + + fake_box->n_events = n; + n = uncore_collect_events(fake_box, event, false); + if (n < 0) + goto out; + + fake_box->n_events = n; + + ret = uncore_assign_events(fake_box, NULL, n); +out: + kfree(fake_box); + return ret; +} + +static int uncore_pmu_event_init(struct perf_event *event) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct hw_perf_event *hwc = &event->hw; + int ret; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + pmu = uncore_event_to_pmu(event); + /* no device found for this pmu */ + if (pmu->func_id < 0) + return -ENOENT; + + /* + * Uncore PMU does measure at all privilege level all the time. + * So it doesn't make sense to specify any exclude bits. + */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_hv || event->attr.exclude_idle) + return -EINVAL; + + /* Sampling not supported yet */ + if (hwc->sample_period) + return -EINVAL; + + /* + * Place all uncore events for a particular physical package + * onto a single cpu + */ + if (event->cpu < 0) + return -EINVAL; + box = uncore_pmu_to_box(pmu, event->cpu); + if (!box || box->cpu < 0) + return -EINVAL; + event->cpu = box->cpu; + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; + + if (event->attr.config == UNCORE_FIXED_EVENT) { + /* no fixed counter */ + if (!pmu->type->fixed_ctl) + return -EINVAL; + /* + * if there is only one fixed counter, only the first pmu + * can access the fixed counter + */ + if (pmu->type->single_fixed && pmu->pmu_idx > 0) + return -EINVAL; + + /* fixed counters have event field hardcoded to zero */ + hwc->config = 0ULL; + } else { + hwc->config = event->attr.config & pmu->type->event_mask; + if (pmu->type->ops->hw_config) { + ret = pmu->type->ops->hw_config(box, event); + if (ret) + return ret; + } + } + + if (event->group_leader != event) + ret = uncore_validate_group(pmu, event); + else + ret = 0; + + return ret; +} + +static ssize_t uncore_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask); +} + +static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL); + +static struct attribute *uncore_pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group uncore_pmu_attr_group = { + .attrs = uncore_pmu_attrs, +}; + +static int uncore_pmu_register(struct intel_uncore_pmu *pmu) +{ + int ret; + + if (!pmu->type->pmu) { + pmu->pmu = (struct pmu) { + .attr_groups = pmu->type->attr_groups, + .task_ctx_nr = perf_invalid_context, + .event_init = uncore_pmu_event_init, + .add = uncore_pmu_event_add, + .del = uncore_pmu_event_del, + .start = uncore_pmu_event_start, + .stop = uncore_pmu_event_stop, + .read = uncore_pmu_event_read, + }; + } else { + pmu->pmu = *pmu->type->pmu; + pmu->pmu.attr_groups = pmu->type->attr_groups; + } 
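From user space, the restrictions enforced in uncore_pmu_event_init() above boil down to: open the event per CPU (pid == -1, cpu >= 0), no sampling, no exclude_* bits. A minimal sketch (error handling omitted; the PMU type number is assumed to have been read from /sys/bus/event_source/devices/uncore_<name>/type and the config from its events/ directory):

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_uncore_event(int pmu_type, unsigned long long config, int cpu)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = pmu_type;		/* dynamic PMU type from sysfs */
	attr.config = config;
	/* sample_period stays 0 and no exclude_* bits, as required above */

	return syscall(__NR_perf_event_open, &attr, -1 /* pid */, cpu,
		       -1 /* group_fd */, 0);
}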
+ + if (pmu->type->num_boxes == 1) { + if (strlen(pmu->type->name) > 0) + sprintf(pmu->name, "uncore_%s", pmu->type->name); + else + sprintf(pmu->name, "uncore"); + } else { + sprintf(pmu->name, "uncore_%s_%d", pmu->type->name, + pmu->pmu_idx); + } + + ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); + return ret; +} + +static void __init uncore_type_exit(struct intel_uncore_type *type) +{ + int i; + + for (i = 0; i < type->num_boxes; i++) + free_percpu(type->pmus[i].box); + kfree(type->pmus); + type->pmus = NULL; + kfree(type->events_group); + type->events_group = NULL; +} + +static void __init uncore_types_exit(struct intel_uncore_type **types) +{ + int i; + for (i = 0; types[i]; i++) + uncore_type_exit(types[i]); +} + +static int __init uncore_type_init(struct intel_uncore_type *type) +{ + struct intel_uncore_pmu *pmus; + struct attribute_group *attr_group; + struct attribute **attrs; + int i, j; + + pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL); + if (!pmus) + return -ENOMEM; + + type->pmus = pmus; + + type->unconstrainted = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, + 0, type->num_counters, 0, 0); + + for (i = 0; i < type->num_boxes; i++) { + pmus[i].func_id = -1; + pmus[i].pmu_idx = i; + pmus[i].type = type; + INIT_LIST_HEAD(&pmus[i].box_list); + pmus[i].box = alloc_percpu(struct intel_uncore_box *); + if (!pmus[i].box) + goto fail; + } + + if (type->event_descs) { + i = 0; + while (type->event_descs[i].attr.attr.name) + i++; + + attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) + + sizeof(*attr_group), GFP_KERNEL); + if (!attr_group) + goto fail; + + attrs = (struct attribute **)(attr_group + 1); + attr_group->name = "events"; + attr_group->attrs = attrs; + + for (j = 0; j < i; j++) + attrs[j] = &type->event_descs[j].attr.attr; + + type->events_group = attr_group; + } + + type->pmu_group = &uncore_pmu_attr_group; + return 0; +fail: + uncore_type_exit(type); + return -ENOMEM; +} + +static int __init uncore_types_init(struct intel_uncore_type **types) +{ + int i, ret; + + for (i = 0; types[i]; i++) { + ret = uncore_type_init(types[i]); + if (ret) + goto fail; + } + return 0; +fail: + while (--i >= 0) + uncore_type_exit(types[i]); + return ret; +} + +/* + * add a pci uncore device + */ +static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct intel_uncore_type *type; + int phys_id; + bool first_box = false; + + phys_id = uncore_pcibus_to_physid(pdev->bus); + if (phys_id < 0) + return -ENODEV; + + if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { + int idx = UNCORE_PCI_DEV_IDX(id->driver_data); + uncore_extra_pci_dev[phys_id][idx] = pdev; + pci_set_drvdata(pdev, NULL); + return 0; + } + + type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; + box = uncore_alloc_box(type, NUMA_NO_NODE); + if (!box) + return -ENOMEM; + + /* + * for performance monitoring unit with multiple boxes, + * each box has a different function id. + */ + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; + /* Knights Landing uses a common PCI device ID for multiple instances of + * an uncore PMU device type. There is only one entry per device type in + * the knl_uncore_pci_ids table inspite of multiple devices present for + * some device types. Hence PCI device idx would be 0 for all devices. + * So increment pmu pointer to point to an unused array element. 
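As an aside, the id->driver_data word decoded a few lines up simply packs the uncore type and the per-type device index into one value, with the type field saturated to 0xff (UNCORE_EXTRA_PCI_DEV) marking the "extra" devices that are stashed in uncore_extra_pci_dev[] instead of getting a box. The encoding, mirrored from the UNCORE_PCI_DEV_* macros in uncore.h further down and shown here only for illustration:

/* data = (type << 8) | idx, so 0x0203 means type 2, index 3 */
#define EXAMPLE_DEV_DATA(type, idx)	(((type) << 8) | (idx))
#define EXAMPLE_DEV_TYPE(data)		(((data) >> 8) & 0xff)
#define EXAMPLE_DEV_IDX(data)		((data) & 0xff)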
+ */ + if (boot_cpu_data.x86_model == 87) + while (pmu->func_id >= 0) + pmu++; + if (pmu->func_id < 0) + pmu->func_id = pdev->devfn; + else + WARN_ON_ONCE(pmu->func_id != pdev->devfn); + + box->phys_id = phys_id; + box->pci_dev = pdev; + box->pmu = pmu; + uncore_box_init(box); + pci_set_drvdata(pdev, box); + + raw_spin_lock(&uncore_box_lock); + if (list_empty(&pmu->box_list)) + first_box = true; + list_add_tail(&box->list, &pmu->box_list); + raw_spin_unlock(&uncore_box_lock); + + if (first_box) + uncore_pmu_register(pmu); + return 0; +} + +static void uncore_pci_remove(struct pci_dev *pdev) +{ + struct intel_uncore_box *box = pci_get_drvdata(pdev); + struct intel_uncore_pmu *pmu; + int i, cpu, phys_id; + bool last_box = false; + + phys_id = uncore_pcibus_to_physid(pdev->bus); + box = pci_get_drvdata(pdev); + if (!box) { + for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { + if (uncore_extra_pci_dev[phys_id][i] == pdev) { + uncore_extra_pci_dev[phys_id][i] = NULL; + break; + } + } + WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX); + return; + } + + pmu = box->pmu; + if (WARN_ON_ONCE(phys_id != box->phys_id)) + return; + + pci_set_drvdata(pdev, NULL); + + raw_spin_lock(&uncore_box_lock); + list_del(&box->list); + if (list_empty(&pmu->box_list)) + last_box = true; + raw_spin_unlock(&uncore_box_lock); + + for_each_possible_cpu(cpu) { + if (*per_cpu_ptr(pmu->box, cpu) == box) { + *per_cpu_ptr(pmu->box, cpu) = NULL; + atomic_dec(&box->refcnt); + } + } + + WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); + kfree(box); + + if (last_box) + perf_pmu_unregister(&pmu->pmu); +} + +static int __init uncore_pci_init(void) +{ + int ret; + + switch (boot_cpu_data.x86_model) { + case 45: /* Sandy Bridge-EP */ + ret = snbep_uncore_pci_init(); + break; + case 62: /* Ivy Bridge-EP */ + ret = ivbep_uncore_pci_init(); + break; + case 63: /* Haswell-EP */ + ret = hswep_uncore_pci_init(); + break; + case 79: /* BDX-EP */ + case 86: /* BDX-DE */ + ret = bdx_uncore_pci_init(); + break; + case 42: /* Sandy Bridge */ + ret = snb_uncore_pci_init(); + break; + case 58: /* Ivy Bridge */ + ret = ivb_uncore_pci_init(); + break; + case 60: /* Haswell */ + case 69: /* Haswell Celeron */ + ret = hsw_uncore_pci_init(); + break; + case 61: /* Broadwell */ + ret = bdw_uncore_pci_init(); + break; + case 87: /* Knights Landing */ + ret = knl_uncore_pci_init(); + break; + case 94: /* SkyLake */ + ret = skl_uncore_pci_init(); + break; + default: + return 0; + } + + if (ret) + return ret; + + ret = uncore_types_init(uncore_pci_uncores); + if (ret) + return ret; + + uncore_pci_driver->probe = uncore_pci_probe; + uncore_pci_driver->remove = uncore_pci_remove; + + ret = pci_register_driver(uncore_pci_driver); + if (ret == 0) + pcidrv_registered = true; + else + uncore_types_exit(uncore_pci_uncores); + + return ret; +} + +static void __init uncore_pci_exit(void) +{ + if (pcidrv_registered) { + pcidrv_registered = false; + pci_unregister_driver(uncore_pci_driver); + uncore_types_exit(uncore_pci_uncores); + } +} + +/* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */ +static LIST_HEAD(boxes_to_free); + +static void uncore_kfree_boxes(void) +{ + struct intel_uncore_box *box; + + while (!list_empty(&boxes_to_free)) { + box = list_entry(boxes_to_free.next, + struct intel_uncore_box, list); + list_del(&box->list); + kfree(box); + } +} + +static void uncore_cpu_dying(int cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; uncore_msr_uncores[i]; i++) { + 
type = uncore_msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + *per_cpu_ptr(pmu->box, cpu) = NULL; + if (box && atomic_dec_and_test(&box->refcnt)) + list_add(&box->list, &boxes_to_free); + } + } +} + +static int uncore_cpu_starting(int cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box, *exist; + int i, j, k, phys_id; + + phys_id = topology_physical_package_id(cpu); + + for (i = 0; uncore_msr_uncores[i]; i++) { + type = uncore_msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + /* called by uncore_cpu_init? */ + if (box && box->phys_id >= 0) { + uncore_box_init(box); + continue; + } + + for_each_online_cpu(k) { + exist = *per_cpu_ptr(pmu->box, k); + if (exist && exist->phys_id == phys_id) { + atomic_inc(&exist->refcnt); + *per_cpu_ptr(pmu->box, cpu) = exist; + if (box) { + list_add(&box->list, + &boxes_to_free); + box = NULL; + } + break; + } + } + + if (box) { + box->phys_id = phys_id; + uncore_box_init(box); + } + } + } + return 0; +} + +static int uncore_cpu_prepare(int cpu, int phys_id) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; uncore_msr_uncores[i]; i++) { + type = uncore_msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (pmu->func_id < 0) + pmu->func_id = j; + + box = uncore_alloc_box(type, cpu_to_node(cpu)); + if (!box) + return -ENOMEM; + + box->pmu = pmu; + box->phys_id = phys_id; + *per_cpu_ptr(pmu->box, cpu) = box; + } + } + return 0; +} + +static void +uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; uncores[i]; i++) { + type = uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (old_cpu < 0) + box = uncore_pmu_to_box(pmu, new_cpu); + else + box = uncore_pmu_to_box(pmu, old_cpu); + if (!box) + continue; + + if (old_cpu < 0) { + WARN_ON_ONCE(box->cpu != -1); + box->cpu = new_cpu; + continue; + } + + WARN_ON_ONCE(box->cpu != old_cpu); + if (new_cpu >= 0) { + uncore_pmu_cancel_hrtimer(box); + perf_pmu_migrate_context(&pmu->pmu, + old_cpu, new_cpu); + box->cpu = new_cpu; + } else { + box->cpu = -1; + } + } + } +} + +static void uncore_event_exit_cpu(int cpu) +{ + int i, phys_id, target; + + /* if exiting cpu is used for collecting uncore events */ + if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) + return; + + /* find a new cpu to collect uncore events */ + phys_id = topology_physical_package_id(cpu); + target = -1; + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (phys_id == topology_physical_package_id(i)) { + target = i; + break; + } + } + + /* migrate uncore events to the new cpu */ + if (target >= 0) + cpumask_set_cpu(target, &uncore_cpu_mask); + + uncore_change_context(uncore_msr_uncores, cpu, target); + uncore_change_context(uncore_pci_uncores, cpu, target); +} + +static void uncore_event_init_cpu(int cpu) +{ + int i, phys_id; + + phys_id = topology_physical_package_id(cpu); + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) + return; + } + + cpumask_set_cpu(cpu, &uncore_cpu_mask); + + uncore_change_context(uncore_msr_uncores, -1, cpu); + uncore_change_context(uncore_pci_uncores, -1, cpu); +} + +static int 
uncore_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + /* allocate/free data structure for uncore box */ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + uncore_cpu_prepare(cpu, -1); + break; + case CPU_STARTING: + uncore_cpu_starting(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + uncore_cpu_dying(cpu); + break; + case CPU_ONLINE: + case CPU_DEAD: + uncore_kfree_boxes(); + break; + default: + break; + } + + /* select the cpu that collects uncore events */ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_FAILED: + case CPU_STARTING: + uncore_event_init_cpu(cpu); + break; + case CPU_DOWN_PREPARE: + uncore_event_exit_cpu(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block uncore_cpu_nb = { + .notifier_call = uncore_cpu_notifier, + /* + * to migrate uncore events, our notifier should be executed + * before perf core's notifier. + */ + .priority = CPU_PRI_PERF + 1, +}; + +static void __init uncore_cpu_setup(void *dummy) +{ + uncore_cpu_starting(smp_processor_id()); +} + +static int __init uncore_cpu_init(void) +{ + int ret; + + switch (boot_cpu_data.x86_model) { + case 26: /* Nehalem */ + case 30: + case 37: /* Westmere */ + case 44: + nhm_uncore_cpu_init(); + break; + case 42: /* Sandy Bridge */ + case 58: /* Ivy Bridge */ + case 60: /* Haswell */ + case 69: /* Haswell */ + case 70: /* Haswell */ + case 61: /* Broadwell */ + case 71: /* Broadwell */ + snb_uncore_cpu_init(); + break; + case 45: /* Sandy Bridge-EP */ + snbep_uncore_cpu_init(); + break; + case 46: /* Nehalem-EX */ + case 47: /* Westmere-EX aka. Xeon E7 */ + nhmex_uncore_cpu_init(); + break; + case 62: /* Ivy Bridge-EP */ + ivbep_uncore_cpu_init(); + break; + case 63: /* Haswell-EP */ + hswep_uncore_cpu_init(); + break; + case 79: /* BDX-EP */ + case 86: /* BDX-DE */ + bdx_uncore_cpu_init(); + break; + case 87: /* Knights Landing */ + knl_uncore_cpu_init(); + break; + default: + return 0; + } + + ret = uncore_types_init(uncore_msr_uncores); + if (ret) + return ret; + + return 0; +} + +static int __init uncore_pmus_register(void) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_type *type; + int i, j; + + for (i = 0; uncore_msr_uncores[i]; i++) { + type = uncore_msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + uncore_pmu_register(pmu); + } + } + + return 0; +} + +static void __init uncore_cpumask_init(void) +{ + int cpu; + + /* + * ony invoke once from msr or pci init code + */ + if (!cpumask_empty(&uncore_cpu_mask)) + return; + + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) { + int i, phys_id = topology_physical_package_id(cpu); + + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) { + phys_id = -1; + break; + } + } + if (phys_id < 0) + continue; + + uncore_cpu_prepare(cpu, phys_id); + uncore_event_init_cpu(cpu); + } + on_each_cpu(uncore_cpu_setup, NULL, 1); + + __register_cpu_notifier(&uncore_cpu_nb); + + cpu_notifier_register_done(); +} + + +static int __init intel_uncore_init(void) +{ + int ret; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return -ENODEV; + + if (cpu_has_hypervisor) + return -ENODEV; + + ret = uncore_pci_init(); + if (ret) + goto fail; + ret = uncore_cpu_init(); + if (ret) { + uncore_pci_exit(); + goto fail; + } + uncore_cpumask_init(); + + uncore_pmus_register(); + return 0; +fail: + return ret; +} +device_initcall(intel_uncore_init); diff --git 
a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h new file mode 100644 index 0000000..1dea204 --- /dev/null +++ b/arch/x86/events/intel/uncore.h @@ -0,0 +1,357 @@ +#include +#include +#include +#include +#include "../../kernel/cpu/perf_event.h" + +#define UNCORE_PMU_NAME_LEN 32 +#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC) +#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC) + +#define UNCORE_FIXED_EVENT 0xff +#define UNCORE_PMC_IDX_MAX_GENERIC 8 +#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC +#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) + +#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx) +#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff) +#define UNCORE_PCI_DEV_IDX(data) (data & 0xff) +#define UNCORE_EXTRA_PCI_DEV 0xff +#define UNCORE_EXTRA_PCI_DEV_MAX 3 + +/* support up to 8 sockets */ +#define UNCORE_SOCKET_MAX 8 + +#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) + +struct intel_uncore_ops; +struct intel_uncore_pmu; +struct intel_uncore_box; +struct uncore_event_desc; + +struct intel_uncore_type { + const char *name; + int num_counters; + int num_boxes; + int perf_ctr_bits; + int fixed_ctr_bits; + unsigned perf_ctr; + unsigned event_ctl; + unsigned event_mask; + unsigned fixed_ctr; + unsigned fixed_ctl; + unsigned box_ctl; + unsigned msr_offset; + unsigned num_shared_regs:8; + unsigned single_fixed:1; + unsigned pair_ctr_ctl:1; + unsigned *msr_offsets; + struct event_constraint unconstrainted; + struct event_constraint *constraints; + struct intel_uncore_pmu *pmus; + struct intel_uncore_ops *ops; + struct uncore_event_desc *event_descs; + const struct attribute_group *attr_groups[4]; + struct pmu *pmu; /* for custom pmu ops */ +}; + +#define pmu_group attr_groups[0] +#define format_group attr_groups[1] +#define events_group attr_groups[2] + +struct intel_uncore_ops { + void (*init_box)(struct intel_uncore_box *); + void (*disable_box)(struct intel_uncore_box *); + void (*enable_box)(struct intel_uncore_box *); + void (*disable_event)(struct intel_uncore_box *, struct perf_event *); + void (*enable_event)(struct intel_uncore_box *, struct perf_event *); + u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *); + int (*hw_config)(struct intel_uncore_box *, struct perf_event *); + struct event_constraint *(*get_constraint)(struct intel_uncore_box *, + struct perf_event *); + void (*put_constraint)(struct intel_uncore_box *, struct perf_event *); +}; + +struct intel_uncore_pmu { + struct pmu pmu; + char name[UNCORE_PMU_NAME_LEN]; + int pmu_idx; + int func_id; + struct intel_uncore_type *type; + struct intel_uncore_box ** __percpu box; + struct list_head box_list; +}; + +struct intel_uncore_extra_reg { + raw_spinlock_t lock; + u64 config, config1, config2; + atomic_t ref; +}; + +struct intel_uncore_box { + int phys_id; + int n_active; /* number of active events */ + int n_events; + int cpu; /* cpu to collect events */ + unsigned long flags; + atomic_t refcnt; + struct perf_event *events[UNCORE_PMC_IDX_MAX]; + struct perf_event *event_list[UNCORE_PMC_IDX_MAX]; + struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX]; + unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; + u64 tags[UNCORE_PMC_IDX_MAX]; + struct pci_dev *pci_dev; + struct intel_uncore_pmu *pmu; + u64 hrtimer_duration; /* hrtimer timeout for this box */ + struct hrtimer hrtimer; + struct list_head list; + struct list_head active_list; + void *io_addr; + struct intel_uncore_extra_reg shared_regs[0]; +}; + 
+#define UNCORE_BOX_FLAG_INITIATED 0 + +struct uncore_event_desc { + struct kobj_attribute attr; + const char *config; +}; + +struct pci2phy_map { + struct list_head list; + int segment; + int pbus_to_physid[256]; +}; + +int uncore_pcibus_to_physid(struct pci_bus *bus); +struct pci2phy_map *__find_pci2phy_map(int segment); + +ssize_t uncore_event_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf); + +#define INTEL_UNCORE_EVENT_DESC(_name, _config) \ +{ \ + .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ + .config = _config, \ +} + +#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __uncore_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __uncore_##_var##_show, NULL) + +static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->box_ctl; +} + +static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctl; +} + +static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr; +} + +static inline +unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx) +{ + return idx * 4 + box->pmu->type->event_ctl; +} + +static inline +unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx) +{ + return idx * 8 + box->pmu->type->perf_ctr; +} + +static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box) +{ + struct intel_uncore_pmu *pmu = box->pmu; + return pmu->type->msr_offsets ? + pmu->type->msr_offsets[pmu->pmu_idx] : + pmu->type->msr_offset * pmu->pmu_idx; +} + +static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->box_ctl) + return 0; + return box->pmu->type->box_ctl + uncore_msr_box_offset(box); +} + +static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->fixed_ctl) + return 0; + return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box); +} + +static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box); +} + +static inline +unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) +{ + return box->pmu->type->event_ctl + + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + + uncore_msr_box_offset(box); +} + +static inline +unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) +{ + return box->pmu->type->perf_ctr + + (box->pmu->type->pair_ctr_ctl ? 
2 * idx : idx) + + uncore_msr_box_offset(box); +} + +static inline +unsigned uncore_fixed_ctl(struct intel_uncore_box *box) +{ + if (box->pci_dev) + return uncore_pci_fixed_ctl(box); + else + return uncore_msr_fixed_ctl(box); +} + +static inline +unsigned uncore_fixed_ctr(struct intel_uncore_box *box) +{ + if (box->pci_dev) + return uncore_pci_fixed_ctr(box); + else + return uncore_msr_fixed_ctr(box); +} + +static inline +unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx) +{ + if (box->pci_dev) + return uncore_pci_event_ctl(box, idx); + else + return uncore_msr_event_ctl(box, idx); +} + +static inline +unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx) +{ + if (box->pci_dev) + return uncore_pci_perf_ctr(box, idx); + else + return uncore_msr_perf_ctr(box, idx); +} + +static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->perf_ctr_bits; +} + +static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr_bits; +} + +static inline int uncore_num_counters(struct intel_uncore_box *box) +{ + return box->pmu->type->num_counters; +} + +static inline void uncore_disable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->disable_box) + box->pmu->type->ops->disable_box(box); +} + +static inline void uncore_enable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->enable_box) + box->pmu->type->ops->enable_box(box); +} + +static inline void uncore_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->disable_event(box, event); +} + +static inline void uncore_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->enable_event(box, event); +} + +static inline u64 uncore_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + return box->pmu->type->ops->read_counter(box, event); +} + +static inline void uncore_box_init(struct intel_uncore_box *box) +{ + if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { + if (box->pmu->type->ops->init_box) + box->pmu->type->ops->init_box(box); + } +} + +static inline bool uncore_box_is_fake(struct intel_uncore_box *box) +{ + return (box->phys_id < 0); +} + +struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event); +struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu); +struct intel_uncore_box *uncore_event_to_box(struct perf_event *event); +u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event); +void uncore_pmu_start_hrtimer(struct intel_uncore_box *box); +void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box); +void uncore_pmu_event_read(struct perf_event *event); +void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event); +struct event_constraint * +uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event); +void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event); +u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx); + +extern struct intel_uncore_type **uncore_msr_uncores; +extern struct intel_uncore_type **uncore_pci_uncores; +extern struct pci_driver *uncore_pci_driver; +extern raw_spinlock_t pci2phy_map_lock; +extern struct list_head pci2phy_map_head; +extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; +extern struct event_constraint uncore_constraint_empty; + +/* perf_event_intel_uncore_snb.c */ +int 
snb_uncore_pci_init(void); +int ivb_uncore_pci_init(void); +int hsw_uncore_pci_init(void); +int bdw_uncore_pci_init(void); +int skl_uncore_pci_init(void); +void snb_uncore_cpu_init(void); +void nhm_uncore_cpu_init(void); +int snb_pci2phy_map_init(int devid); + +/* perf_event_intel_uncore_snbep.c */ +int snbep_uncore_pci_init(void); +void snbep_uncore_cpu_init(void); +int ivbep_uncore_pci_init(void); +void ivbep_uncore_cpu_init(void); +int hswep_uncore_pci_init(void); +void hswep_uncore_cpu_init(void); +int bdx_uncore_pci_init(void); +void bdx_uncore_cpu_init(void); +int knl_uncore_pci_init(void); +void knl_uncore_cpu_init(void); + +/* perf_event_intel_uncore_nhmex.c */ +void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 018451a..875d1b1 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,8 +33,7 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \ - perf_event_intel_uncore_snb.o \ +obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore_snb.o \ perf_event_intel_uncore_snbep.o \ perf_event_intel_uncore_nhmex.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c deleted file mode 100644 index 3bf41d4..0000000 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ /dev/null @@ -1,1401 +0,0 @@ -#include "perf_event_intel_uncore.h" - -static struct intel_uncore_type *empty_uncore[] = { NULL, }; -struct intel_uncore_type **uncore_msr_uncores = empty_uncore; -struct intel_uncore_type **uncore_pci_uncores = empty_uncore; - -static bool pcidrv_registered; -struct pci_driver *uncore_pci_driver; -/* pci bus to socket mapping */ -DEFINE_RAW_SPINLOCK(pci2phy_map_lock); -struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); -struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; - -static DEFINE_RAW_SPINLOCK(uncore_box_lock); -/* mask of cpus that collect uncore events */ -static cpumask_t uncore_cpu_mask; - -/* constraint for the fixed counter */ -static struct event_constraint uncore_constraint_fixed = - EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL); -struct event_constraint uncore_constraint_empty = - EVENT_CONSTRAINT(0, 0, 0); - -int uncore_pcibus_to_physid(struct pci_bus *bus) -{ - struct pci2phy_map *map; - int phys_id = -1; - - raw_spin_lock(&pci2phy_map_lock); - list_for_each_entry(map, &pci2phy_map_head, list) { - if (map->segment == pci_domain_nr(bus)) { - phys_id = map->pbus_to_physid[bus->number]; - break; - } - } - raw_spin_unlock(&pci2phy_map_lock); - - return phys_id; -} - -struct pci2phy_map *__find_pci2phy_map(int segment) -{ - struct pci2phy_map *map, *alloc = NULL; - int i; - - lockdep_assert_held(&pci2phy_map_lock); - -lookup: - list_for_each_entry(map, &pci2phy_map_head, list) { - if (map->segment == segment) - goto end; - } - - if (!alloc) { - raw_spin_unlock(&pci2phy_map_lock); - alloc = kmalloc(sizeof(struct pci2phy_map), GFP_KERNEL); - raw_spin_lock(&pci2phy_map_lock); - - if (!alloc) - return NULL; - - goto lookup; - } - - map = alloc; - alloc = NULL; - map->segment = segment; - for (i = 0; i < 256; i++) - map->pbus_to_physid[i] = -1; - list_add_tail(&map->list, &pci2phy_map_head); - -end: - kfree(alloc); - return map; -} - -ssize_t uncore_event_show(struct 
kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct uncore_event_desc *event = - container_of(attr, struct uncore_event_desc, attr); - return sprintf(buf, "%s", event->config); -} - -struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) -{ - return container_of(event->pmu, struct intel_uncore_pmu, pmu); -} - -struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) -{ - struct intel_uncore_box *box; - - box = *per_cpu_ptr(pmu->box, cpu); - if (box) - return box; - - raw_spin_lock(&uncore_box_lock); - /* Recheck in lock to handle races. */ - if (*per_cpu_ptr(pmu->box, cpu)) - goto out; - list_for_each_entry(box, &pmu->box_list, list) { - if (box->phys_id == topology_physical_package_id(cpu)) { - atomic_inc(&box->refcnt); - *per_cpu_ptr(pmu->box, cpu) = box; - break; - } - } -out: - raw_spin_unlock(&uncore_box_lock); - - return *per_cpu_ptr(pmu->box, cpu); -} - -struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) -{ - /* - * perf core schedules event on the basis of cpu, uncore events are - * collected by one of the cpus inside a physical package. - */ - return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); -} - -u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - u64 count; - - rdmsrl(event->hw.event_base, count); - - return count; -} - -/* - * generic get constraint function for shared match/mask registers. - */ -struct event_constraint * -uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct intel_uncore_extra_reg *er; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - unsigned long flags; - bool ok = false; - - /* - * reg->alloc can be set due to existing state, so for fake box we - * need to ignore this, otherwise we might fail to allocate proper - * fake state for this extra reg constraint. - */ - if (reg1->idx == EXTRA_REG_NONE || - (!uncore_box_is_fake(box) && reg1->alloc)) - return NULL; - - er = &box->shared_regs[reg1->idx]; - raw_spin_lock_irqsave(&er->lock, flags); - if (!atomic_read(&er->ref) || - (er->config1 == reg1->config && er->config2 == reg2->config)) { - atomic_inc(&er->ref); - er->config1 = reg1->config; - er->config2 = reg2->config; - ok = true; - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - if (ok) { - if (!uncore_box_is_fake(box)) - reg1->alloc = 1; - return NULL; - } - - return &uncore_constraint_empty; -} - -void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct intel_uncore_extra_reg *er; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - - /* - * Only put constraint if extra reg was actually allocated. Also - * takes care of event which do not use an extra shared reg. - * - * Also, if this is a fake box we shouldn't touch any event state - * (reg->alloc) and we don't care about leaving inconsistent box - * state either since it will be thrown out. 
- */ - if (uncore_box_is_fake(box) || !reg1->alloc) - return; - - er = &box->shared_regs[reg1->idx]; - atomic_dec(&er->ref); - reg1->alloc = 0; -} - -u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx) -{ - struct intel_uncore_extra_reg *er; - unsigned long flags; - u64 config; - - er = &box->shared_regs[idx]; - - raw_spin_lock_irqsave(&er->lock, flags); - config = er->config; - raw_spin_unlock_irqrestore(&er->lock, flags); - - return config; -} - -static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx) -{ - struct hw_perf_event *hwc = &event->hw; - - hwc->idx = idx; - hwc->last_tag = ++box->tags[idx]; - - if (hwc->idx == UNCORE_PMC_IDX_FIXED) { - hwc->event_base = uncore_fixed_ctr(box); - hwc->config_base = uncore_fixed_ctl(box); - return; - } - - hwc->config_base = uncore_event_ctl(box, hwc->idx); - hwc->event_base = uncore_perf_ctr(box, hwc->idx); -} - -void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event) -{ - u64 prev_count, new_count, delta; - int shift; - - if (event->hw.idx >= UNCORE_PMC_IDX_FIXED) - shift = 64 - uncore_fixed_ctr_bits(box); - else - shift = 64 - uncore_perf_ctr_bits(box); - - /* the hrtimer might modify the previous event value */ -again: - prev_count = local64_read(&event->hw.prev_count); - new_count = uncore_read_counter(box, event); - if (local64_xchg(&event->hw.prev_count, new_count) != prev_count) - goto again; - - delta = (new_count << shift) - (prev_count << shift); - delta >>= shift; - - local64_add(delta, &event->count); -} - -/* - * The overflow interrupt is unavailable for SandyBridge-EP, is broken - * for SandyBridge. So we use hrtimer to periodically poll the counter - * to avoid overflow. - */ -static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) -{ - struct intel_uncore_box *box; - struct perf_event *event; - unsigned long flags; - int bit; - - box = container_of(hrtimer, struct intel_uncore_box, hrtimer); - if (!box->n_active || box->cpu != smp_processor_id()) - return HRTIMER_NORESTART; - /* - * disable local interrupt to prevent uncore_pmu_event_start/stop - * to interrupt the update process - */ - local_irq_save(flags); - - /* - * handle boxes with an active event list as opposed to active - * counters - */ - list_for_each_entry(event, &box->active_list, active_entry) { - uncore_perf_event_update(box, event); - } - - for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX) - uncore_perf_event_update(box, box->events[bit]); - - local_irq_restore(flags); - - hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration)); - return HRTIMER_RESTART; -} - -void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) -{ - hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration), - HRTIMER_MODE_REL_PINNED); -} - -void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) -{ - hrtimer_cancel(&box->hrtimer); -} - -static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) -{ - hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - box->hrtimer.function = uncore_pmu_hrtimer; -} - -static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node) -{ - struct intel_uncore_box *box; - int i, size; - - size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg); - - box = kzalloc_node(size, GFP_KERNEL, node); - if (!box) - return NULL; - - for (i = 0; i < type->num_shared_regs; i++) - raw_spin_lock_init(&box->shared_regs[i].lock); - - uncore_pmu_init_hrtimer(box); - 
atomic_set(&box->refcnt, 1); - box->cpu = -1; - box->phys_id = -1; - - /* set default hrtimer timeout */ - box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL; - - INIT_LIST_HEAD(&box->active_list); - - return box; -} - -/* - * Using uncore_pmu_event_init pmu event_init callback - * as a detection point for uncore events. - */ -static int uncore_pmu_event_init(struct perf_event *event); - -static bool is_uncore_event(struct perf_event *event) -{ - return event->pmu->event_init == uncore_pmu_event_init; -} - -static int -uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp) -{ - struct perf_event *event; - int n, max_count; - - max_count = box->pmu->type->num_counters; - if (box->pmu->type->fixed_ctl) - max_count++; - - if (box->n_events >= max_count) - return -EINVAL; - - n = box->n_events; - - if (is_uncore_event(leader)) { - box->event_list[n] = leader; - n++; - } - - if (!dogrp) - return n; - - list_for_each_entry(event, &leader->sibling_list, group_entry) { - if (!is_uncore_event(event) || - event->state <= PERF_EVENT_STATE_OFF) - continue; - - if (n >= max_count) - return -EINVAL; - - box->event_list[n] = event; - n++; - } - return n; -} - -static struct event_constraint * -uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct intel_uncore_type *type = box->pmu->type; - struct event_constraint *c; - - if (type->ops->get_constraint) { - c = type->ops->get_constraint(box, event); - if (c) - return c; - } - - if (event->attr.config == UNCORE_FIXED_EVENT) - return &uncore_constraint_fixed; - - if (type->constraints) { - for_each_event_constraint(c, type->constraints) { - if ((event->hw.config & c->cmask) == c->code) - return c; - } - } - - return &type->unconstrainted; -} - -static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - if (box->pmu->type->ops->put_constraint) - box->pmu->type->ops->put_constraint(box, event); -} - -static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n) -{ - unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; - struct event_constraint *c; - int i, wmin, wmax, ret = 0; - struct hw_perf_event *hwc; - - bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); - - for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { - c = uncore_get_event_constraint(box, box->event_list[i]); - box->event_constraint[i] = c; - wmin = min(wmin, c->weight); - wmax = max(wmax, c->weight); - } - - /* fastpath, try to reuse previous register */ - for (i = 0; i < n; i++) { - hwc = &box->event_list[i]->hw; - c = box->event_constraint[i]; - - /* never assigned */ - if (hwc->idx == -1) - break; - - /* constraint still honored */ - if (!test_bit(hwc->idx, c->idxmsk)) - break; - - /* not already used */ - if (test_bit(hwc->idx, used_mask)) - break; - - __set_bit(hwc->idx, used_mask); - if (assign) - assign[i] = hwc->idx; - } - /* slow path */ - if (i != n) - ret = perf_assign_events(box->event_constraint, n, - wmin, wmax, n, assign); - - if (!assign || ret) { - for (i = 0; i < n; i++) - uncore_put_event_constraint(box, box->event_list[i]); - } - return ret ? 
-EINVAL : 0; -} - -static void uncore_pmu_event_start(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - int idx = event->hw.idx; - - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) - return; - - if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX)) - return; - - event->hw.state = 0; - box->events[idx] = event; - box->n_active++; - __set_bit(idx, box->active_mask); - - local64_set(&event->hw.prev_count, uncore_read_counter(box, event)); - uncore_enable_event(box, event); - - if (box->n_active == 1) { - uncore_enable_box(box); - uncore_pmu_start_hrtimer(box); - } -} - -static void uncore_pmu_event_stop(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - struct hw_perf_event *hwc = &event->hw; - - if (__test_and_clear_bit(hwc->idx, box->active_mask)) { - uncore_disable_event(box, event); - box->n_active--; - box->events[hwc->idx] = NULL; - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - - if (box->n_active == 0) { - uncore_disable_box(box); - uncore_pmu_cancel_hrtimer(box); - } - } - - if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - uncore_perf_event_update(box, event); - hwc->state |= PERF_HES_UPTODATE; - } -} - -static int uncore_pmu_event_add(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - struct hw_perf_event *hwc = &event->hw; - int assign[UNCORE_PMC_IDX_MAX]; - int i, n, ret; - - if (!box) - return -ENODEV; - - ret = n = uncore_collect_events(box, event, false); - if (ret < 0) - return ret; - - hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - if (!(flags & PERF_EF_START)) - hwc->state |= PERF_HES_ARCH; - - ret = uncore_assign_events(box, assign, n); - if (ret) - return ret; - - /* save events moving to new counters */ - for (i = 0; i < box->n_events; i++) { - event = box->event_list[i]; - hwc = &event->hw; - - if (hwc->idx == assign[i] && - hwc->last_tag == box->tags[assign[i]]) - continue; - /* - * Ensure we don't accidentally enable a stopped - * counter simply because we rescheduled. - */ - if (hwc->state & PERF_HES_STOPPED) - hwc->state |= PERF_HES_ARCH; - - uncore_pmu_event_stop(event, PERF_EF_UPDATE); - } - - /* reprogram moved events into new counters */ - for (i = 0; i < n; i++) { - event = box->event_list[i]; - hwc = &event->hw; - - if (hwc->idx != assign[i] || - hwc->last_tag != box->tags[assign[i]]) - uncore_assign_hw_event(box, event, assign[i]); - else if (i < box->n_events) - continue; - - if (hwc->state & PERF_HES_ARCH) - continue; - - uncore_pmu_event_start(event, 0); - } - box->n_events = n; - - return 0; -} - -static void uncore_pmu_event_del(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - int i; - - uncore_pmu_event_stop(event, PERF_EF_UPDATE); - - for (i = 0; i < box->n_events; i++) { - if (event == box->event_list[i]) { - uncore_put_event_constraint(box, event); - - while (++i < box->n_events) - box->event_list[i - 1] = box->event_list[i]; - - --box->n_events; - break; - } - } - - event->hw.idx = -1; - event->hw.last_tag = ~0ULL; -} - -void uncore_pmu_event_read(struct perf_event *event) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - uncore_perf_event_update(box, event); -} - -/* - * validation ensures the group can be loaded onto the - * PMU if it was the only group available. 
- */ -static int uncore_validate_group(struct intel_uncore_pmu *pmu, - struct perf_event *event) -{ - struct perf_event *leader = event->group_leader; - struct intel_uncore_box *fake_box; - int ret = -EINVAL, n; - - fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE); - if (!fake_box) - return -ENOMEM; - - fake_box->pmu = pmu; - /* - * the event is not yet connected with its - * siblings therefore we must first collect - * existing siblings, then add the new event - * before we can simulate the scheduling - */ - n = uncore_collect_events(fake_box, leader, true); - if (n < 0) - goto out; - - fake_box->n_events = n; - n = uncore_collect_events(fake_box, event, false); - if (n < 0) - goto out; - - fake_box->n_events = n; - - ret = uncore_assign_events(fake_box, NULL, n); -out: - kfree(fake_box); - return ret; -} - -static int uncore_pmu_event_init(struct perf_event *event) -{ - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - struct hw_perf_event *hwc = &event->hw; - int ret; - - if (event->attr.type != event->pmu->type) - return -ENOENT; - - pmu = uncore_event_to_pmu(event); - /* no device found for this pmu */ - if (pmu->func_id < 0) - return -ENOENT; - - /* - * Uncore PMU does measure at all privilege level all the time. - * So it doesn't make sense to specify any exclude bits. - */ - if (event->attr.exclude_user || event->attr.exclude_kernel || - event->attr.exclude_hv || event->attr.exclude_idle) - return -EINVAL; - - /* Sampling not supported yet */ - if (hwc->sample_period) - return -EINVAL; - - /* - * Place all uncore events for a particular physical package - * onto a single cpu - */ - if (event->cpu < 0) - return -EINVAL; - box = uncore_pmu_to_box(pmu, event->cpu); - if (!box || box->cpu < 0) - return -EINVAL; - event->cpu = box->cpu; - - event->hw.idx = -1; - event->hw.last_tag = ~0ULL; - event->hw.extra_reg.idx = EXTRA_REG_NONE; - event->hw.branch_reg.idx = EXTRA_REG_NONE; - - if (event->attr.config == UNCORE_FIXED_EVENT) { - /* no fixed counter */ - if (!pmu->type->fixed_ctl) - return -EINVAL; - /* - * if there is only one fixed counter, only the first pmu - * can access the fixed counter - */ - if (pmu->type->single_fixed && pmu->pmu_idx > 0) - return -EINVAL; - - /* fixed counters have event field hardcoded to zero */ - hwc->config = 0ULL; - } else { - hwc->config = event->attr.config & pmu->type->event_mask; - if (pmu->type->ops->hw_config) { - ret = pmu->type->ops->hw_config(box, event); - if (ret) - return ret; - } - } - - if (event->group_leader != event) - ret = uncore_validate_group(pmu, event); - else - ret = 0; - - return ret; -} - -static ssize_t uncore_get_attr_cpumask(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask); -} - -static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL); - -static struct attribute *uncore_pmu_attrs[] = { - &dev_attr_cpumask.attr, - NULL, -}; - -static struct attribute_group uncore_pmu_attr_group = { - .attrs = uncore_pmu_attrs, -}; - -static int uncore_pmu_register(struct intel_uncore_pmu *pmu) -{ - int ret; - - if (!pmu->type->pmu) { - pmu->pmu = (struct pmu) { - .attr_groups = pmu->type->attr_groups, - .task_ctx_nr = perf_invalid_context, - .event_init = uncore_pmu_event_init, - .add = uncore_pmu_event_add, - .del = uncore_pmu_event_del, - .start = uncore_pmu_event_start, - .stop = uncore_pmu_event_stop, - .read = uncore_pmu_event_read, - }; - } else { - pmu->pmu = *pmu->type->pmu; - pmu->pmu.attr_groups = pmu->type->attr_groups; - } 
- - if (pmu->type->num_boxes == 1) { - if (strlen(pmu->type->name) > 0) - sprintf(pmu->name, "uncore_%s", pmu->type->name); - else - sprintf(pmu->name, "uncore"); - } else { - sprintf(pmu->name, "uncore_%s_%d", pmu->type->name, - pmu->pmu_idx); - } - - ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); - return ret; -} - -static void __init uncore_type_exit(struct intel_uncore_type *type) -{ - int i; - - for (i = 0; i < type->num_boxes; i++) - free_percpu(type->pmus[i].box); - kfree(type->pmus); - type->pmus = NULL; - kfree(type->events_group); - type->events_group = NULL; -} - -static void __init uncore_types_exit(struct intel_uncore_type **types) -{ - int i; - for (i = 0; types[i]; i++) - uncore_type_exit(types[i]); -} - -static int __init uncore_type_init(struct intel_uncore_type *type) -{ - struct intel_uncore_pmu *pmus; - struct attribute_group *attr_group; - struct attribute **attrs; - int i, j; - - pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL); - if (!pmus) - return -ENOMEM; - - type->pmus = pmus; - - type->unconstrainted = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, - 0, type->num_counters, 0, 0); - - for (i = 0; i < type->num_boxes; i++) { - pmus[i].func_id = -1; - pmus[i].pmu_idx = i; - pmus[i].type = type; - INIT_LIST_HEAD(&pmus[i].box_list); - pmus[i].box = alloc_percpu(struct intel_uncore_box *); - if (!pmus[i].box) - goto fail; - } - - if (type->event_descs) { - i = 0; - while (type->event_descs[i].attr.attr.name) - i++; - - attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) + - sizeof(*attr_group), GFP_KERNEL); - if (!attr_group) - goto fail; - - attrs = (struct attribute **)(attr_group + 1); - attr_group->name = "events"; - attr_group->attrs = attrs; - - for (j = 0; j < i; j++) - attrs[j] = &type->event_descs[j].attr.attr; - - type->events_group = attr_group; - } - - type->pmu_group = &uncore_pmu_attr_group; - return 0; -fail: - uncore_type_exit(type); - return -ENOMEM; -} - -static int __init uncore_types_init(struct intel_uncore_type **types) -{ - int i, ret; - - for (i = 0; types[i]; i++) { - ret = uncore_type_init(types[i]); - if (ret) - goto fail; - } - return 0; -fail: - while (--i >= 0) - uncore_type_exit(types[i]); - return ret; -} - -/* - * add a pci uncore device - */ -static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) -{ - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - struct intel_uncore_type *type; - int phys_id; - bool first_box = false; - - phys_id = uncore_pcibus_to_physid(pdev->bus); - if (phys_id < 0) - return -ENODEV; - - if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { - int idx = UNCORE_PCI_DEV_IDX(id->driver_data); - uncore_extra_pci_dev[phys_id][idx] = pdev; - pci_set_drvdata(pdev, NULL); - return 0; - } - - type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; - box = uncore_alloc_box(type, NUMA_NO_NODE); - if (!box) - return -ENOMEM; - - /* - * for performance monitoring unit with multiple boxes, - * each box has a different function id. - */ - pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; - /* Knights Landing uses a common PCI device ID for multiple instances of - * an uncore PMU device type. There is only one entry per device type in - * the knl_uncore_pci_ids table inspite of multiple devices present for - * some device types. Hence PCI device idx would be 0 for all devices. - * So increment pmu pointer to point to an unused array element. 
- */ - if (boot_cpu_data.x86_model == 87) - while (pmu->func_id >= 0) - pmu++; - if (pmu->func_id < 0) - pmu->func_id = pdev->devfn; - else - WARN_ON_ONCE(pmu->func_id != pdev->devfn); - - box->phys_id = phys_id; - box->pci_dev = pdev; - box->pmu = pmu; - uncore_box_init(box); - pci_set_drvdata(pdev, box); - - raw_spin_lock(&uncore_box_lock); - if (list_empty(&pmu->box_list)) - first_box = true; - list_add_tail(&box->list, &pmu->box_list); - raw_spin_unlock(&uncore_box_lock); - - if (first_box) - uncore_pmu_register(pmu); - return 0; -} - -static void uncore_pci_remove(struct pci_dev *pdev) -{ - struct intel_uncore_box *box = pci_get_drvdata(pdev); - struct intel_uncore_pmu *pmu; - int i, cpu, phys_id; - bool last_box = false; - - phys_id = uncore_pcibus_to_physid(pdev->bus); - box = pci_get_drvdata(pdev); - if (!box) { - for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { - if (uncore_extra_pci_dev[phys_id][i] == pdev) { - uncore_extra_pci_dev[phys_id][i] = NULL; - break; - } - } - WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX); - return; - } - - pmu = box->pmu; - if (WARN_ON_ONCE(phys_id != box->phys_id)) - return; - - pci_set_drvdata(pdev, NULL); - - raw_spin_lock(&uncore_box_lock); - list_del(&box->list); - if (list_empty(&pmu->box_list)) - last_box = true; - raw_spin_unlock(&uncore_box_lock); - - for_each_possible_cpu(cpu) { - if (*per_cpu_ptr(pmu->box, cpu) == box) { - *per_cpu_ptr(pmu->box, cpu) = NULL; - atomic_dec(&box->refcnt); - } - } - - WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); - kfree(box); - - if (last_box) - perf_pmu_unregister(&pmu->pmu); -} - -static int __init uncore_pci_init(void) -{ - int ret; - - switch (boot_cpu_data.x86_model) { - case 45: /* Sandy Bridge-EP */ - ret = snbep_uncore_pci_init(); - break; - case 62: /* Ivy Bridge-EP */ - ret = ivbep_uncore_pci_init(); - break; - case 63: /* Haswell-EP */ - ret = hswep_uncore_pci_init(); - break; - case 79: /* BDX-EP */ - case 86: /* BDX-DE */ - ret = bdx_uncore_pci_init(); - break; - case 42: /* Sandy Bridge */ - ret = snb_uncore_pci_init(); - break; - case 58: /* Ivy Bridge */ - ret = ivb_uncore_pci_init(); - break; - case 60: /* Haswell */ - case 69: /* Haswell Celeron */ - ret = hsw_uncore_pci_init(); - break; - case 61: /* Broadwell */ - ret = bdw_uncore_pci_init(); - break; - case 87: /* Knights Landing */ - ret = knl_uncore_pci_init(); - break; - case 94: /* SkyLake */ - ret = skl_uncore_pci_init(); - break; - default: - return 0; - } - - if (ret) - return ret; - - ret = uncore_types_init(uncore_pci_uncores); - if (ret) - return ret; - - uncore_pci_driver->probe = uncore_pci_probe; - uncore_pci_driver->remove = uncore_pci_remove; - - ret = pci_register_driver(uncore_pci_driver); - if (ret == 0) - pcidrv_registered = true; - else - uncore_types_exit(uncore_pci_uncores); - - return ret; -} - -static void __init uncore_pci_exit(void) -{ - if (pcidrv_registered) { - pcidrv_registered = false; - pci_unregister_driver(uncore_pci_driver); - uncore_types_exit(uncore_pci_uncores); - } -} - -/* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */ -static LIST_HEAD(boxes_to_free); - -static void uncore_kfree_boxes(void) -{ - struct intel_uncore_box *box; - - while (!list_empty(&boxes_to_free)) { - box = list_entry(boxes_to_free.next, - struct intel_uncore_box, list); - list_del(&box->list); - kfree(box); - } -} - -static void uncore_cpu_dying(int cpu) -{ - struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, j; - - for (i = 0; uncore_msr_uncores[i]; i++) { - 
type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - box = *per_cpu_ptr(pmu->box, cpu); - *per_cpu_ptr(pmu->box, cpu) = NULL; - if (box && atomic_dec_and_test(&box->refcnt)) - list_add(&box->list, &boxes_to_free); - } - } -} - -static int uncore_cpu_starting(int cpu) -{ - struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box, *exist; - int i, j, k, phys_id; - - phys_id = topology_physical_package_id(cpu); - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - box = *per_cpu_ptr(pmu->box, cpu); - /* called by uncore_cpu_init? */ - if (box && box->phys_id >= 0) { - uncore_box_init(box); - continue; - } - - for_each_online_cpu(k) { - exist = *per_cpu_ptr(pmu->box, k); - if (exist && exist->phys_id == phys_id) { - atomic_inc(&exist->refcnt); - *per_cpu_ptr(pmu->box, cpu) = exist; - if (box) { - list_add(&box->list, - &boxes_to_free); - box = NULL; - } - break; - } - } - - if (box) { - box->phys_id = phys_id; - uncore_box_init(box); - } - } - } - return 0; -} - -static int uncore_cpu_prepare(int cpu, int phys_id) -{ - struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, j; - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - if (pmu->func_id < 0) - pmu->func_id = j; - - box = uncore_alloc_box(type, cpu_to_node(cpu)); - if (!box) - return -ENOMEM; - - box->pmu = pmu; - box->phys_id = phys_id; - *per_cpu_ptr(pmu->box, cpu) = box; - } - } - return 0; -} - -static void -uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu) -{ - struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - int i, j; - - for (i = 0; uncores[i]; i++) { - type = uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - if (old_cpu < 0) - box = uncore_pmu_to_box(pmu, new_cpu); - else - box = uncore_pmu_to_box(pmu, old_cpu); - if (!box) - continue; - - if (old_cpu < 0) { - WARN_ON_ONCE(box->cpu != -1); - box->cpu = new_cpu; - continue; - } - - WARN_ON_ONCE(box->cpu != old_cpu); - if (new_cpu >= 0) { - uncore_pmu_cancel_hrtimer(box); - perf_pmu_migrate_context(&pmu->pmu, - old_cpu, new_cpu); - box->cpu = new_cpu; - } else { - box->cpu = -1; - } - } - } -} - -static void uncore_event_exit_cpu(int cpu) -{ - int i, phys_id, target; - - /* if exiting cpu is used for collecting uncore events */ - if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) - return; - - /* find a new cpu to collect uncore events */ - phys_id = topology_physical_package_id(cpu); - target = -1; - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (phys_id == topology_physical_package_id(i)) { - target = i; - break; - } - } - - /* migrate uncore events to the new cpu */ - if (target >= 0) - cpumask_set_cpu(target, &uncore_cpu_mask); - - uncore_change_context(uncore_msr_uncores, cpu, target); - uncore_change_context(uncore_pci_uncores, cpu, target); -} - -static void uncore_event_init_cpu(int cpu) -{ - int i, phys_id; - - phys_id = topology_physical_package_id(cpu); - for_each_cpu(i, &uncore_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) - return; - } - - cpumask_set_cpu(cpu, &uncore_cpu_mask); - - uncore_change_context(uncore_msr_uncores, -1, cpu); - uncore_change_context(uncore_pci_uncores, -1, cpu); -} - -static int 
uncore_cpu_notifier(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - /* allocate/free data structure for uncore box */ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - uncore_cpu_prepare(cpu, -1); - break; - case CPU_STARTING: - uncore_cpu_starting(cpu); - break; - case CPU_UP_CANCELED: - case CPU_DYING: - uncore_cpu_dying(cpu); - break; - case CPU_ONLINE: - case CPU_DEAD: - uncore_kfree_boxes(); - break; - default: - break; - } - - /* select the cpu that collects uncore events */ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_FAILED: - case CPU_STARTING: - uncore_event_init_cpu(cpu); - break; - case CPU_DOWN_PREPARE: - uncore_event_exit_cpu(cpu); - break; - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block uncore_cpu_nb = { - .notifier_call = uncore_cpu_notifier, - /* - * to migrate uncore events, our notifier should be executed - * before perf core's notifier. - */ - .priority = CPU_PRI_PERF + 1, -}; - -static void __init uncore_cpu_setup(void *dummy) -{ - uncore_cpu_starting(smp_processor_id()); -} - -static int __init uncore_cpu_init(void) -{ - int ret; - - switch (boot_cpu_data.x86_model) { - case 26: /* Nehalem */ - case 30: - case 37: /* Westmere */ - case 44: - nhm_uncore_cpu_init(); - break; - case 42: /* Sandy Bridge */ - case 58: /* Ivy Bridge */ - case 60: /* Haswell */ - case 69: /* Haswell */ - case 70: /* Haswell */ - case 61: /* Broadwell */ - case 71: /* Broadwell */ - snb_uncore_cpu_init(); - break; - case 45: /* Sandy Bridge-EP */ - snbep_uncore_cpu_init(); - break; - case 46: /* Nehalem-EX */ - case 47: /* Westmere-EX aka. Xeon E7 */ - nhmex_uncore_cpu_init(); - break; - case 62: /* Ivy Bridge-EP */ - ivbep_uncore_cpu_init(); - break; - case 63: /* Haswell-EP */ - hswep_uncore_cpu_init(); - break; - case 79: /* BDX-EP */ - case 86: /* BDX-DE */ - bdx_uncore_cpu_init(); - break; - case 87: /* Knights Landing */ - knl_uncore_cpu_init(); - break; - default: - return 0; - } - - ret = uncore_types_init(uncore_msr_uncores); - if (ret) - return ret; - - return 0; -} - -static int __init uncore_pmus_register(void) -{ - struct intel_uncore_pmu *pmu; - struct intel_uncore_type *type; - int i, j; - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - uncore_pmu_register(pmu); - } - } - - return 0; -} - -static void __init uncore_cpumask_init(void) -{ - int cpu; - - /* - * ony invoke once from msr or pci init code - */ - if (!cpumask_empty(&uncore_cpu_mask)) - return; - - cpu_notifier_register_begin(); - - for_each_online_cpu(cpu) { - int i, phys_id = topology_physical_package_id(cpu); - - for_each_cpu(i, &uncore_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) { - phys_id = -1; - break; - } - } - if (phys_id < 0) - continue; - - uncore_cpu_prepare(cpu, phys_id); - uncore_event_init_cpu(cpu); - } - on_each_cpu(uncore_cpu_setup, NULL, 1); - - __register_cpu_notifier(&uncore_cpu_nb); - - cpu_notifier_register_done(); -} - - -static int __init intel_uncore_init(void) -{ - int ret; - - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) - return -ENODEV; - - if (cpu_has_hypervisor) - return -ENODEV; - - ret = uncore_pci_init(); - if (ret) - goto fail; - ret = uncore_cpu_init(); - if (ret) { - uncore_pci_exit(); - goto fail; - } - uncore_cpumask_init(); - - uncore_pmus_register(); - return 0; -fail: - return ret; -} -device_initcall(intel_uncore_init); diff --git 
a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h deleted file mode 100644 index a7086b8..0000000 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ /dev/null @@ -1,357 +0,0 @@ -#include -#include -#include -#include -#include "perf_event.h" - -#define UNCORE_PMU_NAME_LEN 32 -#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC) -#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC) - -#define UNCORE_FIXED_EVENT 0xff -#define UNCORE_PMC_IDX_MAX_GENERIC 8 -#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC -#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) - -#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx) -#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff) -#define UNCORE_PCI_DEV_IDX(data) (data & 0xff) -#define UNCORE_EXTRA_PCI_DEV 0xff -#define UNCORE_EXTRA_PCI_DEV_MAX 3 - -/* support up to 8 sockets */ -#define UNCORE_SOCKET_MAX 8 - -#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) - -struct intel_uncore_ops; -struct intel_uncore_pmu; -struct intel_uncore_box; -struct uncore_event_desc; - -struct intel_uncore_type { - const char *name; - int num_counters; - int num_boxes; - int perf_ctr_bits; - int fixed_ctr_bits; - unsigned perf_ctr; - unsigned event_ctl; - unsigned event_mask; - unsigned fixed_ctr; - unsigned fixed_ctl; - unsigned box_ctl; - unsigned msr_offset; - unsigned num_shared_regs:8; - unsigned single_fixed:1; - unsigned pair_ctr_ctl:1; - unsigned *msr_offsets; - struct event_constraint unconstrainted; - struct event_constraint *constraints; - struct intel_uncore_pmu *pmus; - struct intel_uncore_ops *ops; - struct uncore_event_desc *event_descs; - const struct attribute_group *attr_groups[4]; - struct pmu *pmu; /* for custom pmu ops */ -}; - -#define pmu_group attr_groups[0] -#define format_group attr_groups[1] -#define events_group attr_groups[2] - -struct intel_uncore_ops { - void (*init_box)(struct intel_uncore_box *); - void (*disable_box)(struct intel_uncore_box *); - void (*enable_box)(struct intel_uncore_box *); - void (*disable_event)(struct intel_uncore_box *, struct perf_event *); - void (*enable_event)(struct intel_uncore_box *, struct perf_event *); - u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *); - int (*hw_config)(struct intel_uncore_box *, struct perf_event *); - struct event_constraint *(*get_constraint)(struct intel_uncore_box *, - struct perf_event *); - void (*put_constraint)(struct intel_uncore_box *, struct perf_event *); -}; - -struct intel_uncore_pmu { - struct pmu pmu; - char name[UNCORE_PMU_NAME_LEN]; - int pmu_idx; - int func_id; - struct intel_uncore_type *type; - struct intel_uncore_box ** __percpu box; - struct list_head box_list; -}; - -struct intel_uncore_extra_reg { - raw_spinlock_t lock; - u64 config, config1, config2; - atomic_t ref; -}; - -struct intel_uncore_box { - int phys_id; - int n_active; /* number of active events */ - int n_events; - int cpu; /* cpu to collect events */ - unsigned long flags; - atomic_t refcnt; - struct perf_event *events[UNCORE_PMC_IDX_MAX]; - struct perf_event *event_list[UNCORE_PMC_IDX_MAX]; - struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX]; - unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; - u64 tags[UNCORE_PMC_IDX_MAX]; - struct pci_dev *pci_dev; - struct intel_uncore_pmu *pmu; - u64 hrtimer_duration; /* hrtimer timeout for this box */ - struct hrtimer hrtimer; - struct list_head list; - struct list_head active_list; - void *io_addr; - struct 
intel_uncore_extra_reg shared_regs[0]; -}; - -#define UNCORE_BOX_FLAG_INITIATED 0 - -struct uncore_event_desc { - struct kobj_attribute attr; - const char *config; -}; - -struct pci2phy_map { - struct list_head list; - int segment; - int pbus_to_physid[256]; -}; - -int uncore_pcibus_to_physid(struct pci_bus *bus); -struct pci2phy_map *__find_pci2phy_map(int segment); - -ssize_t uncore_event_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf); - -#define INTEL_UNCORE_EVENT_DESC(_name, _config) \ -{ \ - .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ - .config = _config, \ -} - -#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \ -static ssize_t __uncore_##_var##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, \ - char *page) \ -{ \ - BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ - return sprintf(page, _format "\n"); \ -} \ -static struct kobj_attribute format_attr_##_var = \ - __ATTR(_name, 0444, __uncore_##_var##_show, NULL) - -static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box) -{ - return box->pmu->type->box_ctl; -} - -static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box) -{ - return box->pmu->type->fixed_ctl; -} - -static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box) -{ - return box->pmu->type->fixed_ctr; -} - -static inline -unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx) -{ - return idx * 4 + box->pmu->type->event_ctl; -} - -static inline -unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx) -{ - return idx * 8 + box->pmu->type->perf_ctr; -} - -static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box) -{ - struct intel_uncore_pmu *pmu = box->pmu; - return pmu->type->msr_offsets ? - pmu->type->msr_offsets[pmu->pmu_idx] : - pmu->type->msr_offset * pmu->pmu_idx; -} - -static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) -{ - if (!box->pmu->type->box_ctl) - return 0; - return box->pmu->type->box_ctl + uncore_msr_box_offset(box); -} - -static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) -{ - if (!box->pmu->type->fixed_ctl) - return 0; - return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box); -} - -static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) -{ - return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box); -} - -static inline -unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) -{ - return box->pmu->type->event_ctl + - (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + - uncore_msr_box_offset(box); -} - -static inline -unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) -{ - return box->pmu->type->perf_ctr + - (box->pmu->type->pair_ctr_ctl ? 
2 * idx : idx) + - uncore_msr_box_offset(box); -} - -static inline -unsigned uncore_fixed_ctl(struct intel_uncore_box *box) -{ - if (box->pci_dev) - return uncore_pci_fixed_ctl(box); - else - return uncore_msr_fixed_ctl(box); -} - -static inline -unsigned uncore_fixed_ctr(struct intel_uncore_box *box) -{ - if (box->pci_dev) - return uncore_pci_fixed_ctr(box); - else - return uncore_msr_fixed_ctr(box); -} - -static inline -unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx) -{ - if (box->pci_dev) - return uncore_pci_event_ctl(box, idx); - else - return uncore_msr_event_ctl(box, idx); -} - -static inline -unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx) -{ - if (box->pci_dev) - return uncore_pci_perf_ctr(box, idx); - else - return uncore_msr_perf_ctr(box, idx); -} - -static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box) -{ - return box->pmu->type->perf_ctr_bits; -} - -static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box) -{ - return box->pmu->type->fixed_ctr_bits; -} - -static inline int uncore_num_counters(struct intel_uncore_box *box) -{ - return box->pmu->type->num_counters; -} - -static inline void uncore_disable_box(struct intel_uncore_box *box) -{ - if (box->pmu->type->ops->disable_box) - box->pmu->type->ops->disable_box(box); -} - -static inline void uncore_enable_box(struct intel_uncore_box *box) -{ - if (box->pmu->type->ops->enable_box) - box->pmu->type->ops->enable_box(box); -} - -static inline void uncore_disable_event(struct intel_uncore_box *box, - struct perf_event *event) -{ - box->pmu->type->ops->disable_event(box, event); -} - -static inline void uncore_enable_event(struct intel_uncore_box *box, - struct perf_event *event) -{ - box->pmu->type->ops->enable_event(box, event); -} - -static inline u64 uncore_read_counter(struct intel_uncore_box *box, - struct perf_event *event) -{ - return box->pmu->type->ops->read_counter(box, event); -} - -static inline void uncore_box_init(struct intel_uncore_box *box) -{ - if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { - if (box->pmu->type->ops->init_box) - box->pmu->type->ops->init_box(box); - } -} - -static inline bool uncore_box_is_fake(struct intel_uncore_box *box) -{ - return (box->phys_id < 0); -} - -struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event); -struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu); -struct intel_uncore_box *uncore_event_to_box(struct perf_event *event); -u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event); -void uncore_pmu_start_hrtimer(struct intel_uncore_box *box); -void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box); -void uncore_pmu_event_read(struct perf_event *event); -void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event); -struct event_constraint * -uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event); -void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event); -u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx); - -extern struct intel_uncore_type **uncore_msr_uncores; -extern struct intel_uncore_type **uncore_pci_uncores; -extern struct pci_driver *uncore_pci_driver; -extern raw_spinlock_t pci2phy_map_lock; -extern struct list_head pci2phy_map_head; -extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; -extern struct event_constraint uncore_constraint_empty; - -/* perf_event_intel_uncore_snb.c */ -int 
snb_uncore_pci_init(void); -int ivb_uncore_pci_init(void); -int hsw_uncore_pci_init(void); -int bdw_uncore_pci_init(void); -int skl_uncore_pci_init(void); -void snb_uncore_cpu_init(void); -void nhm_uncore_cpu_init(void); -int snb_pci2phy_map_init(int devid); - -/* perf_event_intel_uncore_snbep.c */ -int snbep_uncore_pci_init(void); -void snbep_uncore_cpu_init(void); -int ivbep_uncore_pci_init(void); -void ivbep_uncore_cpu_init(void); -int hswep_uncore_pci_init(void); -void hswep_uncore_cpu_init(void); -int bdx_uncore_pci_init(void); -void bdx_uncore_cpu_init(void); -int knl_uncore_pci_init(void); -void knl_uncore_cpu_init(void); - -/* perf_event_intel_uncore_nhmex.c */ -void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c index 2749965..d70cfe0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c @@ -1,5 +1,5 @@ /* Nehalem-EX/Westmere-EX uncore support */ -#include "perf_event_intel_uncore.h" +#include "../../events/intel/uncore.h" /* NHM-EX event control */ #define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c index 2bd030d..e0e41f5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c @@ -1,5 +1,5 @@ /* Nehalem/SandBridge/Haswell uncore support */ -#include "perf_event_intel_uncore.h" +#include "../../events/intel/uncore.h" /* Uncore IMC PCI IDs */ #define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100 diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index 33acb88..188e18a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -1,6 +1,5 @@ /* SandyBridge-EP/IvyTown uncore support */ -#include "perf_event_intel_uncore.h" - +#include "../../events/intel/uncore.h" /* SNB-EP Box level control */ #define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0) -- cgit v0.10.2 From 35bf705c25e875f40ff8ed6af415315335973977 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:16 +0100 Subject: perf/x86: Move perf_event_intel_uncore_nhmex.c => x86/events/intel/uncore_nmhex.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-11-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index aae3e53..70aedfe 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -8,4 +8,4 @@ endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/lbr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/pt.o intel/rapl.o -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o +obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c new file mode 100644 index 0000000..e89bf5c --- /dev/null +++ b/arch/x86/events/intel/uncore_nhmex.c @@ -0,0 +1,1221 @@ +/* Nehalem-EX/Westmere-EX uncore support */ +#include "uncore.h" + +/* NHM-EX event control */ +#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff +#define 
NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00 +#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0) +#define NHMEX_PMON_CTL_EDGE_DET (1 << 18) +#define NHMEX_PMON_CTL_PMI_EN (1 << 20) +#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22) +#define NHMEX_PMON_CTL_INVERT (1 << 23) +#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000 +#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \ + NHMEX_PMON_CTL_UMASK_MASK | \ + NHMEX_PMON_CTL_EDGE_DET | \ + NHMEX_PMON_CTL_INVERT | \ + NHMEX_PMON_CTL_TRESH_MASK) + +/* NHM-EX Ubox */ +#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00 +#define NHMEX_U_MSR_PMON_CTR 0xc11 +#define NHMEX_U_MSR_PMON_EV_SEL 0xc10 + +#define NHMEX_U_PMON_GLOBAL_EN (1 << 0) +#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e +#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28) +#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29) +#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31) + +#define NHMEX_U_PMON_RAW_EVENT_MASK \ + (NHMEX_PMON_CTL_EV_SEL_MASK | \ + NHMEX_PMON_CTL_EDGE_DET) + +/* NHM-EX Cbox */ +#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00 +#define NHMEX_C0_MSR_PMON_CTR0 0xd11 +#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10 +#define NHMEX_C_MSR_OFFSET 0x20 + +/* NHM-EX Bbox */ +#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20 +#define NHMEX_B0_MSR_PMON_CTR0 0xc31 +#define NHMEX_B0_MSR_PMON_CTL0 0xc30 +#define NHMEX_B_MSR_OFFSET 0x40 +#define NHMEX_B0_MSR_MATCH 0xe45 +#define NHMEX_B0_MSR_MASK 0xe46 +#define NHMEX_B1_MSR_MATCH 0xe4d +#define NHMEX_B1_MSR_MASK 0xe4e + +#define NHMEX_B_PMON_CTL_EN (1 << 0) +#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1 +#define NHMEX_B_PMON_CTL_EV_SEL_MASK \ + (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT) +#define NHMEX_B_PMON_CTR_SHIFT 6 +#define NHMEX_B_PMON_CTR_MASK \ + (0x3 << NHMEX_B_PMON_CTR_SHIFT) +#define NHMEX_B_PMON_RAW_EVENT_MASK \ + (NHMEX_B_PMON_CTL_EV_SEL_MASK | \ + NHMEX_B_PMON_CTR_MASK) + +/* NHM-EX Sbox */ +#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40 +#define NHMEX_S0_MSR_PMON_CTR0 0xc51 +#define NHMEX_S0_MSR_PMON_CTL0 0xc50 +#define NHMEX_S_MSR_OFFSET 0x80 +#define NHMEX_S0_MSR_MM_CFG 0xe48 +#define NHMEX_S0_MSR_MATCH 0xe49 +#define NHMEX_S0_MSR_MASK 0xe4a +#define NHMEX_S1_MSR_MM_CFG 0xe58 +#define NHMEX_S1_MSR_MATCH 0xe59 +#define NHMEX_S1_MSR_MASK 0xe5a + +#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63) +#define NHMEX_S_EVENT_TO_R_PROG_EV 0 + +/* NHM-EX Mbox */ +#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0 +#define NHMEX_M0_MSR_PMU_DSP 0xca5 +#define NHMEX_M0_MSR_PMU_ISS 0xca6 +#define NHMEX_M0_MSR_PMU_MAP 0xca7 +#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8 +#define NHMEX_M0_MSR_PMU_PGT 0xca9 +#define NHMEX_M0_MSR_PMU_PLD 0xcaa +#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab +#define NHMEX_M0_MSR_PMU_CTL0 0xcb0 +#define NHMEX_M0_MSR_PMU_CNT0 0xcb1 +#define NHMEX_M_MSR_OFFSET 0x40 +#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54 +#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c + +#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63) +#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL +#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL +#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34 + +#define NHMEX_M_PMON_CTL_EN (1 << 0) +#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1) +#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2 +#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \ + (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT) +#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4 +#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \ + (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT) +#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6) +#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7) +#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9 +#define NHMEX_M_PMON_CTL_INC_SEL_MASK \ + (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) +#define 
NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19 +#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \ + (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) +#define NHMEX_M_PMON_RAW_EVENT_MASK \ + (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \ + NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \ + NHMEX_M_PMON_CTL_WRAP_MODE | \ + NHMEX_M_PMON_CTL_FLAG_MODE | \ + NHMEX_M_PMON_CTL_INC_SEL_MASK | \ + NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK) + +#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23)) +#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n))) + +#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24)) +#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n))) + +/* + * use the 9~13 bits to select event If the 7th bit is not set, + * otherwise use the 19~21 bits to select event. + */ +#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) +#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_INC_SEL_EXTAR_REG(c, r) \ + EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \ + MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r) +#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \ + EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \ + MBOX_SET_FLAG_SEL_MASK, \ + (u64)-1, NHMEX_M_##r) + +/* NHM-EX Rbox */ +#define NHMEX_R_MSR_GLOBAL_CTL 0xe00 +#define NHMEX_R_MSR_PMON_CTL0 0xe10 +#define NHMEX_R_MSR_PMON_CNT0 0xe11 +#define NHMEX_R_MSR_OFFSET 0x20 + +#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \ + ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4)) +#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n)) +#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n)) +#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \ + (((n) < 4 ? 
0 : 0x10) + (n) * 4) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \ + (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \ + (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2) + +#define NHMEX_R_PMON_CTL_EN (1 << 0) +#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1 +#define NHMEX_R_PMON_CTL_EV_SEL_MASK \ + (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT) +#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6) +#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK + +/* NHM-EX Wbox */ +#define NHMEX_W_MSR_GLOBAL_CTL 0xc80 +#define NHMEX_W_MSR_PMON_CNT0 0xc90 +#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91 +#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394 +#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395 + +#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31) + +#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \ + ((1ULL << (n)) - 1))) + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7"); +DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63"); + +static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box) +{ + wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL); +} + +static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + u64 config; + + if (msr) { + rdmsrl(msr, config); + config &= ~((1ULL << uncore_num_counters(box)) - 1); + /* WBox has a fixed counter */ + if (uncore_msr_fixed_ctl(box)) + config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN; + wrmsrl(msr, config); + } +} + +static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + u64 config; + + if (msr) { + rdmsrl(msr, config); + config |= (1ULL << uncore_num_counters(box)) - 1; + /* WBox has a fixed counter */ + if (uncore_msr_fixed_ctl(box)) + config |= NHMEX_W_PMON_GLOBAL_FIXED_EN; + wrmsrl(msr, config); + } +} + +static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + wrmsrl(event->hw.config_base, 0); +} + +static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx >= UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0); + else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0) + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); + else + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); +} + +#define NHMEX_UNCORE_OPS_COMMON_INIT() \ + .init_box = nhmex_uncore_msr_init_box, \ + .disable_box = nhmex_uncore_msr_disable_box, \ + .enable_box = nhmex_uncore_msr_enable_box, \ + .disable_event = nhmex_uncore_msr_disable_event, \ + .read_counter = uncore_msr_read_counter + +static struct intel_uncore_ops nhmex_uncore_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + 
.enable_event = nhmex_uncore_msr_enable_event, +}; + +static struct attribute *nhmex_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_edge.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_ubox_format_group = { + .name = "format", + .attrs = nhmex_uncore_ubox_formats_attr, +}; + +static struct intel_uncore_type nhmex_uncore_ubox = { + .name = "ubox", + .num_counters = 1, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_U_MSR_PMON_EV_SEL, + .perf_ctr = NHMEX_U_MSR_PMON_CTR, + .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_ubox_format_group +}; + +static struct attribute *nhmex_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_cbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_cbox_formats_attr, +}; + +/* msr offset for each instance of cbox */ +static unsigned nhmex_cbox_msr_offsets[] = { + 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0, +}; + +static struct intel_uncore_type nhmex_uncore_cbox = { + .name = "cbox", + .num_counters = 6, + .num_boxes = 10, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0, + .perf_ctr = NHMEX_C0_MSR_PMON_CTR0, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL, + .msr_offsets = nhmex_cbox_msr_offsets, + .pair_ctr_ctl = 1, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_cbox_format_group +}; + +static struct uncore_event_desc nhmex_uncore_wbox_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type nhmex_uncore_wbox = { + .name = "wbox", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_W_MSR_PMON_CNT0, + .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0, + .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR, + .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_W_MSR_GLOBAL_CTL, + .pair_ctr_ctl = 1, + .event_descs = nhmex_uncore_wbox_events, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_cbox_format_group +}; + +static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int ctr, ev_sel; + + ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >> + NHMEX_B_PMON_CTR_SHIFT; + ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >> + NHMEX_B_PMON_CTL_EV_SEL_SHIFT; + + /* events that do not use the match/mask registers */ + if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) || + (ctr == 2 && ev_sel != 0x4) || ctr == 3) + return 0; + + if (box->pmu->pmu_idx == 0) + reg1->reg = NHMEX_B0_MSR_MATCH; + else + reg1->reg = NHMEX_B1_MSR_MATCH; + reg1->idx = 0; + reg1->config = event->attr.config1; + reg2->config = event->attr.config2; + return 0; +} + +static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + wrmsrl(reg1->reg, reg1->config); + wrmsrl(reg1->reg + 1, reg2->config); + } + 
wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | + (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK)); +} + +/* + * The Bbox has 4 counters, but each counter monitors different events. + * Use bits 6-7 in the event config to select counter. + */ +static struct event_constraint nhmex_uncore_bbox_constraints[] = { + EVENT_CONSTRAINT(0 , 1, 0xc0), + EVENT_CONSTRAINT(0x40, 2, 0xc0), + EVENT_CONSTRAINT(0x80, 4, 0xc0), + EVENT_CONSTRAINT(0xc0, 8, 0xc0), + EVENT_CONSTRAINT_END, +}; + +static struct attribute *nhmex_uncore_bbox_formats_attr[] = { + &format_attr_event5.attr, + &format_attr_counter.attr, + &format_attr_match.attr, + &format_attr_mask.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_bbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_bbox_formats_attr, +}; + +static struct intel_uncore_ops nhmex_uncore_bbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_bbox_msr_enable_event, + .hw_config = nhmex_bbox_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_bbox = { + .name = "bbox", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_B0_MSR_PMON_CTL0, + .perf_ctr = NHMEX_B0_MSR_PMON_CTR0, + .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL, + .msr_offset = NHMEX_B_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 1, + .constraints = nhmex_uncore_bbox_constraints, + .ops = &nhmex_uncore_bbox_ops, + .format_group = &nhmex_uncore_bbox_format_group +}; + +static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + /* only TO_R_PROG_EV event uses the match/mask register */ + if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) != + NHMEX_S_EVENT_TO_R_PROG_EV) + return 0; + + if (box->pmu->pmu_idx == 0) + reg1->reg = NHMEX_S0_MSR_MM_CFG; + else + reg1->reg = NHMEX_S1_MSR_MM_CFG; + reg1->idx = 0; + reg1->config = event->attr.config1; + reg2->config = event->attr.config2; + return 0; +} + +static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + wrmsrl(reg1->reg, 0); + wrmsrl(reg1->reg + 1, reg1->config); + wrmsrl(reg1->reg + 2, reg2->config); + wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN); + } + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); +} + +static struct attribute *nhmex_uncore_sbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_match.attr, + &format_attr_mask.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_sbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_sbox_formats_attr, +}; + +static struct intel_uncore_ops nhmex_uncore_sbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_sbox_msr_enable_event, + .hw_config = nhmex_sbox_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_sbox = { + .name = "sbox", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = 
NHMEX_S0_MSR_PMON_CTL0, + .perf_ctr = NHMEX_S0_MSR_PMON_CTR0, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL, + .msr_offset = NHMEX_S_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 1, + .ops = &nhmex_uncore_sbox_ops, + .format_group = &nhmex_uncore_sbox_format_group +}; + +enum { + EXTRA_REG_NHMEX_M_FILTER, + EXTRA_REG_NHMEX_M_DSP, + EXTRA_REG_NHMEX_M_ISS, + EXTRA_REG_NHMEX_M_MAP, + EXTRA_REG_NHMEX_M_MSC_THR, + EXTRA_REG_NHMEX_M_PGT, + EXTRA_REG_NHMEX_M_PLD, + EXTRA_REG_NHMEX_M_ZDP_CTL_FVC, +}; + +static struct extra_reg nhmex_uncore_mbox_extra_regs[] = { + MBOX_INC_SEL_EXTAR_REG(0x0, DSP), + MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR), + MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR), + MBOX_INC_SEL_EXTAR_REG(0x9, ISS), + /* event 0xa uses two extra registers */ + MBOX_INC_SEL_EXTAR_REG(0xa, ISS), + MBOX_INC_SEL_EXTAR_REG(0xa, PLD), + MBOX_INC_SEL_EXTAR_REG(0xb, PLD), + /* events 0xd ~ 0x10 use the same extra register */ + MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0x16, PGT), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP), + EVENT_EXTRA_END +}; + +/* Nehalem-EX or Westmere-EX ? */ +static bool uncore_nhmex; + +static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + bool ret = false; + u64 mask; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + er = &box->shared_regs[idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || er->config == config) { + atomic_inc(&er->ref); + er->config = config; + ret = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + return ret; + } + /* + * The ZDP_CTL_FVC MSR has 4 fields which are used to control + * events 0xd ~ 0x10. Besides these 4 fields, there are additional + * fields which are shared. 
+ */ + idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + if (WARN_ON_ONCE(idx >= 4)) + return false; + + /* mask of the shared fields */ + if (uncore_nhmex) + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK; + else + mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK; + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + + raw_spin_lock_irqsave(&er->lock, flags); + /* add mask of the non-shared field if it's in use */ + if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) { + if (uncore_nhmex) + mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + } + + if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) { + atomic_add(1 << (idx * 8), &er->ref); + if (uncore_nhmex) + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK | + NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK | + WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + er->config &= ~mask; + er->config |= (config & mask); + ret = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + return ret; +} + +static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + er = &box->shared_regs[idx]; + atomic_dec(&er->ref); + return; + } + + idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + atomic_sub(1 << (idx * 8), &er->ref); +} + +static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8); + u64 config = reg1->config; + + /* get the non-shared control bits and shift them */ + idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + if (uncore_nhmex) + config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + if (new_idx > orig_idx) { + idx = new_idx - orig_idx; + config <<= 3 * idx; + } else { + idx = orig_idx - new_idx; + config >>= 3 * idx; + } + + /* add the shared control bits back */ + if (uncore_nhmex) + config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; + else + config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; + config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; + if (modify) { + /* adjust the main event selector */ + if (new_idx > orig_idx) + hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; + else + hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; + reg1->config = config; + reg1->idx = ~0xff | new_idx; + } + return config; +} + +static struct event_constraint * +nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + int i, idx[2], alloc = 0; + u64 config1 = reg1->config; + + idx[0] = __BITS_VALUE(reg1->idx, 0, 8); + idx[1] = __BITS_VALUE(reg1->idx, 1, 8); +again: + for (i = 0; i < 2; i++) { + if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) + idx[i] = 0xff; + + if (idx[i] == 0xff) + continue; + + if (!nhmex_mbox_get_shared_reg(box, idx[i], + __BITS_VALUE(config1, i, 32))) + goto fail; + alloc |= (0x1 << i); + } + + /* for the match/mask registers */ + if (reg2->idx != EXTRA_REG_NONE && + (uncore_box_is_fake(box) || !reg2->alloc) && + !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config)) + goto fail; + + /* + * If it's a fake box -- as per validate_{group,event}() we + * shouldn't touch event state and we can avoid 
doing so + * since both will only call get_event_constraints() once + * on each event, this avoids the need for reg->alloc. + */ + if (!uncore_box_is_fake(box)) { + if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) + nhmex_mbox_alter_er(event, idx[0], true); + reg1->alloc |= alloc; + if (reg2->idx != EXTRA_REG_NONE) + reg2->alloc = 1; + } + return NULL; +fail: + if (idx[0] != 0xff && !(alloc & 0x1) && + idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + /* + * events 0xd ~ 0x10 are functional identical, but are + * controlled by different fields in the ZDP_CTL_FVC + * register. If we failed to take one field, try the + * rest 3 choices. + */ + BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff); + idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + idx[0] = (idx[0] + 1) % 4; + idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) { + config1 = nhmex_mbox_alter_er(event, idx[0], false); + goto again; + } + } + + if (alloc & 0x1) + nhmex_mbox_put_shared_reg(box, idx[0]); + if (alloc & 0x2) + nhmex_mbox_put_shared_reg(box, idx[1]); + return &uncore_constraint_empty; +} + +static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + + if (uncore_box_is_fake(box)) + return; + + if (reg1->alloc & 0x1) + nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8)); + if (reg1->alloc & 0x2) + nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8)); + reg1->alloc = 0; + + if (reg2->alloc) { + nhmex_mbox_put_shared_reg(box, reg2->idx); + reg2->alloc = 0; + } +} + +static int nhmex_mbox_extra_reg_idx(struct extra_reg *er) +{ + if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) + return er->idx; + return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd; +} + +static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_type *type = box->pmu->type; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + struct extra_reg *er; + unsigned msr; + int reg_idx = 0; + /* + * The mbox events may require 2 extra MSRs at the most. But only + * the lower 32 bits in these MSRs are significant, so we can use + * config1 to pass two MSRs' config. + */ + for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + if (event->attr.config1 & ~er->valid_mask) + return -EINVAL; + + msr = er->msr + type->msr_offset * box->pmu->pmu_idx; + if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff)) + return -EINVAL; + + /* always use the 32~63 bits to pass the PLD config */ + if (er->idx == EXTRA_REG_NHMEX_M_PLD) + reg_idx = 1; + else if (WARN_ON_ONCE(reg_idx > 0)) + return -EINVAL; + + reg1->idx &= ~(0xff << (reg_idx * 8)); + reg1->reg &= ~(0xffff << (reg_idx * 16)); + reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8); + reg1->reg |= msr << (reg_idx * 16); + reg1->config = event->attr.config1; + reg_idx++; + } + /* + * The mbox only provides ability to perform address matching + * for the PLD events. 
+ */ + if (reg_idx == 2) { + reg2->idx = EXTRA_REG_NHMEX_M_FILTER; + if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN) + reg2->config = event->attr.config2; + else + reg2->config = ~0ULL; + if (box->pmu->pmu_idx == 0) + reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG; + else + reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG; + } + return 0; +} + +static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + u64 config; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) + return box->shared_regs[idx].config; + + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + raw_spin_lock_irqsave(&er->lock, flags); + config = er->config; + raw_spin_unlock_irqrestore(&er->lock, flags); + return config; +} + +static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int idx; + + idx = __BITS_VALUE(reg1->idx, 0, 8); + if (idx != 0xff) + wrmsrl(__BITS_VALUE(reg1->reg, 0, 16), + nhmex_mbox_shared_reg_config(box, idx)); + idx = __BITS_VALUE(reg1->idx, 1, 8); + if (idx != 0xff) + wrmsrl(__BITS_VALUE(reg1->reg, 1, 16), + nhmex_mbox_shared_reg_config(box, idx)); + + if (reg2->idx != EXTRA_REG_NONE) { + wrmsrl(reg2->reg, 0); + if (reg2->config != ~0ULL) { + wrmsrl(reg2->reg + 1, + reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK); + wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & + (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT)); + wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); + } + } + + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); +} + +DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3"); +DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5"); +DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6"); +DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7"); +DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13"); +DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21"); +DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63"); +DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33"); +DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61"); +DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63"); + +static struct attribute *nhmex_uncore_mbox_formats_attr[] = { + &format_attr_count_mode.attr, + &format_attr_storage_mode.attr, + &format_attr_wrap_mode.attr, + &format_attr_flag_mode.attr, + &format_attr_inc_sel.attr, + &format_attr_set_flag_sel.attr, + &format_attr_filter_cfg_en.attr, + &format_attr_filter_match.attr, + &format_attr_filter_mask.attr, + &format_attr_dsp.attr, + &format_attr_thr.attr, + &format_attr_fvc.attr, + &format_attr_pgt.attr, + &format_attr_map.attr, + &format_attr_iss.attr, + &format_attr_pld.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_mbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_mbox_formats_attr, +}; + +static struct uncore_event_desc nhmex_uncore_mbox_events[] = { + INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"), + INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, 
"inc_sel=0xd,fvc=0x2820"), + { /* end: all zeroes */ }, +}; + +static struct uncore_event_desc wsmex_uncore_mbox_events[] = { + INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"), + INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhmex_uncore_mbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_mbox_msr_enable_event, + .hw_config = nhmex_mbox_hw_config, + .get_constraint = nhmex_mbox_get_constraint, + .put_constraint = nhmex_mbox_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_mbox = { + .name = "mbox", + .num_counters = 6, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_M0_MSR_PMU_CTL0, + .perf_ctr = NHMEX_M0_MSR_PMU_CNT0, + .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL, + .msr_offset = NHMEX_M_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 8, + .event_descs = nhmex_uncore_mbox_events, + .ops = &nhmex_uncore_mbox_ops, + .format_group = &nhmex_uncore_mbox_format_group, +}; + +static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + /* adjust the main event selector and extra register index */ + if (reg1->idx % 2) { + reg1->idx--; + hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + } else { + reg1->idx++; + hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + } + + /* adjust extra register config */ + switch (reg1->idx % 6) { + case 2: + /* shift the 8~15 bits to the 0~7 bits */ + reg1->config >>= 8; + break; + case 3: + /* shift the 0~7 bits to the 8~15 bits */ + reg1->config <<= 8; + break; + } +} + +/* + * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7. + * An event set consists of 6 events, the 3rd and 4th events in + * an event set use the same extra register. So an event set uses + * 5 extra registers. + */ +static struct event_constraint * +nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + struct intel_uncore_extra_reg *er; + unsigned long flags; + int idx, er_idx; + u64 config1; + bool ok = false; + + if (!uncore_box_is_fake(box) && reg1->alloc) + return NULL; + + idx = reg1->idx % 6; + config1 = reg1->config; +again: + er_idx = idx; + /* the 3rd and 4th events use the same extra register */ + if (er_idx > 2) + er_idx--; + er_idx += (reg1->idx / 6) * 5; + + er = &box->shared_regs[er_idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (idx < 2) { + if (!atomic_read(&er->ref) || er->config == reg1->config) { + atomic_inc(&er->ref); + er->config = reg1->config; + ok = true; + } + } else if (idx == 2 || idx == 3) { + /* + * these two events use different fields in a extra register, + * the 0~7 bits and the 8~15 bits respectively. 
+ */ + u64 mask = 0xff << ((idx - 2) * 8); + if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) || + !((er->config ^ config1) & mask)) { + atomic_add(1 << ((idx - 2) * 8), &er->ref); + er->config &= ~mask; + er->config |= config1 & mask; + ok = true; + } + } else { + if (!atomic_read(&er->ref) || + (er->config == (hwc->config >> 32) && + er->config1 == reg1->config && + er->config2 == reg2->config)) { + atomic_inc(&er->ref); + er->config = (hwc->config >> 32); + er->config1 = reg1->config; + er->config2 = reg2->config; + ok = true; + } + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (!ok) { + /* + * The Rbox events are always in pairs. The paired + * events are functional identical, but use different + * extra registers. If we failed to take an extra + * register, try the alternative. + */ + idx ^= 1; + if (idx != reg1->idx % 6) { + if (idx == 2) + config1 >>= 8; + else if (idx == 3) + config1 <<= 8; + goto again; + } + } else { + if (!uncore_box_is_fake(box)) { + if (idx != reg1->idx % 6) + nhmex_rbox_alter_er(box, event); + reg1->alloc = 1; + } + return NULL; + } + return &uncore_constraint_empty; +} + +static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + int idx, er_idx; + + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + idx = reg1->idx % 6; + er_idx = idx; + if (er_idx > 2) + er_idx--; + er_idx += (reg1->idx / 6) * 5; + + er = &box->shared_regs[er_idx]; + if (idx == 2 || idx == 3) + atomic_sub(1 << ((idx - 2) * 8), &er->ref); + else + atomic_dec(&er->ref); + + reg1->alloc = 0; +} + +static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + int idx; + + idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >> + NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + if (idx >= 0x18) + return -EINVAL; + + reg1->idx = idx; + reg1->config = event->attr.config1; + + switch (idx % 6) { + case 4: + case 5: + hwc->config |= event->attr.config & (~0ULL << 32); + reg2->config = event->attr.config2; + break; + } + return 0; +} + +static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int idx, port; + + idx = reg1->idx; + port = idx / 6 + box->pmu->pmu_idx * 4; + + switch (idx % 6) { + case 0: + wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config); + break; + case 1: + wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config); + break; + case 2: + case 3: + wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port), + uncore_shared_reg_config(box, 2 + (idx / 6) * 5)); + break; + case 4: + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port), + hwc->config >> 32); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config); + break; + case 5: + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port), + hwc->config >> 32); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config); + break; + } + + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | + (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK)); +} + +DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63"); 
+DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15"); +DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31"); + +static struct attribute *nhmex_uncore_rbox_formats_attr[] = { + &format_attr_event5.attr, + &format_attr_xbr_mm_cfg.attr, + &format_attr_xbr_match.attr, + &format_attr_xbr_mask.attr, + &format_attr_qlx_cfg.attr, + &format_attr_iperf_cfg.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_rbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_rbox_formats_attr, +}; + +static struct uncore_event_desc nhmex_uncore_rbox_events[] = { + INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"), + INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"), + INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"), + INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"), + INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"), + INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhmex_uncore_rbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_rbox_msr_enable_event, + .hw_config = nhmex_rbox_hw_config, + .get_constraint = nhmex_rbox_get_constraint, + .put_constraint = nhmex_rbox_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_rbox = { + .name = "rbox", + .num_counters = 8, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_R_MSR_PMON_CTL0, + .perf_ctr = NHMEX_R_MSR_PMON_CNT0, + .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_R_MSR_GLOBAL_CTL, + .msr_offset = NHMEX_R_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 20, + .event_descs = nhmex_uncore_rbox_events, + .ops = &nhmex_uncore_rbox_ops, + .format_group = &nhmex_uncore_rbox_format_group +}; + +static struct intel_uncore_type *nhmex_msr_uncores[] = { + &nhmex_uncore_ubox, + &nhmex_uncore_cbox, + &nhmex_uncore_bbox, + &nhmex_uncore_sbox, + &nhmex_uncore_mbox, + &nhmex_uncore_rbox, + &nhmex_uncore_wbox, + NULL, +}; + +void nhmex_uncore_cpu_init(void) +{ + if (boot_cpu_data.x86_model == 46) + uncore_nhmex = true; + else + nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events; + if (nhmex_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + nhmex_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + uncore_msr_uncores = nhmex_msr_uncores; +} +/* end of Nehalem-EX uncore support */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 875d1b1..316f53a 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -34,8 +34,7 @@ ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore_snb.o \ - perf_event_intel_uncore_snbep.o \ - perf_event_intel_uncore_nhmex.o + perf_event_intel_uncore_snbep.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o endif diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c deleted file mode 100644 index d70cfe0..0000000 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c +++ /dev/null @@ -1,1221 +0,0 @@ -/* Nehalem-EX/Westmere-EX uncore support */ -#include "../../events/intel/uncore.h" - -/* NHM-EX event control */ -#define 
NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff -#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00 -#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0) -#define NHMEX_PMON_CTL_EDGE_DET (1 << 18) -#define NHMEX_PMON_CTL_PMI_EN (1 << 20) -#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22) -#define NHMEX_PMON_CTL_INVERT (1 << 23) -#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000 -#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \ - NHMEX_PMON_CTL_UMASK_MASK | \ - NHMEX_PMON_CTL_EDGE_DET | \ - NHMEX_PMON_CTL_INVERT | \ - NHMEX_PMON_CTL_TRESH_MASK) - -/* NHM-EX Ubox */ -#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00 -#define NHMEX_U_MSR_PMON_CTR 0xc11 -#define NHMEX_U_MSR_PMON_EV_SEL 0xc10 - -#define NHMEX_U_PMON_GLOBAL_EN (1 << 0) -#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e -#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28) -#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29) -#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31) - -#define NHMEX_U_PMON_RAW_EVENT_MASK \ - (NHMEX_PMON_CTL_EV_SEL_MASK | \ - NHMEX_PMON_CTL_EDGE_DET) - -/* NHM-EX Cbox */ -#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00 -#define NHMEX_C0_MSR_PMON_CTR0 0xd11 -#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10 -#define NHMEX_C_MSR_OFFSET 0x20 - -/* NHM-EX Bbox */ -#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20 -#define NHMEX_B0_MSR_PMON_CTR0 0xc31 -#define NHMEX_B0_MSR_PMON_CTL0 0xc30 -#define NHMEX_B_MSR_OFFSET 0x40 -#define NHMEX_B0_MSR_MATCH 0xe45 -#define NHMEX_B0_MSR_MASK 0xe46 -#define NHMEX_B1_MSR_MATCH 0xe4d -#define NHMEX_B1_MSR_MASK 0xe4e - -#define NHMEX_B_PMON_CTL_EN (1 << 0) -#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1 -#define NHMEX_B_PMON_CTL_EV_SEL_MASK \ - (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT) -#define NHMEX_B_PMON_CTR_SHIFT 6 -#define NHMEX_B_PMON_CTR_MASK \ - (0x3 << NHMEX_B_PMON_CTR_SHIFT) -#define NHMEX_B_PMON_RAW_EVENT_MASK \ - (NHMEX_B_PMON_CTL_EV_SEL_MASK | \ - NHMEX_B_PMON_CTR_MASK) - -/* NHM-EX Sbox */ -#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40 -#define NHMEX_S0_MSR_PMON_CTR0 0xc51 -#define NHMEX_S0_MSR_PMON_CTL0 0xc50 -#define NHMEX_S_MSR_OFFSET 0x80 -#define NHMEX_S0_MSR_MM_CFG 0xe48 -#define NHMEX_S0_MSR_MATCH 0xe49 -#define NHMEX_S0_MSR_MASK 0xe4a -#define NHMEX_S1_MSR_MM_CFG 0xe58 -#define NHMEX_S1_MSR_MATCH 0xe59 -#define NHMEX_S1_MSR_MASK 0xe5a - -#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63) -#define NHMEX_S_EVENT_TO_R_PROG_EV 0 - -/* NHM-EX Mbox */ -#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0 -#define NHMEX_M0_MSR_PMU_DSP 0xca5 -#define NHMEX_M0_MSR_PMU_ISS 0xca6 -#define NHMEX_M0_MSR_PMU_MAP 0xca7 -#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8 -#define NHMEX_M0_MSR_PMU_PGT 0xca9 -#define NHMEX_M0_MSR_PMU_PLD 0xcaa -#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab -#define NHMEX_M0_MSR_PMU_CTL0 0xcb0 -#define NHMEX_M0_MSR_PMU_CNT0 0xcb1 -#define NHMEX_M_MSR_OFFSET 0x40 -#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54 -#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c - -#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63) -#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL -#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL -#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34 - -#define NHMEX_M_PMON_CTL_EN (1 << 0) -#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1) -#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2 -#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \ - (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT) -#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4 -#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \ - (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT) -#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6) -#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7) -#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9 -#define NHMEX_M_PMON_CTL_INC_SEL_MASK \ - (0x1f 
<< NHMEX_M_PMON_CTL_INC_SEL_SHIFT) -#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19 -#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \ - (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) -#define NHMEX_M_PMON_RAW_EVENT_MASK \ - (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \ - NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \ - NHMEX_M_PMON_CTL_WRAP_MODE | \ - NHMEX_M_PMON_CTL_FLAG_MODE | \ - NHMEX_M_PMON_CTL_INC_SEL_MASK | \ - NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK) - -#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23)) -#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n))) - -#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24)) -#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n))) - -/* - * use the 9~13 bits to select event If the 7th bit is not set, - * otherwise use the 19~21 bits to select event. - */ -#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) -#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \ - NHMEX_M_PMON_CTL_FLAG_MODE) -#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \ - NHMEX_M_PMON_CTL_FLAG_MODE) -#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \ - NHMEX_M_PMON_CTL_FLAG_MODE) -#define MBOX_INC_SEL_EXTAR_REG(c, r) \ - EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \ - MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r) -#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \ - EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \ - MBOX_SET_FLAG_SEL_MASK, \ - (u64)-1, NHMEX_M_##r) - -/* NHM-EX Rbox */ -#define NHMEX_R_MSR_GLOBAL_CTL 0xe00 -#define NHMEX_R_MSR_PMON_CTL0 0xe10 -#define NHMEX_R_MSR_PMON_CNT0 0xe11 -#define NHMEX_R_MSR_OFFSET 0x20 - -#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \ - ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4)) -#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n)) -#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n)) -#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \ - (((n) < 4 ? 
0 : 0x10) + (n) * 4) -#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \ - (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) -#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \ - (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1) -#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \ - (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2) -#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \ - (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) -#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \ - (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1) -#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \ - (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2) - -#define NHMEX_R_PMON_CTL_EN (1 << 0) -#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1 -#define NHMEX_R_PMON_CTL_EV_SEL_MASK \ - (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT) -#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6) -#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK - -/* NHM-EX Wbox */ -#define NHMEX_W_MSR_GLOBAL_CTL 0xc80 -#define NHMEX_W_MSR_PMON_CNT0 0xc90 -#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91 -#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394 -#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395 - -#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31) - -#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \ - ((1ULL << (n)) - 1))) - -DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); -DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5"); -DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); -DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); -DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); -DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); -DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7"); -DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63"); -DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63"); - -static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box) -{ - wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL); -} - -static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box) -{ - unsigned msr = uncore_msr_box_ctl(box); - u64 config; - - if (msr) { - rdmsrl(msr, config); - config &= ~((1ULL << uncore_num_counters(box)) - 1); - /* WBox has a fixed counter */ - if (uncore_msr_fixed_ctl(box)) - config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN; - wrmsrl(msr, config); - } -} - -static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box) -{ - unsigned msr = uncore_msr_box_ctl(box); - u64 config; - - if (msr) { - rdmsrl(msr, config); - config |= (1ULL << uncore_num_counters(box)) - 1; - /* WBox has a fixed counter */ - if (uncore_msr_fixed_ctl(box)) - config |= NHMEX_W_PMON_GLOBAL_FIXED_EN; - wrmsrl(msr, config); - } -} - -static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - wrmsrl(event->hw.config_base, 0); -} - -static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (hwc->idx >= UNCORE_PMC_IDX_FIXED) - wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0); - else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0) - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); - else - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); -} - -#define NHMEX_UNCORE_OPS_COMMON_INIT() \ - .init_box = nhmex_uncore_msr_init_box, \ - .disable_box = nhmex_uncore_msr_disable_box, \ - .enable_box = nhmex_uncore_msr_enable_box, \ - .disable_event = nhmex_uncore_msr_disable_event, \ - .read_counter = uncore_msr_read_counter - -static struct intel_uncore_ops nhmex_uncore_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - 
.enable_event = nhmex_uncore_msr_enable_event, -}; - -static struct attribute *nhmex_uncore_ubox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_edge.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_ubox_format_group = { - .name = "format", - .attrs = nhmex_uncore_ubox_formats_attr, -}; - -static struct intel_uncore_type nhmex_uncore_ubox = { - .name = "ubox", - .num_counters = 1, - .num_boxes = 1, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_U_MSR_PMON_EV_SEL, - .perf_ctr = NHMEX_U_MSR_PMON_CTR, - .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL, - .ops = &nhmex_uncore_ops, - .format_group = &nhmex_uncore_ubox_format_group -}; - -static struct attribute *nhmex_uncore_cbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_cbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_cbox_formats_attr, -}; - -/* msr offset for each instance of cbox */ -static unsigned nhmex_cbox_msr_offsets[] = { - 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0, -}; - -static struct intel_uncore_type nhmex_uncore_cbox = { - .name = "cbox", - .num_counters = 6, - .num_boxes = 10, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0, - .perf_ctr = NHMEX_C0_MSR_PMON_CTR0, - .event_mask = NHMEX_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL, - .msr_offsets = nhmex_cbox_msr_offsets, - .pair_ctr_ctl = 1, - .ops = &nhmex_uncore_ops, - .format_group = &nhmex_uncore_cbox_format_group -}; - -static struct uncore_event_desc nhmex_uncore_wbox_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"), - { /* end: all zeroes */ }, -}; - -static struct intel_uncore_type nhmex_uncore_wbox = { - .name = "wbox", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_W_MSR_PMON_CNT0, - .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0, - .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR, - .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL, - .event_mask = NHMEX_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_W_MSR_GLOBAL_CTL, - .pair_ctr_ctl = 1, - .event_descs = nhmex_uncore_wbox_events, - .ops = &nhmex_uncore_ops, - .format_group = &nhmex_uncore_cbox_format_group -}; - -static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - int ctr, ev_sel; - - ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >> - NHMEX_B_PMON_CTR_SHIFT; - ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >> - NHMEX_B_PMON_CTL_EV_SEL_SHIFT; - - /* events that do not use the match/mask registers */ - if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) || - (ctr == 2 && ev_sel != 0x4) || ctr == 3) - return 0; - - if (box->pmu->pmu_idx == 0) - reg1->reg = NHMEX_B0_MSR_MATCH; - else - reg1->reg = NHMEX_B1_MSR_MATCH; - reg1->idx = 0; - reg1->config = event->attr.config1; - reg2->config = event->attr.config2; - return 0; -} - -static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - if (reg1->idx != EXTRA_REG_NONE) { - wrmsrl(reg1->reg, reg1->config); - wrmsrl(reg1->reg + 1, reg2->config); - } - 
wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | - (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK)); -} - -/* - * The Bbox has 4 counters, but each counter monitors different events. - * Use bits 6-7 in the event config to select counter. - */ -static struct event_constraint nhmex_uncore_bbox_constraints[] = { - EVENT_CONSTRAINT(0 , 1, 0xc0), - EVENT_CONSTRAINT(0x40, 2, 0xc0), - EVENT_CONSTRAINT(0x80, 4, 0xc0), - EVENT_CONSTRAINT(0xc0, 8, 0xc0), - EVENT_CONSTRAINT_END, -}; - -static struct attribute *nhmex_uncore_bbox_formats_attr[] = { - &format_attr_event5.attr, - &format_attr_counter.attr, - &format_attr_match.attr, - &format_attr_mask.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_bbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_bbox_formats_attr, -}; - -static struct intel_uncore_ops nhmex_uncore_bbox_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - .enable_event = nhmex_bbox_msr_enable_event, - .hw_config = nhmex_bbox_hw_config, - .get_constraint = uncore_get_constraint, - .put_constraint = uncore_put_constraint, -}; - -static struct intel_uncore_type nhmex_uncore_bbox = { - .name = "bbox", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_B0_MSR_PMON_CTL0, - .perf_ctr = NHMEX_B0_MSR_PMON_CTR0, - .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL, - .msr_offset = NHMEX_B_MSR_OFFSET, - .pair_ctr_ctl = 1, - .num_shared_regs = 1, - .constraints = nhmex_uncore_bbox_constraints, - .ops = &nhmex_uncore_bbox_ops, - .format_group = &nhmex_uncore_bbox_format_group -}; - -static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - /* only TO_R_PROG_EV event uses the match/mask register */ - if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) != - NHMEX_S_EVENT_TO_R_PROG_EV) - return 0; - - if (box->pmu->pmu_idx == 0) - reg1->reg = NHMEX_S0_MSR_MM_CFG; - else - reg1->reg = NHMEX_S1_MSR_MM_CFG; - reg1->idx = 0; - reg1->config = event->attr.config1; - reg2->config = event->attr.config2; - return 0; -} - -static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - if (reg1->idx != EXTRA_REG_NONE) { - wrmsrl(reg1->reg, 0); - wrmsrl(reg1->reg + 1, reg1->config); - wrmsrl(reg1->reg + 2, reg2->config); - wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN); - } - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); -} - -static struct attribute *nhmex_uncore_sbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - &format_attr_match.attr, - &format_attr_mask.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_sbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_sbox_formats_attr, -}; - -static struct intel_uncore_ops nhmex_uncore_sbox_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - .enable_event = nhmex_sbox_msr_enable_event, - .hw_config = nhmex_sbox_hw_config, - .get_constraint = uncore_get_constraint, - .put_constraint = uncore_put_constraint, -}; - -static struct intel_uncore_type nhmex_uncore_sbox = { - .name = "sbox", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - .event_ctl = 
NHMEX_S0_MSR_PMON_CTL0, - .perf_ctr = NHMEX_S0_MSR_PMON_CTR0, - .event_mask = NHMEX_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL, - .msr_offset = NHMEX_S_MSR_OFFSET, - .pair_ctr_ctl = 1, - .num_shared_regs = 1, - .ops = &nhmex_uncore_sbox_ops, - .format_group = &nhmex_uncore_sbox_format_group -}; - -enum { - EXTRA_REG_NHMEX_M_FILTER, - EXTRA_REG_NHMEX_M_DSP, - EXTRA_REG_NHMEX_M_ISS, - EXTRA_REG_NHMEX_M_MAP, - EXTRA_REG_NHMEX_M_MSC_THR, - EXTRA_REG_NHMEX_M_PGT, - EXTRA_REG_NHMEX_M_PLD, - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC, -}; - -static struct extra_reg nhmex_uncore_mbox_extra_regs[] = { - MBOX_INC_SEL_EXTAR_REG(0x0, DSP), - MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR), - MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR), - MBOX_INC_SEL_EXTAR_REG(0x9, ISS), - /* event 0xa uses two extra registers */ - MBOX_INC_SEL_EXTAR_REG(0xa, ISS), - MBOX_INC_SEL_EXTAR_REG(0xa, PLD), - MBOX_INC_SEL_EXTAR_REG(0xb, PLD), - /* events 0xd ~ 0x10 use the same extra register */ - MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC), - MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC), - MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC), - MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC), - MBOX_INC_SEL_EXTAR_REG(0x16, PGT), - MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP), - MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS), - MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT), - MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP), - EVENT_EXTRA_END -}; - -/* Nehalem-EX or Westmere-EX ? */ -static bool uncore_nhmex; - -static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config) -{ - struct intel_uncore_extra_reg *er; - unsigned long flags; - bool ret = false; - u64 mask; - - if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { - er = &box->shared_regs[idx]; - raw_spin_lock_irqsave(&er->lock, flags); - if (!atomic_read(&er->ref) || er->config == config) { - atomic_inc(&er->ref); - er->config = config; - ret = true; - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - return ret; - } - /* - * The ZDP_CTL_FVC MSR has 4 fields which are used to control - * events 0xd ~ 0x10. Besides these 4 fields, there are additional - * fields which are shared. 
- */ - idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - if (WARN_ON_ONCE(idx >= 4)) - return false; - - /* mask of the shared fields */ - if (uncore_nhmex) - mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK; - else - mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK; - er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; - - raw_spin_lock_irqsave(&er->lock, flags); - /* add mask of the non-shared field if it's in use */ - if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) { - if (uncore_nhmex) - mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - else - mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - } - - if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) { - atomic_add(1 << (idx * 8), &er->ref); - if (uncore_nhmex) - mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK | - NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - else - mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK | - WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - er->config &= ~mask; - er->config |= (config & mask); - ret = true; - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - return ret; -} - -static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx) -{ - struct intel_uncore_extra_reg *er; - - if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { - er = &box->shared_regs[idx]; - atomic_dec(&er->ref); - return; - } - - idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; - atomic_sub(1 << (idx * 8), &er->ref); -} - -static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8); - u64 config = reg1->config; - - /* get the non-shared control bits and shift them */ - idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - if (uncore_nhmex) - config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - else - config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); - if (new_idx > orig_idx) { - idx = new_idx - orig_idx; - config <<= 3 * idx; - } else { - idx = orig_idx - new_idx; - config >>= 3 * idx; - } - - /* add the shared control bits back */ - if (uncore_nhmex) - config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; - else - config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; - config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; - if (modify) { - /* adjust the main event selector */ - if (new_idx > orig_idx) - hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; - else - hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; - reg1->config = config; - reg1->idx = ~0xff | new_idx; - } - return config; -} - -static struct event_constraint * -nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - int i, idx[2], alloc = 0; - u64 config1 = reg1->config; - - idx[0] = __BITS_VALUE(reg1->idx, 0, 8); - idx[1] = __BITS_VALUE(reg1->idx, 1, 8); -again: - for (i = 0; i < 2; i++) { - if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) - idx[i] = 0xff; - - if (idx[i] == 0xff) - continue; - - if (!nhmex_mbox_get_shared_reg(box, idx[i], - __BITS_VALUE(config1, i, 32))) - goto fail; - alloc |= (0x1 << i); - } - - /* for the match/mask registers */ - if (reg2->idx != EXTRA_REG_NONE && - (uncore_box_is_fake(box) || !reg2->alloc) && - !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config)) - goto fail; - - /* - * If it's a fake box -- as per validate_{group,event}() we - * shouldn't touch event state and we can avoid 
doing so - * since both will only call get_event_constraints() once - * on each event, this avoids the need for reg->alloc. - */ - if (!uncore_box_is_fake(box)) { - if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) - nhmex_mbox_alter_er(event, idx[0], true); - reg1->alloc |= alloc; - if (reg2->idx != EXTRA_REG_NONE) - reg2->alloc = 1; - } - return NULL; -fail: - if (idx[0] != 0xff && !(alloc & 0x1) && - idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { - /* - * events 0xd ~ 0x10 are functional identical, but are - * controlled by different fields in the ZDP_CTL_FVC - * register. If we failed to take one field, try the - * rest 3 choices. - */ - BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff); - idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - idx[0] = (idx[0] + 1) % 4; - idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; - if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) { - config1 = nhmex_mbox_alter_er(event, idx[0], false); - goto again; - } - } - - if (alloc & 0x1) - nhmex_mbox_put_shared_reg(box, idx[0]); - if (alloc & 0x2) - nhmex_mbox_put_shared_reg(box, idx[1]); - return &uncore_constraint_empty; -} - -static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - - if (uncore_box_is_fake(box)) - return; - - if (reg1->alloc & 0x1) - nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8)); - if (reg1->alloc & 0x2) - nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8)); - reg1->alloc = 0; - - if (reg2->alloc) { - nhmex_mbox_put_shared_reg(box, reg2->idx); - reg2->alloc = 0; - } -} - -static int nhmex_mbox_extra_reg_idx(struct extra_reg *er) -{ - if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) - return er->idx; - return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd; -} - -static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct intel_uncore_type *type = box->pmu->type; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - struct extra_reg *er; - unsigned msr; - int reg_idx = 0; - /* - * The mbox events may require 2 extra MSRs at the most. But only - * the lower 32 bits in these MSRs are significant, so we can use - * config1 to pass two MSRs' config. - */ - for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) { - if (er->event != (event->hw.config & er->config_mask)) - continue; - if (event->attr.config1 & ~er->valid_mask) - return -EINVAL; - - msr = er->msr + type->msr_offset * box->pmu->pmu_idx; - if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff)) - return -EINVAL; - - /* always use the 32~63 bits to pass the PLD config */ - if (er->idx == EXTRA_REG_NHMEX_M_PLD) - reg_idx = 1; - else if (WARN_ON_ONCE(reg_idx > 0)) - return -EINVAL; - - reg1->idx &= ~(0xff << (reg_idx * 8)); - reg1->reg &= ~(0xffff << (reg_idx * 16)); - reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8); - reg1->reg |= msr << (reg_idx * 16); - reg1->config = event->attr.config1; - reg_idx++; - } - /* - * The mbox only provides ability to perform address matching - * for the PLD events. 
- */ - if (reg_idx == 2) { - reg2->idx = EXTRA_REG_NHMEX_M_FILTER; - if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN) - reg2->config = event->attr.config2; - else - reg2->config = ~0ULL; - if (box->pmu->pmu_idx == 0) - reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG; - else - reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG; - } - return 0; -} - -static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx) -{ - struct intel_uncore_extra_reg *er; - unsigned long flags; - u64 config; - - if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) - return box->shared_regs[idx].config; - - er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; - raw_spin_lock_irqsave(&er->lock, flags); - config = er->config; - raw_spin_unlock_irqrestore(&er->lock, flags); - return config; -} - -static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - int idx; - - idx = __BITS_VALUE(reg1->idx, 0, 8); - if (idx != 0xff) - wrmsrl(__BITS_VALUE(reg1->reg, 0, 16), - nhmex_mbox_shared_reg_config(box, idx)); - idx = __BITS_VALUE(reg1->idx, 1, 8); - if (idx != 0xff) - wrmsrl(__BITS_VALUE(reg1->reg, 1, 16), - nhmex_mbox_shared_reg_config(box, idx)); - - if (reg2->idx != EXTRA_REG_NONE) { - wrmsrl(reg2->reg, 0); - if (reg2->config != ~0ULL) { - wrmsrl(reg2->reg + 1, - reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK); - wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & - (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT)); - wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); - } - } - - wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); -} - -DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3"); -DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5"); -DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6"); -DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7"); -DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13"); -DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21"); -DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63"); -DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33"); -DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61"); -DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63"); - -static struct attribute *nhmex_uncore_mbox_formats_attr[] = { - &format_attr_count_mode.attr, - &format_attr_storage_mode.attr, - &format_attr_wrap_mode.attr, - &format_attr_flag_mode.attr, - &format_attr_inc_sel.attr, - &format_attr_set_flag_sel.attr, - &format_attr_filter_cfg_en.attr, - &format_attr_filter_match.attr, - &format_attr_filter_mask.attr, - &format_attr_dsp.attr, - &format_attr_thr.attr, - &format_attr_fvc.attr, - &format_attr_pgt.attr, - &format_attr_map.attr, - &format_attr_iss.attr, - &format_attr_pld.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_mbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_mbox_formats_attr, -}; - -static struct uncore_event_desc nhmex_uncore_mbox_events[] = { - INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"), - INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, 
"inc_sel=0xd,fvc=0x2820"), - { /* end: all zeroes */ }, -}; - -static struct uncore_event_desc wsmex_uncore_mbox_events[] = { - INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"), - INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"), - { /* end: all zeroes */ }, -}; - -static struct intel_uncore_ops nhmex_uncore_mbox_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - .enable_event = nhmex_mbox_msr_enable_event, - .hw_config = nhmex_mbox_hw_config, - .get_constraint = nhmex_mbox_get_constraint, - .put_constraint = nhmex_mbox_put_constraint, -}; - -static struct intel_uncore_type nhmex_uncore_mbox = { - .name = "mbox", - .num_counters = 6, - .num_boxes = 2, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_M0_MSR_PMU_CTL0, - .perf_ctr = NHMEX_M0_MSR_PMU_CNT0, - .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL, - .msr_offset = NHMEX_M_MSR_OFFSET, - .pair_ctr_ctl = 1, - .num_shared_regs = 8, - .event_descs = nhmex_uncore_mbox_events, - .ops = &nhmex_uncore_mbox_ops, - .format_group = &nhmex_uncore_mbox_format_group, -}; - -static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - - /* adjust the main event selector and extra register index */ - if (reg1->idx % 2) { - reg1->idx--; - hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; - } else { - reg1->idx++; - hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; - } - - /* adjust extra register config */ - switch (reg1->idx % 6) { - case 2: - /* shift the 8~15 bits to the 0~7 bits */ - reg1->config >>= 8; - break; - case 3: - /* shift the 0~7 bits to the 8~15 bits */ - reg1->config <<= 8; - break; - } -} - -/* - * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7. - * An event set consists of 6 events, the 3rd and 4th events in - * an event set use the same extra register. So an event set uses - * 5 extra registers. - */ -static struct event_constraint * -nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - struct intel_uncore_extra_reg *er; - unsigned long flags; - int idx, er_idx; - u64 config1; - bool ok = false; - - if (!uncore_box_is_fake(box) && reg1->alloc) - return NULL; - - idx = reg1->idx % 6; - config1 = reg1->config; -again: - er_idx = idx; - /* the 3rd and 4th events use the same extra register */ - if (er_idx > 2) - er_idx--; - er_idx += (reg1->idx / 6) * 5; - - er = &box->shared_regs[er_idx]; - raw_spin_lock_irqsave(&er->lock, flags); - if (idx < 2) { - if (!atomic_read(&er->ref) || er->config == reg1->config) { - atomic_inc(&er->ref); - er->config = reg1->config; - ok = true; - } - } else if (idx == 2 || idx == 3) { - /* - * these two events use different fields in a extra register, - * the 0~7 bits and the 8~15 bits respectively. 
- */ - u64 mask = 0xff << ((idx - 2) * 8); - if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) || - !((er->config ^ config1) & mask)) { - atomic_add(1 << ((idx - 2) * 8), &er->ref); - er->config &= ~mask; - er->config |= config1 & mask; - ok = true; - } - } else { - if (!atomic_read(&er->ref) || - (er->config == (hwc->config >> 32) && - er->config1 == reg1->config && - er->config2 == reg2->config)) { - atomic_inc(&er->ref); - er->config = (hwc->config >> 32); - er->config1 = reg1->config; - er->config2 = reg2->config; - ok = true; - } - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - if (!ok) { - /* - * The Rbox events are always in pairs. The paired - * events are functional identical, but use different - * extra registers. If we failed to take an extra - * register, try the alternative. - */ - idx ^= 1; - if (idx != reg1->idx % 6) { - if (idx == 2) - config1 >>= 8; - else if (idx == 3) - config1 <<= 8; - goto again; - } - } else { - if (!uncore_box_is_fake(box)) { - if (idx != reg1->idx % 6) - nhmex_rbox_alter_er(box, event); - reg1->alloc = 1; - } - return NULL; - } - return &uncore_constraint_empty; -} - -static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct intel_uncore_extra_reg *er; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - int idx, er_idx; - - if (uncore_box_is_fake(box) || !reg1->alloc) - return; - - idx = reg1->idx % 6; - er_idx = idx; - if (er_idx > 2) - er_idx--; - er_idx += (reg1->idx / 6) * 5; - - er = &box->shared_regs[er_idx]; - if (idx == 2 || idx == 3) - atomic_sub(1 << ((idx - 2) * 8), &er->ref); - else - atomic_dec(&er->ref); - - reg1->alloc = 0; -} - -static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; - int idx; - - idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >> - NHMEX_R_PMON_CTL_EV_SEL_SHIFT; - if (idx >= 0x18) - return -EINVAL; - - reg1->idx = idx; - reg1->config = event->attr.config1; - - switch (idx % 6) { - case 4: - case 5: - hwc->config |= event->attr.config & (~0ULL << 32); - reg2->config = event->attr.config2; - break; - } - return 0; -} - -static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - int idx, port; - - idx = reg1->idx; - port = idx / 6 + box->pmu->pmu_idx * 4; - - switch (idx % 6) { - case 0: - wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config); - break; - case 1: - wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config); - break; - case 2: - case 3: - wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port), - uncore_shared_reg_config(box, 2 + (idx / 6) * 5)); - break; - case 4: - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port), - hwc->config >> 32); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config); - break; - case 5: - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port), - hwc->config >> 32); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config); - wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config); - break; - } - - wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | - (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK)); -} - -DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63"); 
-DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63"); -DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63"); -DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15"); -DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31"); - -static struct attribute *nhmex_uncore_rbox_formats_attr[] = { - &format_attr_event5.attr, - &format_attr_xbr_mm_cfg.attr, - &format_attr_xbr_match.attr, - &format_attr_xbr_mask.attr, - &format_attr_qlx_cfg.attr, - &format_attr_iperf_cfg.attr, - NULL, -}; - -static struct attribute_group nhmex_uncore_rbox_format_group = { - .name = "format", - .attrs = nhmex_uncore_rbox_formats_attr, -}; - -static struct uncore_event_desc nhmex_uncore_rbox_events[] = { - INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"), - INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"), - INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"), - INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"), - INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"), - INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"), - { /* end: all zeroes */ }, -}; - -static struct intel_uncore_ops nhmex_uncore_rbox_ops = { - NHMEX_UNCORE_OPS_COMMON_INIT(), - .enable_event = nhmex_rbox_msr_enable_event, - .hw_config = nhmex_rbox_hw_config, - .get_constraint = nhmex_rbox_get_constraint, - .put_constraint = nhmex_rbox_put_constraint, -}; - -static struct intel_uncore_type nhmex_uncore_rbox = { - .name = "rbox", - .num_counters = 8, - .num_boxes = 2, - .perf_ctr_bits = 48, - .event_ctl = NHMEX_R_MSR_PMON_CTL0, - .perf_ctr = NHMEX_R_MSR_PMON_CNT0, - .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK, - .box_ctl = NHMEX_R_MSR_GLOBAL_CTL, - .msr_offset = NHMEX_R_MSR_OFFSET, - .pair_ctr_ctl = 1, - .num_shared_regs = 20, - .event_descs = nhmex_uncore_rbox_events, - .ops = &nhmex_uncore_rbox_ops, - .format_group = &nhmex_uncore_rbox_format_group -}; - -static struct intel_uncore_type *nhmex_msr_uncores[] = { - &nhmex_uncore_ubox, - &nhmex_uncore_cbox, - &nhmex_uncore_bbox, - &nhmex_uncore_sbox, - &nhmex_uncore_mbox, - &nhmex_uncore_rbox, - &nhmex_uncore_wbox, - NULL, -}; - -void nhmex_uncore_cpu_init(void) -{ - if (boot_cpu_data.x86_model == 46) - uncore_nhmex = true; - else - nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events; - if (nhmex_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - nhmex_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; - uncore_msr_uncores = nhmex_msr_uncores; -} -/* end of Nehalem-EX uncore support */ -- cgit v0.10.2 From 92553e40c6292408faa069b34a6db0dab4055080 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:17 +0100 Subject: perf/x86: Move perf_event_intel_uncore_snb.c => x86/events/intel/uncore_snb.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-12-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 70aedfe..eb0d921 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -9,3 +9,4 @@ obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/lbr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/pt.o intel/rapl.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o 
intel/uncore_nhmex.o +obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c new file mode 100644 index 0000000..2049d26 --- /dev/null +++ b/arch/x86/events/intel/uncore_snb.c @@ -0,0 +1,717 @@ +/* Nehalem/SandBridge/Haswell uncore support */ +#include "uncore.h" + +/* Uncore IMC PCI IDs */ +#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100 +#define PCI_DEVICE_ID_INTEL_IVB_IMC 0x0154 +#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150 +#define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 +#define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 +#define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 +#define PCI_DEVICE_ID_INTEL_SKL_IMC 0x191f + +/* SNB event control */ +#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff +#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 +#define SNB_UNC_CTL_EDGE_DET (1 << 18) +#define SNB_UNC_CTL_EN (1 << 22) +#define SNB_UNC_CTL_INVERT (1 << 23) +#define SNB_UNC_CTL_CMASK_MASK 0x1f000000 +#define NHM_UNC_CTL_CMASK_MASK 0xff000000 +#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0) + +#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + SNB_UNC_CTL_CMASK_MASK) + +#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + NHM_UNC_CTL_CMASK_MASK) + +/* SNB global control register */ +#define SNB_UNC_PERF_GLOBAL_CTL 0x391 +#define SNB_UNC_FIXED_CTR_CTRL 0x394 +#define SNB_UNC_FIXED_CTR 0x395 + +/* SNB uncore global control */ +#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1) +#define SNB_UNC_GLOBAL_CTL_EN (1 << 29) + +/* SNB Cbo register */ +#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700 +#define SNB_UNC_CBO_0_PER_CTR0 0x706 +#define SNB_UNC_CBO_MSR_OFFSET 0x10 + +/* SNB ARB register */ +#define SNB_UNC_ARB_PER_CTR0 0x3b0 +#define SNB_UNC_ARB_PERFEVTSEL0 0x3b2 +#define SNB_UNC_ARB_MSR_OFFSET 0x10 + +/* NHM global control register */ +#define NHM_UNC_PERF_GLOBAL_CTL 0x391 +#define NHM_UNC_FIXED_CTR 0x394 +#define NHM_UNC_FIXED_CTR_CTRL 0x395 + +/* NHM uncore global control */ +#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1) +#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32) + +/* NHM uncore register */ +#define NHM_UNC_PERFEVTSEL0 0x3c0 +#define NHM_UNC_UNCORE_PMC0 0x3b0 + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); + +/* Sandy Bridge uncore support */ +static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, SNB_UNC_CTL_EN); +} + +static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + wrmsrl(event->hw.config_base, 0); +} + +static void snb_uncore_msr_init_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) { + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); + } +} + +static struct uncore_event_desc snb_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + { /* end: all zeroes */ }, +}; + +static struct attribute *snb_uncore_formats_attr[] = 
{ + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask5.attr, + NULL, +}; + +static struct attribute_group snb_uncore_format_group = { + .name = "format", + .attrs = snb_uncore_formats_attr, +}; + +static struct intel_uncore_ops snb_uncore_msr_ops = { + .init_box = snb_uncore_msr_init_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct event_constraint snb_uncore_arb_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x80, 0x1), + UNCORE_EVENT_CONSTRAINT(0x83, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snb_uncore_cbox = { + .name = "cbox", + .num_counters = 2, + .num_boxes = 4, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNB_UNC_CBO_0_PER_CTR0, + .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, + .fixed_ctr = SNB_UNC_FIXED_CTR, + .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_CBO_MSR_OFFSET, + .ops = &snb_uncore_msr_ops, + .format_group = &snb_uncore_format_group, + .event_descs = snb_uncore_events, +}; + +static struct intel_uncore_type snb_uncore_arb = { + .name = "arb", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .perf_ctr = SNB_UNC_ARB_PER_CTR0, + .event_ctl = SNB_UNC_ARB_PERFEVTSEL0, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_ARB_MSR_OFFSET, + .constraints = snb_uncore_arb_constraints, + .ops = &snb_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + +static struct intel_uncore_type *snb_msr_uncores[] = { + &snb_uncore_cbox, + &snb_uncore_arb, + NULL, +}; + +void snb_uncore_cpu_init(void) +{ + uncore_msr_uncores = snb_msr_uncores; + if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; +} + +enum { + SNB_PCI_UNCORE_IMC, +}; + +static struct uncore_event_desc snb_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"), + INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"), + + INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"), + INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"), + + { /* end: all zeroes */ }, +}; + +#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff +#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48 + +/* page size multiple covering all config regs */ +#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000 + +#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1 +#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050 +#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2 +#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054 +#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE + +static struct attribute *snb_uncore_imc_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group snb_uncore_imc_format_group = { + .name = "format", + .attrs = snb_uncore_imc_formats_attr, +}; + +static void snb_uncore_imc_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; + resource_size_t addr; + u32 pci_dword; + + pci_read_config_dword(pdev, where, &pci_dword); + addr = pci_dword; + +#ifdef CONFIG_PHYS_ADDR_T_64BIT + pci_read_config_dword(pdev, where + 4, &pci_dword); + addr |= ((resource_size_t)pci_dword << 32); +#endif + + addr &= ~(PAGE_SIZE - 1); + + box->io_addr = 
ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); + box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; +} + +static void snb_uncore_imc_enable_box(struct intel_uncore_box *box) +{} + +static void snb_uncore_imc_disable_box(struct intel_uncore_box *box) +{} + +static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{} + +static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{} + +static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + return (u64)*(unsigned int *)(box->io_addr + hwc->event_base); +} + +/* + * custom event_init() function because we define our own fixed, free + * running counters, so we do not want to conflict with generic uncore + * logic. Also simplifies processing + */ +static int snb_uncore_imc_event_init(struct perf_event *event) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct hw_perf_event *hwc = &event->hw; + u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK; + int idx, base; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + pmu = uncore_event_to_pmu(event); + /* no device found for this pmu */ + if (pmu->func_id < 0) + return -ENOENT; + + /* Sampling not supported yet */ + if (hwc->sample_period) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + /* + * Place all uncore events for a particular physical package + * onto a single cpu + */ + if (event->cpu < 0) + return -EINVAL; + + /* check only supported bits are set */ + if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK) + return -EINVAL; + + box = uncore_pmu_to_box(pmu, event->cpu); + if (!box || box->cpu < 0) + return -EINVAL; + + event->cpu = box->cpu; + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; + /* + * check event is known (whitelist, determines counter) + */ + switch (cfg) { + case SNB_UNCORE_PCI_IMC_DATA_READS: + base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE; + idx = UNCORE_PMC_IDX_FIXED; + break; + case SNB_UNCORE_PCI_IMC_DATA_WRITES: + base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE; + idx = UNCORE_PMC_IDX_FIXED + 1; + break; + default: + return -EINVAL; + } + + /* must be done before validate_group */ + event->hw.event_base = base; + event->hw.config = cfg; + event->hw.idx = idx; + + /* no group validation needed, we have free running counters */ + + return 0; +} + +static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + return 0; +} + +static void snb_uncore_imc_event_start(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + u64 count; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + box->n_active++; + + list_add_tail(&event->active_entry, &box->active_list); + + count = snb_uncore_imc_read_counter(box, event); + local64_set(&event->hw.prev_count, count); + + if (box->n_active == 1) + uncore_pmu_start_hrtimer(box); +} + +static void snb_uncore_imc_event_stop(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = 
&event->hw; + + if (!(hwc->state & PERF_HES_STOPPED)) { + box->n_active--; + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + list_del(&event->active_entry); + + if (box->n_active == 0) + uncore_pmu_cancel_hrtimer(box); + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + uncore_perf_event_update(box, event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int snb_uncore_imc_event_add(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (!box) + return -ENODEV; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + snb_uncore_imc_event_start(event, 0); + + box->n_events++; + + return 0; +} + +static void snb_uncore_imc_event_del(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int i; + + snb_uncore_imc_event_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < box->n_events; i++) { + if (event == box->event_list[i]) { + --box->n_events; + break; + } + } +} + +int snb_pci2phy_map_init(int devid) +{ + struct pci_dev *dev = NULL; + struct pci2phy_map *map; + int bus, segment; + + dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev); + if (!dev) + return -ENOTTY; + + bus = dev->bus->number; + segment = pci_domain_nr(dev->bus); + + raw_spin_lock(&pci2phy_map_lock); + map = __find_pci2phy_map(segment); + if (!map) { + raw_spin_unlock(&pci2phy_map_lock); + pci_dev_put(dev); + return -ENOMEM; + } + map->pbus_to_physid[bus] = 0; + raw_spin_unlock(&pci2phy_map_lock); + + pci_dev_put(dev); + + return 0; +} + +static struct pmu snb_uncore_imc_pmu = { + .task_ctx_nr = perf_invalid_context, + .event_init = snb_uncore_imc_event_init, + .add = snb_uncore_imc_event_add, + .del = snb_uncore_imc_event_del, + .start = snb_uncore_imc_event_start, + .stop = snb_uncore_imc_event_stop, + .read = uncore_pmu_event_read, +}; + +static struct intel_uncore_ops snb_uncore_imc_ops = { + .init_box = snb_uncore_imc_init_box, + .enable_box = snb_uncore_imc_enable_box, + .disable_box = snb_uncore_imc_disable_box, + .disable_event = snb_uncore_imc_disable_event, + .enable_event = snb_uncore_imc_enable_event, + .hw_config = snb_uncore_imc_hw_config, + .read_counter = snb_uncore_imc_read_counter, +}; + +static struct intel_uncore_type snb_uncore_imc = { + .name = "imc", + .num_counters = 2, + .num_boxes = 1, + .fixed_ctr_bits = 32, + .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE, + .event_descs = snb_uncore_imc_events, + .format_group = &snb_uncore_imc_format_group, + .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE, + .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK, + .ops = &snb_uncore_imc_ops, + .pmu = &snb_uncore_imc_pmu, +}; + +static struct intel_uncore_type *snb_pci_uncores[] = { + [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc, + NULL, +}; + +static const struct pci_device_id snb_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static const struct pci_device_id ivb_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC), + .driver_data = 
UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static const struct pci_device_id hsw_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static const struct pci_device_id bdw_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static const struct pci_device_id skl_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static struct pci_driver snb_uncore_pci_driver = { + .name = "snb_uncore", + .id_table = snb_uncore_pci_ids, +}; + +static struct pci_driver ivb_uncore_pci_driver = { + .name = "ivb_uncore", + .id_table = ivb_uncore_pci_ids, +}; + +static struct pci_driver hsw_uncore_pci_driver = { + .name = "hsw_uncore", + .id_table = hsw_uncore_pci_ids, +}; + +static struct pci_driver bdw_uncore_pci_driver = { + .name = "bdw_uncore", + .id_table = bdw_uncore_pci_ids, +}; + +static struct pci_driver skl_uncore_pci_driver = { + .name = "skl_uncore", + .id_table = skl_uncore_pci_ids, +}; + +struct imc_uncore_pci_dev { + __u32 pci_id; + struct pci_driver *driver; +}; +#define IMC_DEV(a, d) \ + { .pci_id = PCI_DEVICE_ID_INTEL_##a, .driver = (d) } + +static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { + IMC_DEV(SNB_IMC, &snb_uncore_pci_driver), + IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver), /* 3rd Gen Core processor */ + IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */ + IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */ + IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */ + IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */ + IMC_DEV(SKL_IMC, &skl_uncore_pci_driver), /* 6th Gen Core */ + { /* end marker */ } +}; + + +#define for_each_imc_pci_id(x, t) \ + for (x = (t); (x)->pci_id; x++) + +static struct pci_driver *imc_uncore_find_dev(void) +{ + const struct imc_uncore_pci_dev *p; + int ret; + + for_each_imc_pci_id(p, desktop_imc_pci_ids) { + ret = snb_pci2phy_map_init(p->pci_id); + if (ret == 0) + return p->driver; + } + return NULL; +} + +static int imc_uncore_pci_init(void) +{ + struct pci_driver *imc_drv = imc_uncore_find_dev(); + + if (!imc_drv) + return -ENODEV; + + uncore_pci_uncores = snb_pci_uncores; + uncore_pci_driver = imc_drv; + + return 0; +} + +int snb_uncore_pci_init(void) +{ + return imc_uncore_pci_init(); +} + +int ivb_uncore_pci_init(void) +{ + return imc_uncore_pci_init(); +} +int hsw_uncore_pci_init(void) +{ + return imc_uncore_pci_init(); +} + +int bdw_uncore_pci_init(void) +{ + return imc_uncore_pci_init(); +} + +int skl_uncore_pci_init(void) +{ + return imc_uncore_pci_init(); +} + +/* end of Sandy Bridge uncore support */ + +/* Nehalem uncore support */ +static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0); +} + +static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); +} + +static 
void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); +} + +static struct attribute *nhm_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask8.attr, + NULL, +}; + +static struct attribute_group nhm_uncore_format_group = { + .name = "format", + .attrs = nhm_uncore_formats_attr, +}; + +static struct uncore_event_desc nhm_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhm_uncore_msr_ops = { + .disable_box = nhm_uncore_msr_disable_box, + .enable_box = nhm_uncore_msr_enable_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = nhm_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct intel_uncore_type nhm_uncore = { + .name = "", + .num_counters = 8, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .event_ctl = NHM_UNC_PERFEVTSEL0, + .perf_ctr = NHM_UNC_UNCORE_PMC0, + .fixed_ctr = NHM_UNC_FIXED_CTR, + .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL, + .event_mask = NHM_UNC_RAW_EVENT_MASK, + .event_descs = nhm_uncore_events, + .ops = &nhm_uncore_msr_ops, + .format_group = &nhm_uncore_format_group, +}; + +static struct intel_uncore_type *nhm_msr_uncores[] = { + &nhm_uncore, + NULL, +}; + +void nhm_uncore_cpu_init(void) +{ + uncore_msr_uncores = nhm_msr_uncores; +} + +/* end of Nehalem uncore support */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 316f53a..696d106 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,8 +33,7 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore_snb.o \ - perf_event_intel_uncore_snbep.o +obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore_snbep.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o endif diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c deleted file mode 100644 index e0e41f5..0000000 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ /dev/null @@ -1,717 +0,0 @@ -/* Nehalem/SandBridge/Haswell uncore support */ -#include "../../events/intel/uncore.h" - -/* Uncore IMC PCI IDs */ -#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100 -#define PCI_DEVICE_ID_INTEL_IVB_IMC 0x0154 -#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150 -#define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 -#define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 -#define 
PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 -#define PCI_DEVICE_ID_INTEL_SKL_IMC 0x191f - -/* SNB event control */ -#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff -#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 -#define SNB_UNC_CTL_EDGE_DET (1 << 18) -#define SNB_UNC_CTL_EN (1 << 22) -#define SNB_UNC_CTL_INVERT (1 << 23) -#define SNB_UNC_CTL_CMASK_MASK 0x1f000000 -#define NHM_UNC_CTL_CMASK_MASK 0xff000000 -#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0) - -#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ - SNB_UNC_CTL_UMASK_MASK | \ - SNB_UNC_CTL_EDGE_DET | \ - SNB_UNC_CTL_INVERT | \ - SNB_UNC_CTL_CMASK_MASK) - -#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ - SNB_UNC_CTL_UMASK_MASK | \ - SNB_UNC_CTL_EDGE_DET | \ - SNB_UNC_CTL_INVERT | \ - NHM_UNC_CTL_CMASK_MASK) - -/* SNB global control register */ -#define SNB_UNC_PERF_GLOBAL_CTL 0x391 -#define SNB_UNC_FIXED_CTR_CTRL 0x394 -#define SNB_UNC_FIXED_CTR 0x395 - -/* SNB uncore global control */ -#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1) -#define SNB_UNC_GLOBAL_CTL_EN (1 << 29) - -/* SNB Cbo register */ -#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700 -#define SNB_UNC_CBO_0_PER_CTR0 0x706 -#define SNB_UNC_CBO_MSR_OFFSET 0x10 - -/* SNB ARB register */ -#define SNB_UNC_ARB_PER_CTR0 0x3b0 -#define SNB_UNC_ARB_PERFEVTSEL0 0x3b2 -#define SNB_UNC_ARB_MSR_OFFSET 0x10 - -/* NHM global control register */ -#define NHM_UNC_PERF_GLOBAL_CTL 0x391 -#define NHM_UNC_FIXED_CTR 0x394 -#define NHM_UNC_FIXED_CTR_CTRL 0x395 - -/* NHM uncore global control */ -#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1) -#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32) - -/* NHM uncore register */ -#define NHM_UNC_PERFEVTSEL0 0x3c0 -#define NHM_UNC_UNCORE_PMC0 0x3b0 - -DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); -DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); -DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); -DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); -DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); -DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); - -/* Sandy Bridge uncore support */ -static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (hwc->idx < UNCORE_PMC_IDX_FIXED) - wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); - else - wrmsrl(hwc->config_base, SNB_UNC_CTL_EN); -} - -static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - wrmsrl(event->hw.config_base, 0); -} - -static void snb_uncore_msr_init_box(struct intel_uncore_box *box) -{ - if (box->pmu->pmu_idx == 0) { - wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, - SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); - } -} - -static struct uncore_event_desc snb_uncore_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), - { /* end: all zeroes */ }, -}; - -static struct attribute *snb_uncore_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_cmask5.attr, - NULL, -}; - -static struct attribute_group snb_uncore_format_group = { - .name = "format", - .attrs = snb_uncore_formats_attr, -}; - -static struct intel_uncore_ops snb_uncore_msr_ops = { - .init_box = snb_uncore_msr_init_box, - .disable_event = snb_uncore_msr_disable_event, - .enable_event = snb_uncore_msr_enable_event, - .read_counter = uncore_msr_read_counter, -}; - -static struct event_constraint snb_uncore_arb_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x80, 0x1), - 
UNCORE_EVENT_CONSTRAINT(0x83, 0x1), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type snb_uncore_cbox = { - .name = "cbox", - .num_counters = 2, - .num_boxes = 4, - .perf_ctr_bits = 44, - .fixed_ctr_bits = 48, - .perf_ctr = SNB_UNC_CBO_0_PER_CTR0, - .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, - .fixed_ctr = SNB_UNC_FIXED_CTR, - .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, - .single_fixed = 1, - .event_mask = SNB_UNC_RAW_EVENT_MASK, - .msr_offset = SNB_UNC_CBO_MSR_OFFSET, - .ops = &snb_uncore_msr_ops, - .format_group = &snb_uncore_format_group, - .event_descs = snb_uncore_events, -}; - -static struct intel_uncore_type snb_uncore_arb = { - .name = "arb", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 44, - .perf_ctr = SNB_UNC_ARB_PER_CTR0, - .event_ctl = SNB_UNC_ARB_PERFEVTSEL0, - .event_mask = SNB_UNC_RAW_EVENT_MASK, - .msr_offset = SNB_UNC_ARB_MSR_OFFSET, - .constraints = snb_uncore_arb_constraints, - .ops = &snb_uncore_msr_ops, - .format_group = &snb_uncore_format_group, -}; - -static struct intel_uncore_type *snb_msr_uncores[] = { - &snb_uncore_cbox, - &snb_uncore_arb, - NULL, -}; - -void snb_uncore_cpu_init(void) -{ - uncore_msr_uncores = snb_msr_uncores; - if (snb_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; -} - -enum { - SNB_PCI_UNCORE_IMC, -}; - -static struct uncore_event_desc snb_uncore_imc_events[] = { - INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"), - INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"), - - INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"), - INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"), - - { /* end: all zeroes */ }, -}; - -#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff -#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48 - -/* page size multiple covering all config regs */ -#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000 - -#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1 -#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050 -#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2 -#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054 -#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE - -static struct attribute *snb_uncore_imc_formats_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group snb_uncore_imc_format_group = { - .name = "format", - .attrs = snb_uncore_imc_formats_attr, -}; - -static void snb_uncore_imc_init_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; - resource_size_t addr; - u32 pci_dword; - - pci_read_config_dword(pdev, where, &pci_dword); - addr = pci_dword; - -#ifdef CONFIG_PHYS_ADDR_T_64BIT - pci_read_config_dword(pdev, where + 4, &pci_dword); - addr |= ((resource_size_t)pci_dword << 32); -#endif - - addr &= ~(PAGE_SIZE - 1); - - box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); - box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; -} - -static void snb_uncore_imc_enable_box(struct intel_uncore_box *box) -{} - -static void snb_uncore_imc_disable_box(struct intel_uncore_box *box) -{} - -static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{} - -static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{} - -static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - return 
(u64)*(unsigned int *)(box->io_addr + hwc->event_base); -} - -/* - * custom event_init() function because we define our own fixed, free - * running counters, so we do not want to conflict with generic uncore - * logic. Also simplifies processing - */ -static int snb_uncore_imc_event_init(struct perf_event *event) -{ - struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box; - struct hw_perf_event *hwc = &event->hw; - u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK; - int idx, base; - - if (event->attr.type != event->pmu->type) - return -ENOENT; - - pmu = uncore_event_to_pmu(event); - /* no device found for this pmu */ - if (pmu->func_id < 0) - return -ENOENT; - - /* Sampling not supported yet */ - if (hwc->sample_period) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - /* - * Place all uncore events for a particular physical package - * onto a single cpu - */ - if (event->cpu < 0) - return -EINVAL; - - /* check only supported bits are set */ - if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK) - return -EINVAL; - - box = uncore_pmu_to_box(pmu, event->cpu); - if (!box || box->cpu < 0) - return -EINVAL; - - event->cpu = box->cpu; - - event->hw.idx = -1; - event->hw.last_tag = ~0ULL; - event->hw.extra_reg.idx = EXTRA_REG_NONE; - event->hw.branch_reg.idx = EXTRA_REG_NONE; - /* - * check event is known (whitelist, determines counter) - */ - switch (cfg) { - case SNB_UNCORE_PCI_IMC_DATA_READS: - base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE; - idx = UNCORE_PMC_IDX_FIXED; - break; - case SNB_UNCORE_PCI_IMC_DATA_WRITES: - base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE; - idx = UNCORE_PMC_IDX_FIXED + 1; - break; - default: - return -EINVAL; - } - - /* must be done before validate_group */ - event->hw.event_base = base; - event->hw.config = cfg; - event->hw.idx = idx; - - /* no group validation needed, we have free running counters */ - - return 0; -} - -static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - return 0; -} - -static void snb_uncore_imc_event_start(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - u64 count; - - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) - return; - - event->hw.state = 0; - box->n_active++; - - list_add_tail(&event->active_entry, &box->active_list); - - count = snb_uncore_imc_read_counter(box, event); - local64_set(&event->hw.prev_count, count); - - if (box->n_active == 1) - uncore_pmu_start_hrtimer(box); -} - -static void snb_uncore_imc_event_stop(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - struct hw_perf_event *hwc = &event->hw; - - if (!(hwc->state & PERF_HES_STOPPED)) { - box->n_active--; - - WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); - hwc->state |= PERF_HES_STOPPED; - - list_del(&event->active_entry); - - if (box->n_active == 0) - uncore_pmu_cancel_hrtimer(box); - } - - if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - uncore_perf_event_update(box, event); - hwc->state |= PERF_HES_UPTODATE; - } -} - -static int snb_uncore_imc_event_add(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = 
uncore_event_to_box(event); - struct hw_perf_event *hwc = &event->hw; - - if (!box) - return -ENODEV; - - hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - if (!(flags & PERF_EF_START)) - hwc->state |= PERF_HES_ARCH; - - snb_uncore_imc_event_start(event, 0); - - box->n_events++; - - return 0; -} - -static void snb_uncore_imc_event_del(struct perf_event *event, int flags) -{ - struct intel_uncore_box *box = uncore_event_to_box(event); - int i; - - snb_uncore_imc_event_stop(event, PERF_EF_UPDATE); - - for (i = 0; i < box->n_events; i++) { - if (event == box->event_list[i]) { - --box->n_events; - break; - } - } -} - -int snb_pci2phy_map_init(int devid) -{ - struct pci_dev *dev = NULL; - struct pci2phy_map *map; - int bus, segment; - - dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev); - if (!dev) - return -ENOTTY; - - bus = dev->bus->number; - segment = pci_domain_nr(dev->bus); - - raw_spin_lock(&pci2phy_map_lock); - map = __find_pci2phy_map(segment); - if (!map) { - raw_spin_unlock(&pci2phy_map_lock); - pci_dev_put(dev); - return -ENOMEM; - } - map->pbus_to_physid[bus] = 0; - raw_spin_unlock(&pci2phy_map_lock); - - pci_dev_put(dev); - - return 0; -} - -static struct pmu snb_uncore_imc_pmu = { - .task_ctx_nr = perf_invalid_context, - .event_init = snb_uncore_imc_event_init, - .add = snb_uncore_imc_event_add, - .del = snb_uncore_imc_event_del, - .start = snb_uncore_imc_event_start, - .stop = snb_uncore_imc_event_stop, - .read = uncore_pmu_event_read, -}; - -static struct intel_uncore_ops snb_uncore_imc_ops = { - .init_box = snb_uncore_imc_init_box, - .enable_box = snb_uncore_imc_enable_box, - .disable_box = snb_uncore_imc_disable_box, - .disable_event = snb_uncore_imc_disable_event, - .enable_event = snb_uncore_imc_enable_event, - .hw_config = snb_uncore_imc_hw_config, - .read_counter = snb_uncore_imc_read_counter, -}; - -static struct intel_uncore_type snb_uncore_imc = { - .name = "imc", - .num_counters = 2, - .num_boxes = 1, - .fixed_ctr_bits = 32, - .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE, - .event_descs = snb_uncore_imc_events, - .format_group = &snb_uncore_imc_format_group, - .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE, - .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK, - .ops = &snb_uncore_imc_ops, - .pmu = &snb_uncore_imc_pmu, -}; - -static struct intel_uncore_type *snb_pci_uncores[] = { - [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc, - NULL, -}; - -static const struct pci_device_id snb_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* end: all zeroes */ }, -}; - -static const struct pci_device_id ivb_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* end: all zeroes */ }, -}; - -static const struct pci_device_id hsw_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* end: all zeroes */ }, -}; - -static const struct pci_device_id bdw_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC), - 
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* end: all zeroes */ }, -}; - -static const struct pci_device_id skl_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* end: all zeroes */ }, -}; - -static struct pci_driver snb_uncore_pci_driver = { - .name = "snb_uncore", - .id_table = snb_uncore_pci_ids, -}; - -static struct pci_driver ivb_uncore_pci_driver = { - .name = "ivb_uncore", - .id_table = ivb_uncore_pci_ids, -}; - -static struct pci_driver hsw_uncore_pci_driver = { - .name = "hsw_uncore", - .id_table = hsw_uncore_pci_ids, -}; - -static struct pci_driver bdw_uncore_pci_driver = { - .name = "bdw_uncore", - .id_table = bdw_uncore_pci_ids, -}; - -static struct pci_driver skl_uncore_pci_driver = { - .name = "skl_uncore", - .id_table = skl_uncore_pci_ids, -}; - -struct imc_uncore_pci_dev { - __u32 pci_id; - struct pci_driver *driver; -}; -#define IMC_DEV(a, d) \ - { .pci_id = PCI_DEVICE_ID_INTEL_##a, .driver = (d) } - -static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { - IMC_DEV(SNB_IMC, &snb_uncore_pci_driver), - IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver), /* 3rd Gen Core processor */ - IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */ - IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */ - IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */ - IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */ - IMC_DEV(SKL_IMC, &skl_uncore_pci_driver), /* 6th Gen Core */ - { /* end marker */ } -}; - - -#define for_each_imc_pci_id(x, t) \ - for (x = (t); (x)->pci_id; x++) - -static struct pci_driver *imc_uncore_find_dev(void) -{ - const struct imc_uncore_pci_dev *p; - int ret; - - for_each_imc_pci_id(p, desktop_imc_pci_ids) { - ret = snb_pci2phy_map_init(p->pci_id); - if (ret == 0) - return p->driver; - } - return NULL; -} - -static int imc_uncore_pci_init(void) -{ - struct pci_driver *imc_drv = imc_uncore_find_dev(); - - if (!imc_drv) - return -ENODEV; - - uncore_pci_uncores = snb_pci_uncores; - uncore_pci_driver = imc_drv; - - return 0; -} - -int snb_uncore_pci_init(void) -{ - return imc_uncore_pci_init(); -} - -int ivb_uncore_pci_init(void) -{ - return imc_uncore_pci_init(); -} -int hsw_uncore_pci_init(void) -{ - return imc_uncore_pci_init(); -} - -int bdw_uncore_pci_init(void) -{ - return imc_uncore_pci_init(); -} - -int skl_uncore_pci_init(void) -{ - return imc_uncore_pci_init(); -} - -/* end of Sandy Bridge uncore support */ - -/* Nehalem uncore support */ -static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box) -{ - wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0); -} - -static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box) -{ - wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); -} - -static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (hwc->idx < UNCORE_PMC_IDX_FIXED) - wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); - else - wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); -} - -static struct attribute *nhm_uncore_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_cmask8.attr, - NULL, -}; - -static struct attribute_group nhm_uncore_format_group = { - .name = "format", - .attrs = 
nhm_uncore_formats_attr, -}; - -static struct uncore_event_desc nhm_uncore_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), - INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"), - INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"), - INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"), - INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"), - INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"), - INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"), - INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"), - INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"), - { /* end: all zeroes */ }, -}; - -static struct intel_uncore_ops nhm_uncore_msr_ops = { - .disable_box = nhm_uncore_msr_disable_box, - .enable_box = nhm_uncore_msr_enable_box, - .disable_event = snb_uncore_msr_disable_event, - .enable_event = nhm_uncore_msr_enable_event, - .read_counter = uncore_msr_read_counter, -}; - -static struct intel_uncore_type nhm_uncore = { - .name = "", - .num_counters = 8, - .num_boxes = 1, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .event_ctl = NHM_UNC_PERFEVTSEL0, - .perf_ctr = NHM_UNC_UNCORE_PMC0, - .fixed_ctr = NHM_UNC_FIXED_CTR, - .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL, - .event_mask = NHM_UNC_RAW_EVENT_MASK, - .event_descs = nhm_uncore_events, - .ops = &nhm_uncore_msr_ops, - .format_group = &nhm_uncore_format_group, -}; - -static struct intel_uncore_type *nhm_msr_uncores[] = { - &nhm_uncore, - NULL, -}; - -void nhm_uncore_cpu_init(void) -{ - uncore_msr_uncores = nhm_msr_uncores; -} - -/* end of Nehalem uncore support */ -- cgit v0.10.2 From ed367e6ca42716a11a6d1b5162fdd378f9494eff Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:18 +0100 Subject: perf/x86: Move perf_event_intel_uncore_snbep.c => x86/events/intel/uncore_snbep.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-13-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index eb0d921..8c939ca 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -9,4 +9,4 @@ obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/lbr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/pt.o intel/rapl.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o +obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c new file mode 100644 index 0000000..0c801f7 --- /dev/null +++ b/arch/x86/events/intel/uncore_snbep.c @@ -0,0 +1,3125 @@ +/* SandyBridge-EP/IvyTown uncore support */ +#include "uncore.h" + +/* SNB-EP Box level control */ +#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0) +#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1) +#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8) +#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16) +#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ + SNBEP_PMON_BOX_CTL_RST_CTRS | \ + SNBEP_PMON_BOX_CTL_FRZ_EN) +/* SNB-EP event control */ +#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff +#define 
SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00 +#define SNBEP_PMON_CTL_RST (1 << 17) +#define SNBEP_PMON_CTL_EDGE_DET (1 << 18) +#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21) +#define SNBEP_PMON_CTL_EN (1 << 22) +#define SNBEP_PMON_CTL_INVERT (1 << 23) +#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000 +#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_PMON_CTL_TRESH_MASK) + +/* SNB-EP Ubox event control */ +#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_U_MSR_PMON_CTL_TRESH_MASK) + +#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19) +#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + +/* SNB-EP PCU event control */ +#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000 +#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30) +#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31) +#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_EV_SEL_EXT | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) + +#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_PMON_CTL_EV_SEL_EXT) + +/* SNB-EP pci control register */ +#define SNBEP_PCI_PMON_BOX_CTL 0xf4 +#define SNBEP_PCI_PMON_CTL0 0xd8 +/* SNB-EP pci counter register */ +#define SNBEP_PCI_PMON_CTR0 0xa0 + +/* SNB-EP home agent register */ +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40 +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44 +#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48 +/* SNB-EP memory controller register */ +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0 +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0 +/* SNB-EP QPI register */ +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228 +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238 +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c + +/* SNB-EP Ubox register */ +#define SNBEP_U_MSR_PMON_CTR0 0xc16 +#define SNBEP_U_MSR_PMON_CTL0 0xc10 + +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08 +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09 + +/* SNB-EP Cbo register */ +#define SNBEP_C0_MSR_PMON_CTR0 0xd16 +#define SNBEP_C0_MSR_PMON_CTL0 0xd10 +#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04 +#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14 +#define SNBEP_CBO_MSR_OFFSET 0x20 + +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID 0x1f +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID 0x3fc00 +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE 0x7c0000 +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC 0xff800000 + +#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) { \ + .event = (e), \ + .msr = SNBEP_C0_MSR_PMON_BOX_FILTER, \ + .config_mask = (m), \ + .idx = (i) \ +} + +/* SNB-EP PCU register */ +#define SNBEP_PCU_MSR_PMON_CTR0 0xc36 +#define SNBEP_PCU_MSR_PMON_CTL0 0xc30 +#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff +#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc +#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd + +/* IVBEP event control */ +#define IVBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ + SNBEP_PMON_BOX_CTL_RST_CTRS) +#define 
IVBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_TRESH_MASK) +/* IVBEP Ubox */ +#define IVBEP_U_MSR_PMON_GLOBAL_CTL 0xc00 +#define IVBEP_U_PMON_GLOBAL_FRZ_ALL (1 << 31) +#define IVBEP_U_PMON_GLOBAL_UNFRZ_ALL (1 << 29) + +#define IVBEP_U_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_U_MSR_PMON_CTL_TRESH_MASK) +/* IVBEP Cbo */ +#define IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK (IVBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_TID (0x1fULL << 0) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 5) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE (0x3fULL << 17) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62) +#define IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63) + +/* IVBEP home agent */ +#define IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST (1 << 16) +#define IVBEP_HA_PCI_PMON_RAW_EVENT_MASK \ + (IVBEP_PMON_RAW_EVENT_MASK | \ + IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST) +/* IVBEP PCU */ +#define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_EV_SEL_EXT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) +/* IVBEP QPI */ +#define IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK \ + (IVBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_PMON_CTL_EV_SEL_EXT) + +#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \ + ((1ULL << (n)) - 1))) + +/* Haswell-EP Ubox */ +#define HSWEP_U_MSR_PMON_CTR0 0x709 +#define HSWEP_U_MSR_PMON_CTL0 0x705 +#define HSWEP_U_MSR_PMON_FILTER 0x707 + +#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL 0x703 +#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTR 0x704 + +#define HSWEP_U_MSR_PMON_BOX_FILTER_TID (0x1 << 0) +#define HSWEP_U_MSR_PMON_BOX_FILTER_CID (0x1fULL << 1) +#define HSWEP_U_MSR_PMON_BOX_FILTER_MASK \ + (HSWEP_U_MSR_PMON_BOX_FILTER_TID | \ + HSWEP_U_MSR_PMON_BOX_FILTER_CID) + +/* Haswell-EP CBo */ +#define HSWEP_C0_MSR_PMON_CTR0 0xe08 +#define HSWEP_C0_MSR_PMON_CTL0 0xe01 +#define HSWEP_C0_MSR_PMON_BOX_CTL 0xe00 +#define HSWEP_C0_MSR_PMON_BOX_FILTER0 0xe05 +#define HSWEP_CBO_MSR_OFFSET 0x10 + + +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_TID (0x3fULL << 0) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 6) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE (0x7fULL << 17) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62) +#define HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63) + + +/* Haswell-EP Sbox */ +#define HSWEP_S0_MSR_PMON_CTR0 0x726 +#define HSWEP_S0_MSR_PMON_CTL0 0x721 +#define HSWEP_S0_MSR_PMON_BOX_CTL 0x720 +#define HSWEP_SBOX_MSR_OFFSET 0xa +#define HSWEP_S_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + +/* Haswell-EP PCU */ +#define HSWEP_PCU_MSR_PMON_CTR0 0x717 +#define HSWEP_PCU_MSR_PMON_CTL0 0x711 +#define HSWEP_PCU_MSR_PMON_BOX_CTL 0x710 +#define HSWEP_PCU_MSR_PMON_BOX_FILTER 0x715 + +/* KNL Ubox */ +#define KNL_U_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_U_MSR_PMON_RAW_EVENT_MASK | \ + 
SNBEP_CBO_PMON_CTL_TID_EN) +/* KNL CHA */ +#define KNL_CHA_MSR_OFFSET 0xc +#define KNL_CHA_MSR_PMON_CTL_QOR (1 << 16) +#define KNL_CHA_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK | \ + KNL_CHA_MSR_PMON_CTL_QOR) +#define KNL_CHA_MSR_PMON_BOX_FILTER_TID 0x1ff +#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE (7 << 18) +#define KNL_CHA_MSR_PMON_BOX_FILTER_OP (0xfffffe2aULL << 32) + +/* KNL EDC/MC UCLK */ +#define KNL_UCLK_MSR_PMON_CTR0_LOW 0x400 +#define KNL_UCLK_MSR_PMON_CTL0 0x420 +#define KNL_UCLK_MSR_PMON_BOX_CTL 0x430 +#define KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW 0x44c +#define KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL 0x454 +#define KNL_PMON_FIXED_CTL_EN 0x1 + +/* KNL EDC */ +#define KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW 0xa00 +#define KNL_EDC0_ECLK_MSR_PMON_CTL0 0xa20 +#define KNL_EDC0_ECLK_MSR_PMON_BOX_CTL 0xa30 +#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW 0xa3c +#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL 0xa44 + +/* KNL MC */ +#define KNL_MC0_CH0_MSR_PMON_CTR0_LOW 0xb00 +#define KNL_MC0_CH0_MSR_PMON_CTL0 0xb20 +#define KNL_MC0_CH0_MSR_PMON_BOX_CTL 0xb30 +#define KNL_MC0_CH0_MSR_PMON_FIXED_LOW 0xb3c +#define KNL_MC0_CH0_MSR_PMON_FIXED_CTL 0xb44 + +/* KNL IRP */ +#define KNL_IRP_PCI_PMON_BOX_CTL 0xf0 +#define KNL_IRP_PCI_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + KNL_CHA_MSR_PMON_CTL_QOR) +/* KNL PCU */ +#define KNL_PCU_PMON_CTL_EV_SEL_MASK 0x0000007f +#define KNL_PCU_PMON_CTL_USE_OCC_CTR (1 << 7) +#define KNL_PCU_MSR_PMON_CTL_TRESH_MASK 0x3f000000 +#define KNL_PCU_MSR_PMON_RAW_EVENT_MASK \ + (KNL_PCU_PMON_CTL_EV_SEL_MASK | \ + KNL_PCU_PMON_CTL_USE_OCC_CTR | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_CBO_PMON_CTL_TID_EN | \ + SNBEP_PMON_CTL_EV_SEL_EXT | \ + SNBEP_PMON_CTL_INVERT | \ + KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6"); +DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); +DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29"); +DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); +DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); +DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); +DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8"); +DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5"); +DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8"); +DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47"); 
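/*
 * [Editorial aside, not part of the original patch] The DEFINE_UNCORE_FORMAT_ATTR()
 * entries above and below expose the SNBEP_PMON_CTL_* bit fields defined earlier in
 * this file to user space as sysfs "format" strings.  As a minimal sketch (the helper
 * name below is hypothetical, purely illustrative and not present in the kernel), an
 * encoding such as "event=0x4,umask=0x3,thresh=0x10" would be packed into the raw
 * config word roughly like this:
 */
static inline u64 snbep_pack_config_sketch(u8 event, u8 umask, u8 thresh)
{
	return (event & SNBEP_PMON_CTL_EV_SEL_MASK) |			/* config:0-7   */
	       (((u64)umask << 8) & SNBEP_PMON_CTL_UMASK_MASK) |	/* config:8-15  */
	       (((u64)thresh << 24) & SNBEP_PMON_CTL_TRESH_MASK);	/* config:24-31 */
}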
+DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20"); +DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33"); +DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62"); +DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61"); +DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51"); +DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31"); +DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17"); +DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12"); +DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4"); +DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31"); +DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17"); +DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12"); +DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4"); +DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63"); + +static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config = 0; + + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config |= SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); + } +} + +static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config = 0; + + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); + } +} + +static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config); +} + +static u64 
snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); + pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); + + return count; +} + +static void snbep_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + + pci_write_config_dword(pdev, box_ctl, SNBEP_PMON_BOX_CTL_INT); +} + +static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config |= SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + } +} + +static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + } +} + +static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0)); + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + + if (msr) + wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT); +} + +static struct attribute *snbep_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute *snbep_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct attribute *snbep_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid.attr, + &format_attr_filter_nid.attr, + &format_attr_filter_state.attr, + &format_attr_filter_opc.attr, + NULL, +}; + +static struct attribute *snbep_uncore_pcu_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge.attr, + &format_attr_filter_band0.attr, + &format_attr_filter_band1.attr, + &format_attr_filter_band2.attr, + &format_attr_filter_band3.attr, + NULL, +}; + +static struct attribute *snbep_uncore_qpi_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_match_rds.attr, + &format_attr_match_rnid30.attr, + &format_attr_match_rnid4.attr, + &format_attr_match_dnid.attr, + &format_attr_match_mc.attr, + &format_attr_match_opc.attr, + &format_attr_match_vnw.attr, + &format_attr_match0.attr, + &format_attr_match1.attr, + &format_attr_mask_rds.attr, + &format_attr_mask_rnid30.attr, + &format_attr_mask_rnid4.attr, 
+ &format_attr_mask_dnid.attr, + &format_attr_mask_mc.attr, + &format_attr_mask_opc.attr, + &format_attr_mask_vnw.attr, + &format_attr_mask0.attr, + &format_attr_mask1.attr, + NULL, +}; + +static struct uncore_event_desc snbep_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), + { /* end: all zeroes */ }, +}; + +static struct uncore_event_desc snbep_uncore_qpi_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"), + INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), + INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"), + { /* end: all zeroes */ }, +}; + +static struct attribute_group snbep_uncore_format_group = { + .name = "format", + .attrs = snbep_uncore_formats_attr, +}; + +static struct attribute_group snbep_uncore_ubox_format_group = { + .name = "format", + .attrs = snbep_uncore_ubox_formats_attr, +}; + +static struct attribute_group snbep_uncore_cbox_format_group = { + .name = "format", + .attrs = snbep_uncore_cbox_formats_attr, +}; + +static struct attribute_group snbep_uncore_pcu_format_group = { + .name = "format", + .attrs = snbep_uncore_pcu_formats_attr, +}; + +static struct attribute_group snbep_uncore_qpi_format_group = { + .name = "format", + .attrs = snbep_uncore_qpi_formats_attr, +}; + +#define __SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ + .disable_box = snbep_uncore_msr_disable_box, \ + .enable_box = snbep_uncore_msr_enable_box, \ + .disable_event = snbep_uncore_msr_disable_event, \ + .enable_event = snbep_uncore_msr_enable_event, \ + .read_counter = uncore_msr_read_counter + +#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ + __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), \ + .init_box = snbep_uncore_msr_init_box \ + +static struct intel_uncore_ops snbep_uncore_msr_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), +}; + +#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \ + .init_box = snbep_uncore_pci_init_box, \ + .disable_box = snbep_uncore_pci_disable_box, \ + .enable_box = snbep_uncore_pci_enable_box, \ + .disable_event = snbep_uncore_pci_disable_event, \ + .read_counter = snbep_uncore_pci_read_counter + +static struct intel_uncore_ops snbep_uncore_pci_ops = { + SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), + .enable_event = snbep_uncore_pci_enable_event, \ +}; + +static struct event_constraint snbep_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x1), + UNCORE_EVENT_CONSTRAINT(0x02, 0x3), + UNCORE_EVENT_CONSTRAINT(0x04, 0x3), + UNCORE_EVENT_CONSTRAINT(0x05, 0x3), + UNCORE_EVENT_CONSTRAINT(0x07, 0x3), + UNCORE_EVENT_CONSTRAINT(0x09, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x3), + UNCORE_EVENT_CONSTRAINT(0x1b, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), + EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + 
UNCORE_EVENT_CONSTRAINT(0x35, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r3qpi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x20, 0x3), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x22, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x28, 0x3), + UNCORE_EVENT_CONSTRAINT(0x29, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2a, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2b, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), + UNCORE_EVENT_CONSTRAINT(0x30, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x3), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snbep_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNBEP_U_MSR_PMON_CTR0, + .event_ctl = SNBEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_ubox_format_group, +}; + +static struct extra_reg snbep_uncore_cbox_extra_regs[] = { + SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, + SNBEP_CBO_PMON_CTL_TID_EN, 0x1), + SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 
0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2), + EVENT_EXTRA_END +}; + +static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + int i; + + if (uncore_box_is_fake(box)) + return; + + for (i = 0; i < 5; i++) { + if (reg1->alloc & (0x1 << i)) + atomic_sub(1 << (i * 6), &er->ref); + } + reg1->alloc = 0; +} + +static struct event_constraint * +__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event, + u64 (*cbox_filter_mask)(int fields)) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + int i, alloc = 0; + unsigned long flags; + u64 mask; + + if (reg1->idx == EXTRA_REG_NONE) + return NULL; + + raw_spin_lock_irqsave(&er->lock, flags); + for (i = 0; i < 5; i++) { + if (!(reg1->idx & (0x1 << i))) + continue; + if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) + continue; + + mask = cbox_filter_mask(0x1 << i); + if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) || + !((reg1->config ^ er->config) & mask)) { + atomic_add(1 << (i * 6), &er->ref); + er->config &= ~mask; + er->config |= reg1->config & mask; + alloc |= (0x1 << i); + } else { + break; + } + } + raw_spin_unlock_irqrestore(&er->lock, flags); + if (i < 5) + goto fail; + + if (!uncore_box_is_fake(box)) + reg1->alloc |= alloc; + + return NULL; +fail: + for (; i >= 0; i--) { + if (alloc & (0x1 << i)) + atomic_sub(1 << (i * 6), &er->ref); + } + return &uncore_constraint_empty; +} + +static u64 snbep_cbox_filter_mask(int fields) +{ + u64 mask = 0; + + if (fields & 0x1) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID; + if (fields & 0x2) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID; + if (fields & 0x4) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x8) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC; + + return mask; +} + +static struct event_constraint * +snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask); +} + +static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct extra_reg *er; + int idx = 0; + + for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + idx |= er->idx; + } + + if (idx) { + reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + + SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx); + reg1->idx = idx; + } + return 0; +} + +static struct intel_uncore_ops snbep_uncore_cbox_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = snbep_cbox_hw_config, + .get_constraint = snbep_cbox_get_constraint, + .put_constraint = snbep_cbox_put_constraint, +}; + +static struct intel_uncore_type snbep_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 44, + .event_ctl = SNBEP_C0_MSR_PMON_CTL0, + .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = SNBEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = snbep_uncore_cbox_constraints, + .ops = &snbep_uncore_cbox_ops, + .format_group = &snbep_uncore_cbox_format_group, +}; + +static u64 
snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + u64 config = reg1->config; + + if (new_idx > reg1->idx) + config <<= 8 * (new_idx - reg1->idx); + else + config >>= 8 * (reg1->idx - new_idx); + + if (modify) { + hwc->config += new_idx - reg1->idx; + reg1->config = config; + reg1->idx = new_idx; + } + return config; +} + +static struct event_constraint * +snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + unsigned long flags; + int idx = reg1->idx; + u64 mask, config1 = reg1->config; + bool ok = false; + + if (reg1->idx == EXTRA_REG_NONE || + (!uncore_box_is_fake(box) && reg1->alloc)) + return NULL; +again: + mask = 0xffULL << (idx * 8); + raw_spin_lock_irqsave(&er->lock, flags); + if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) || + !((config1 ^ er->config) & mask)) { + atomic_add(1 << (idx * 8), &er->ref); + er->config &= ~mask; + er->config |= config1 & mask; + ok = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (!ok) { + idx = (idx + 1) % 4; + if (idx != reg1->idx) { + config1 = snbep_pcu_alter_er(event, idx, false); + goto again; + } + return &uncore_constraint_empty; + } + + if (!uncore_box_is_fake(box)) { + if (idx != reg1->idx) + snbep_pcu_alter_er(event, idx, true); + reg1->alloc = 1; + } + return NULL; +} + +static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + atomic_sub(1 << (reg1->idx * 8), &er->ref); + reg1->alloc = 0; +} + +static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK; + + if (ev_sel >= 0xb && ev_sel <= 0xe) { + reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER; + reg1->idx = ev_sel - 0xb; + reg1->config = event->attr.config1 & (0xff << (reg1->idx * 8)); + } + return 0; +} + +static struct intel_uncore_ops snbep_uncore_pcu_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = snbep_pcu_hw_config, + .get_constraint = snbep_pcu_get_constraint, + .put_constraint = snbep_pcu_put_constraint, +}; + +static struct intel_uncore_type snbep_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, + .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, + .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_pcu_ops, + .format_group = &snbep_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *snbep_msr_uncores[] = { + &snbep_uncore_ubox, + &snbep_uncore_cbox, + &snbep_uncore_pcu, + NULL, +}; + +void snbep_uncore_cpu_init(void) +{ + if (snbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + snbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + uncore_msr_uncores = snbep_msr_uncores; +} + +enum { + SNBEP_PCI_QPI_PORT0_FILTER, + SNBEP_PCI_QPI_PORT1_FILTER, + HSWEP_PCI_PCU_3, +}; + +static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct 
hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) { + reg1->idx = 0; + reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0; + reg1->config = event->attr.config1; + reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0; + reg2->config = event->attr.config2; + } + return 0; +} + +static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER; + struct pci_dev *filter_pdev = uncore_extra_pci_dev[box->phys_id][idx]; + if (filter_pdev) { + pci_write_config_dword(filter_pdev, reg1->reg, + (u32)reg1->config); + pci_write_config_dword(filter_pdev, reg1->reg + 4, + (u32)(reg1->config >> 32)); + pci_write_config_dword(filter_pdev, reg2->reg, + (u32)reg2->config); + pci_write_config_dword(filter_pdev, reg2->reg + 4, + (u32)(reg2->config >> 32)); + } + } + + pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops snbep_uncore_qpi_ops = { + SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), + .enable_event = snbep_qpi_enable_event, + .hw_config = snbep_qpi_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +#define SNBEP_UNCORE_PCI_COMMON_INIT() \ + .perf_ctr = SNBEP_PCI_PMON_CTR0, \ + .event_ctl = SNBEP_PCI_PMON_CTL0, \ + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \ + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ + .ops = &snbep_uncore_pci_ops, \ + .format_group = &snbep_uncore_format_group + +static struct intel_uncore_type snbep_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_imc = { + .name = "imc", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = snbep_uncore_imc_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_qpi_ops, + .event_descs = snbep_uncore_qpi_events, + .format_group = &snbep_uncore_qpi_format_group, +}; + + +static struct intel_uncore_type snbep_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes = 2, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r3qpi_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +enum { + SNBEP_PCI_UNCORE_HA, + SNBEP_PCI_UNCORE_IMC, + SNBEP_PCI_UNCORE_QPI, + SNBEP_PCI_UNCORE_R2PCIE, + SNBEP_PCI_UNCORE_R3QPI, +}; + +static struct intel_uncore_type *snbep_pci_uncores[] = { + [SNBEP_PCI_UNCORE_HA] = &snbep_uncore_ha, + [SNBEP_PCI_UNCORE_IMC] = &snbep_uncore_imc, + [SNBEP_PCI_UNCORE_QPI] = &snbep_uncore_qpi, + 
[SNBEP_PCI_UNCORE_R2PCIE] = &snbep_uncore_r2pcie, + [SNBEP_PCI_UNCORE_R3QPI] = &snbep_uncore_r3qpi, + NULL, +}; + +static const struct pci_device_id snbep_uncore_pci_ids[] = { + { /* Home Agent */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0), + }, + { /* MC Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0), + }, + { /* MC Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1), + }, + { /* MC Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2), + }, + { /* MC Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3), + }, + { /* QPI Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0), + }, + { /* QPI Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1), + }, + { /* R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0), + }, + { /* R3QPI Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0), + }, + { /* R3QPI Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver snbep_uncore_pci_driver = { + .name = "snbep_uncore", + .id_table = snbep_uncore_pci_ids, +}; + +/* + * build pci bus to socket mapping + */ +static int snbep_pci2phy_map_init(int devid) +{ + struct pci_dev *ubox_dev = NULL; + int i, bus, nodeid, segment; + struct pci2phy_map *map; + int err = 0; + u32 config = 0; + + while (1) { + /* find the UBOX device */ + ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev); + if (!ubox_dev) + break; + bus = ubox_dev->bus->number; + /* get the Node ID of the local register */ + err = pci_read_config_dword(ubox_dev, 0x40, &config); + if (err) + break; + nodeid = config; + /* get the Node ID mapping */ + err = pci_read_config_dword(ubox_dev, 0x54, &config); + if (err) + break; + + segment = pci_domain_nr(ubox_dev->bus); + raw_spin_lock(&pci2phy_map_lock); + map = __find_pci2phy_map(segment); + if (!map) { + raw_spin_unlock(&pci2phy_map_lock); + err = -ENOMEM; + break; + } + + /* + * every three bits in the Node ID mapping register maps + * to a particular node. + */ + for (i = 0; i < 8; i++) { + if (nodeid == ((config >> (3 * i)) & 0x7)) { + map->pbus_to_physid[bus] = i; + break; + } + } + raw_spin_unlock(&pci2phy_map_lock); + } + + if (!err) { + /* + * For PCI bus with no UBOX device, find the next bus + * that has UBOX device and use its mapping. 
+ */ + raw_spin_lock(&pci2phy_map_lock); + list_for_each_entry(map, &pci2phy_map_head, list) { + i = -1; + for (bus = 255; bus >= 0; bus--) { + if (map->pbus_to_physid[bus] >= 0) + i = map->pbus_to_physid[bus]; + else + map->pbus_to_physid[bus] = i; + } + } + raw_spin_unlock(&pci2phy_map_lock); + } + + pci_dev_put(ubox_dev); + + return err ? pcibios_err_to_errno(err) : 0; +} + +int snbep_uncore_pci_init(void) +{ + int ret = snbep_pci2phy_map_init(0x3ce0); + if (ret) + return ret; + uncore_pci_uncores = snbep_pci_uncores; + uncore_pci_driver = &snbep_uncore_pci_driver; + return 0; +} +/* end of Sandy Bridge-EP uncore support */ + +/* IvyTown uncore support */ +static void ivbep_uncore_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + if (msr) + wrmsrl(msr, IVBEP_PMON_BOX_CTL_INT); +} + +static void ivbep_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + + pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT); +} + +#define IVBEP_UNCORE_MSR_OPS_COMMON_INIT() \ + .init_box = ivbep_uncore_msr_init_box, \ + .disable_box = snbep_uncore_msr_disable_box, \ + .enable_box = snbep_uncore_msr_enable_box, \ + .disable_event = snbep_uncore_msr_disable_event, \ + .enable_event = snbep_uncore_msr_enable_event, \ + .read_counter = uncore_msr_read_counter + +static struct intel_uncore_ops ivbep_uncore_msr_ops = { + IVBEP_UNCORE_MSR_OPS_COMMON_INIT(), +}; + +static struct intel_uncore_ops ivbep_uncore_pci_ops = { + .init_box = ivbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +#define IVBEP_UNCORE_PCI_COMMON_INIT() \ + .perf_ctr = SNBEP_PCI_PMON_CTR0, \ + .event_ctl = SNBEP_PCI_PMON_CTL0, \ + .event_mask = IVBEP_PMON_RAW_EVENT_MASK, \ + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ + .ops = &ivbep_uncore_pci_ops, \ + .format_group = &ivbep_uncore_format_group + +static struct attribute *ivbep_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute *ivbep_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct attribute *ivbep_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid.attr, + &format_attr_filter_link.attr, + &format_attr_filter_state2.attr, + &format_attr_filter_nid2.attr, + &format_attr_filter_opc2.attr, + &format_attr_filter_nc.attr, + &format_attr_filter_c6.attr, + &format_attr_filter_isoc.attr, + NULL, +}; + +static struct attribute *ivbep_uncore_pcu_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_thresh5.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge.attr, + &format_attr_filter_band0.attr, + &format_attr_filter_band1.attr, + &format_attr_filter_band2.attr, + &format_attr_filter_band3.attr, + NULL, +}; + +static struct attribute *ivbep_uncore_qpi_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_thresh8.attr, + 
&format_attr_match_rds.attr, + &format_attr_match_rnid30.attr, + &format_attr_match_rnid4.attr, + &format_attr_match_dnid.attr, + &format_attr_match_mc.attr, + &format_attr_match_opc.attr, + &format_attr_match_vnw.attr, + &format_attr_match0.attr, + &format_attr_match1.attr, + &format_attr_mask_rds.attr, + &format_attr_mask_rnid30.attr, + &format_attr_mask_rnid4.attr, + &format_attr_mask_dnid.attr, + &format_attr_mask_mc.attr, + &format_attr_mask_opc.attr, + &format_attr_mask_vnw.attr, + &format_attr_mask0.attr, + &format_attr_mask1.attr, + NULL, +}; + +static struct attribute_group ivbep_uncore_format_group = { + .name = "format", + .attrs = ivbep_uncore_formats_attr, +}; + +static struct attribute_group ivbep_uncore_ubox_format_group = { + .name = "format", + .attrs = ivbep_uncore_ubox_formats_attr, +}; + +static struct attribute_group ivbep_uncore_cbox_format_group = { + .name = "format", + .attrs = ivbep_uncore_cbox_formats_attr, +}; + +static struct attribute_group ivbep_uncore_pcu_format_group = { + .name = "format", + .attrs = ivbep_uncore_pcu_formats_attr, +}; + +static struct attribute_group ivbep_uncore_qpi_format_group = { + .name = "format", + .attrs = ivbep_uncore_qpi_formats_attr, +}; + +static struct intel_uncore_type ivbep_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNBEP_U_MSR_PMON_CTR0, + .event_ctl = SNBEP_U_MSR_PMON_CTL0, + .event_mask = IVBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &ivbep_uncore_msr_ops, + .format_group = &ivbep_uncore_ubox_format_group, +}; + +static struct extra_reg ivbep_uncore_cbox_extra_regs[] = { + SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, + SNBEP_CBO_PMON_CTL_TID_EN, 0x1), + SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10), 
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8), + EVENT_EXTRA_END +}; + +static u64 ivbep_cbox_filter_mask(int fields) +{ + u64 mask = 0; + + if (fields & 0x1) + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_TID; + if (fields & 0x2) + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK; + if (fields & 0x4) + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x8) + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NID; + if (fields & 0x10) { + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC; + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NC; + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_C6; + mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC; + } + + return mask; +} + +static struct event_constraint * +ivbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + return __snbep_cbox_get_constraint(box, event, ivbep_cbox_filter_mask); +} + +static int ivbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct extra_reg *er; + int idx = 0; + + for (er = ivbep_uncore_cbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + idx |= er->idx; + } + + if (idx) { + reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + + SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & ivbep_cbox_filter_mask(idx); + reg1->idx = idx; + } + return 0; +} + +static void ivbep_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + u64 filter = uncore_shared_reg_config(box, 0); + wrmsrl(reg1->reg, filter & 0xffffffff); + wrmsrl(reg1->reg + 6, filter >> 32); + } + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops ivbep_uncore_cbox_ops = { + .init_box = ivbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = ivbep_cbox_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = ivbep_cbox_hw_config, + .get_constraint = ivbep_cbox_get_constraint, + .put_constraint = snbep_cbox_put_constraint, +}; + +static struct intel_uncore_type ivbep_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 15, + .perf_ctr_bits = 44, + .event_ctl = SNBEP_C0_MSR_PMON_CTL0, + .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, + .event_mask = IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = SNBEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = snbep_uncore_cbox_constraints, + .ops = &ivbep_uncore_cbox_ops, + .format_group = &ivbep_uncore_cbox_format_group, +}; + +static struct intel_uncore_ops ivbep_uncore_pcu_ops = { + IVBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = snbep_pcu_hw_config, + .get_constraint = snbep_pcu_get_constraint, + .put_constraint = snbep_pcu_put_constraint, +}; + +static struct intel_uncore_type ivbep_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, + .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, + .event_mask = IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &ivbep_uncore_pcu_ops, + .format_group = &ivbep_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *ivbep_msr_uncores[] = { + &ivbep_uncore_ubox, + &ivbep_uncore_cbox, + 
&ivbep_uncore_pcu, + NULL, +}; + +void ivbep_uncore_cpu_init(void) +{ + if (ivbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + ivbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + uncore_msr_uncores = ivbep_msr_uncores; +} + +static struct intel_uncore_type ivbep_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + IVBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type ivbep_uncore_imc = { + .name = "imc", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = snbep_uncore_imc_events, + IVBEP_UNCORE_PCI_COMMON_INIT(), +}; + +/* registers in IRP boxes are not properly aligned */ +static unsigned ivbep_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4}; +static unsigned ivbep_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0}; + +static void ivbep_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], + hwc->config | SNBEP_PMON_CTL_EN); +} + +static void ivbep_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], hwc->config); +} + +static u64 ivbep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx], (u32 *)&count); + pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); + + return count; +} + +static struct intel_uncore_ops ivbep_uncore_irp_ops = { + .init_box = ivbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = ivbep_uncore_irp_disable_event, + .enable_event = ivbep_uncore_irp_enable_event, + .read_counter = ivbep_uncore_irp_read_counter, +}; + +static struct intel_uncore_type ivbep_uncore_irp = { + .name = "irp", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_mask = IVBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &ivbep_uncore_irp_ops, + .format_group = &ivbep_uncore_format_group, +}; + +static struct intel_uncore_ops ivbep_uncore_qpi_ops = { + .init_box = ivbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_qpi_enable_event, + .read_counter = snbep_uncore_pci_read_counter, + .hw_config = snbep_qpi_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type ivbep_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 3, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &ivbep_uncore_qpi_ops, + .format_group = &ivbep_uncore_qpi_format_group, +}; + +static struct intel_uncore_type ivbep_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 44, + .constraints = 
snbep_uncore_r2pcie_constraints, + IVBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type ivbep_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes = 2, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r3qpi_constraints, + IVBEP_UNCORE_PCI_COMMON_INIT(), +}; + +enum { + IVBEP_PCI_UNCORE_HA, + IVBEP_PCI_UNCORE_IMC, + IVBEP_PCI_UNCORE_IRP, + IVBEP_PCI_UNCORE_QPI, + IVBEP_PCI_UNCORE_R2PCIE, + IVBEP_PCI_UNCORE_R3QPI, +}; + +static struct intel_uncore_type *ivbep_pci_uncores[] = { + [IVBEP_PCI_UNCORE_HA] = &ivbep_uncore_ha, + [IVBEP_PCI_UNCORE_IMC] = &ivbep_uncore_imc, + [IVBEP_PCI_UNCORE_IRP] = &ivbep_uncore_irp, + [IVBEP_PCI_UNCORE_QPI] = &ivbep_uncore_qpi, + [IVBEP_PCI_UNCORE_R2PCIE] = &ivbep_uncore_r2pcie, + [IVBEP_PCI_UNCORE_R3QPI] = &ivbep_uncore_r3qpi, + NULL, +}; + +static const struct pci_device_id ivbep_uncore_pci_ids[] = { + { /* Home Agent 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 0), + }, + { /* Home Agent 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 1), + }, + { /* MC0 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 0), + }, + { /* MC0 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 1), + }, + { /* MC0 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 2), + }, + { /* MC0 Channel 4 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 3), + }, + { /* MC1 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 4), + }, + { /* MC1 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 5), + }, + { /* MC1 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 6), + }, + { /* MC1 Channel 4 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 7), + }, + { /* IRP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IRP, 0), + }, + { /* QPI0 Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 0), + }, + { /* QPI0 Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 1), + }, + { /* QPI1 Port 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 2), + }, + { /* R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R2PCIE, 0), + }, + { /* R3QPI0 Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 0), + }, + { /* R3QPI0 Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 1), + }, + { /* R3QPI1 Link 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e), + .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 2), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96), + .driver_data = 
UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver ivbep_uncore_pci_driver = { + .name = "ivbep_uncore", + .id_table = ivbep_uncore_pci_ids, +}; + +int ivbep_uncore_pci_init(void) +{ + int ret = snbep_pci2phy_map_init(0x0e1e); + if (ret) + return ret; + uncore_pci_uncores = ivbep_pci_uncores; + uncore_pci_driver = &ivbep_uncore_pci_driver; + return 0; +} +/* end of IvyTown uncore support */ + +/* KNL uncore support */ +static struct attribute *knl_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct attribute_group knl_uncore_ubox_format_group = { + .name = "format", + .attrs = knl_uncore_ubox_formats_attr, +}; + +static struct intel_uncore_type knl_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = HSWEP_U_MSR_PMON_CTR0, + .event_ctl = HSWEP_U_MSR_PMON_CTL0, + .event_mask = KNL_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &knl_uncore_ubox_format_group, +}; + +static struct attribute *knl_uncore_cha_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_qor.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid4.attr, + &format_attr_filter_link3.attr, + &format_attr_filter_state4.attr, + &format_attr_filter_local.attr, + &format_attr_filter_all_op.attr, + &format_attr_filter_nnm.attr, + &format_attr_filter_opc3.attr, + &format_attr_filter_nc.attr, + &format_attr_filter_isoc.attr, + NULL, +}; + +static struct attribute_group knl_uncore_cha_format_group = { + .name = "format", + .attrs = knl_uncore_cha_formats_attr, +}; + +static struct event_constraint knl_uncore_cha_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x1f, 0x1), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct extra_reg knl_uncore_cha_extra_regs[] = { + SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, + SNBEP_CBO_PMON_CTL_TID_EN, 0x1), + SNBEP_CBO_EVENT_EXTRA_REG(0x3d, 0xff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x4), + EVENT_EXTRA_END +}; + +static u64 knl_cha_filter_mask(int fields) +{ + u64 mask = 0; + + if (fields & 0x1) + mask |= KNL_CHA_MSR_PMON_BOX_FILTER_TID; + if (fields & 0x2) + mask |= KNL_CHA_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x4) + mask |= KNL_CHA_MSR_PMON_BOX_FILTER_OP; + return mask; +} + +static struct event_constraint * +knl_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + return __snbep_cbox_get_constraint(box, event, knl_cha_filter_mask); +} + +static int knl_cha_hw_config(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct extra_reg *er; + int idx = 0; + + for (er = knl_uncore_cha_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + idx |= er->idx; + } + + if (idx) { + reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 + + KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & knl_cha_filter_mask(idx); + reg1->idx = idx; + } + return 0; 
+} + +static void hswep_cbox_enable_event(struct intel_uncore_box *box, + struct perf_event *event); + +static struct intel_uncore_ops knl_uncore_cha_ops = { + .init_box = snbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = hswep_cbox_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = knl_cha_hw_config, + .get_constraint = knl_cha_get_constraint, + .put_constraint = snbep_cbox_put_constraint, +}; + +static struct intel_uncore_type knl_uncore_cha = { + .name = "cha", + .num_counters = 4, + .num_boxes = 38, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_C0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, + .event_mask = KNL_CHA_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = KNL_CHA_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = knl_uncore_cha_constraints, + .ops = &knl_uncore_cha_ops, + .format_group = &knl_uncore_cha_format_group, +}; + +static struct attribute *knl_uncore_pcu_formats_attr[] = { + &format_attr_event2.attr, + &format_attr_use_occ_ctr.attr, + &format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh6.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge_det.attr, + NULL, +}; + +static struct attribute_group knl_uncore_pcu_format_group = { + .name = "format", + .attrs = knl_uncore_pcu_formats_attr, +}; + +static struct intel_uncore_type knl_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0, + .event_ctl = HSWEP_PCU_MSR_PMON_CTL0, + .event_mask = KNL_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &knl_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *knl_msr_uncores[] = { + &knl_uncore_ubox, + &knl_uncore_cha, + &knl_uncore_pcu, + NULL, +}; + +void knl_uncore_cpu_init(void) +{ + uncore_msr_uncores = knl_msr_uncores; +} + +static void knl_uncore_imc_enable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + + pci_write_config_dword(pdev, box_ctl, 0); +} + +static void knl_uncore_imc_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + if ((event->attr.config & SNBEP_PMON_CTL_EV_SEL_MASK) + == UNCORE_FIXED_EVENT) + pci_write_config_dword(pdev, hwc->config_base, + hwc->config | KNL_PMON_FIXED_CTL_EN); + else + pci_write_config_dword(pdev, hwc->config_base, + hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops knl_uncore_imc_ops = { + .init_box = snbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = knl_uncore_imc_enable_box, + .read_counter = snbep_uncore_pci_read_counter, + .enable_event = knl_uncore_imc_enable_event, + .disable_event = snbep_uncore_pci_disable_event, +}; + +static struct intel_uncore_type knl_uncore_imc_uclk = { + .name = "imc_uclk", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW, + .event_ctl = KNL_UCLK_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW, + .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL, + .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL, + .ops = 
&knl_uncore_imc_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct intel_uncore_type knl_uncore_imc_dclk = { + .name = "imc", + .num_counters = 4, + .num_boxes = 6, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = KNL_MC0_CH0_MSR_PMON_CTR0_LOW, + .event_ctl = KNL_MC0_CH0_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .fixed_ctr = KNL_MC0_CH0_MSR_PMON_FIXED_LOW, + .fixed_ctl = KNL_MC0_CH0_MSR_PMON_FIXED_CTL, + .box_ctl = KNL_MC0_CH0_MSR_PMON_BOX_CTL, + .ops = &knl_uncore_imc_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct intel_uncore_type knl_uncore_edc_uclk = { + .name = "edc_uclk", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW, + .event_ctl = KNL_UCLK_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW, + .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL, + .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL, + .ops = &knl_uncore_imc_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct intel_uncore_type knl_uncore_edc_eclk = { + .name = "edc_eclk", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW, + .event_ctl = KNL_EDC0_ECLK_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .fixed_ctr = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW, + .fixed_ctl = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL, + .box_ctl = KNL_EDC0_ECLK_MSR_PMON_BOX_CTL, + .ops = &knl_uncore_imc_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct event_constraint knl_uncore_m2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type knl_uncore_m2pcie = { + .name = "m2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .constraints = knl_uncore_m2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct attribute *knl_uncore_irp_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_qor.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute_group knl_uncore_irp_format_group = { + .name = "format", + .attrs = knl_uncore_irp_formats_attr, +}; + +static struct intel_uncore_type knl_uncore_irp = { + .name = "irp", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = KNL_IRP_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = KNL_IRP_PCI_PMON_BOX_CTL, + .ops = &snbep_uncore_pci_ops, + .format_group = &knl_uncore_irp_format_group, +}; + +enum { + KNL_PCI_UNCORE_MC_UCLK, + KNL_PCI_UNCORE_MC_DCLK, + KNL_PCI_UNCORE_EDC_UCLK, + KNL_PCI_UNCORE_EDC_ECLK, + KNL_PCI_UNCORE_M2PCIE, + KNL_PCI_UNCORE_IRP, +}; + +static struct intel_uncore_type *knl_pci_uncores[] = { + [KNL_PCI_UNCORE_MC_UCLK] = &knl_uncore_imc_uclk, + [KNL_PCI_UNCORE_MC_DCLK] = &knl_uncore_imc_dclk, + [KNL_PCI_UNCORE_EDC_UCLK] = &knl_uncore_edc_uclk, + [KNL_PCI_UNCORE_EDC_ECLK] = &knl_uncore_edc_eclk, + [KNL_PCI_UNCORE_M2PCIE] = &knl_uncore_m2pcie, + [KNL_PCI_UNCORE_IRP] = &knl_uncore_irp, + NULL, +}; + +/* + * KNL uses a common PCI device ID for multiple instances of an Uncore PMU + * device type. prior to KNL, each instance of a PMU device type had a unique + * device ID. 
+ * + * PCI Device ID Uncore PMU Devices + * ---------------------------------- + * 0x7841 MC0 UClk, MC1 UClk + * 0x7843 MC0 DClk CH 0, MC0 DClk CH 1, MC0 DClk CH 2, + * MC1 DClk CH 0, MC1 DClk CH 1, MC1 DClk CH 2 + * 0x7833 EDC0 UClk, EDC1 UClk, EDC2 UClk, EDC3 UClk, + * EDC4 UClk, EDC5 UClk, EDC6 UClk, EDC7 UClk + * 0x7835 EDC0 EClk, EDC1 EClk, EDC2 EClk, EDC3 EClk, + * EDC4 EClk, EDC5 EClk, EDC6 EClk, EDC7 EClk + * 0x7817 M2PCIe + * 0x7814 IRP +*/ + +static const struct pci_device_id knl_uncore_pci_ids[] = { + { /* MC UClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0), + }, + { /* MC DClk Channel */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0), + }, + { /* EDC UClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0), + }, + { /* EDC EClk */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0), + }, + { /* M2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_M2PCIE, 0), + }, + { /* IRP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7814), + .driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_IRP, 0), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver knl_uncore_pci_driver = { + .name = "knl_uncore", + .id_table = knl_uncore_pci_ids, +}; + +int knl_uncore_pci_init(void) +{ + int ret; + + /* All KNL PCI based PMON units are on the same PCI bus except IRP */ + ret = snb_pci2phy_map_init(0x7814); /* IRP */ + if (ret) + return ret; + ret = snb_pci2phy_map_init(0x7817); /* M2PCIe */ + if (ret) + return ret; + uncore_pci_uncores = knl_pci_uncores; + uncore_pci_driver = &knl_uncore_pci_driver; + return 0; +} + +/* end of KNL uncore support */ + +/* Haswell-EP uncore support */ +static struct attribute *hswep_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + &format_attr_filter_tid2.attr, + &format_attr_filter_cid.attr, + NULL, +}; + +static struct attribute_group hswep_uncore_ubox_format_group = { + .name = "format", + .attrs = hswep_uncore_ubox_formats_attr, +}; + +static int hswep_ubox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + reg1->reg = HSWEP_U_MSR_PMON_FILTER; + reg1->config = event->attr.config1 & HSWEP_U_MSR_PMON_BOX_FILTER_MASK; + reg1->idx = 0; + return 0; +} + +static struct intel_uncore_ops hswep_uncore_ubox_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = hswep_ubox_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type hswep_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = HSWEP_U_MSR_PMON_CTR0, + .event_ctl = HSWEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, + .num_shared_regs = 1, + .ops = &hswep_uncore_ubox_ops, + .format_group = &hswep_uncore_ubox_format_group, +}; + +static struct attribute *hswep_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid3.attr, + 
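The PCI ID tables in this file all stash two pieces of routing information in .driver_data: which uncore type the matched device belongs to and which instance (box) of that type it is, packed by UNCORE_PCI_DEV_DATA() and unpacked again by the generic probe code. The packing macro itself lives in the uncore header, which is not part of this hunk, so the sketch below uses its own DEMO_* helpers purely to illustrate the idea, not the real encoding.

    /* Illustration only: DEMO_* helpers stand in for the real packing macro. */
    #include <stdio.h>

    #define DEMO_DEV_DATA(type, idx)  (((unsigned long)(type) << 8) | (idx))
    #define DEMO_DEV_TYPE(data)       (((data) >> 8) & 0xff)
    #define DEMO_DEV_IDX(data)        ((data) & 0xff)

    enum { DEMO_UNCORE_HA, DEMO_UNCORE_IMC, DEMO_UNCORE_QPI };

    int main(void)
    {
        /* e.g. "MC1 Channel 2" -> IMC type, seventh IMC box on the socket */
        unsigned long data = DEMO_DEV_DATA(DEMO_UNCORE_IMC, 6);

        printf("type=%lu idx=%lu\n", DEMO_DEV_TYPE(data), DEMO_DEV_IDX(data));
        return 0;
    }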
&format_attr_filter_link2.attr, + &format_attr_filter_state3.attr, + &format_attr_filter_nid2.attr, + &format_attr_filter_opc2.attr, + &format_attr_filter_nc.attr, + &format_attr_filter_c6.attr, + &format_attr_filter_isoc.attr, + NULL, +}; + +static struct attribute_group hswep_uncore_cbox_format_group = { + .name = "format", + .attrs = hswep_uncore_cbox_formats_attr, +}; + +static struct event_constraint hswep_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x1), + UNCORE_EVENT_CONSTRAINT(0x09, 0x1), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), + UNCORE_EVENT_CONSTRAINT(0x3e, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct extra_reg hswep_uncore_cbox_extra_regs[] = { + SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, + SNBEP_CBO_PMON_CTL_TID_EN, 0x1), + SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4028, 0x40ff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4032, 0x40ff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4029, 0x40ff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4033, 0x40ff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x402A, 0x40ff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x12), + SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8), + EVENT_EXTRA_END +}; + +static u64 hswep_cbox_filter_mask(int fields) +{ + u64 mask = 0; + if (fields & 0x1) + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_TID; + if (fields & 0x2) + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK; + if (fields & 0x4) + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x8) + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NID; + if (fields & 0x10) { + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC; + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NC; + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_C6; + mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC; + } + return mask; +} + +static struct event_constraint * +hswep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + return __snbep_cbox_get_constraint(box, event, hswep_cbox_filter_mask); +} + +static int hswep_cbox_hw_config(struct intel_uncore_box *box, struct 
perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct extra_reg *er; + int idx = 0; + + for (er = hswep_uncore_cbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + idx |= er->idx; + } + + if (idx) { + reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 + + HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & hswep_cbox_filter_mask(idx); + reg1->idx = idx; + } + return 0; +} + +static void hswep_cbox_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + u64 filter = uncore_shared_reg_config(box, 0); + wrmsrl(reg1->reg, filter & 0xffffffff); + wrmsrl(reg1->reg + 1, filter >> 32); + } + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops hswep_uncore_cbox_ops = { + .init_box = snbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = hswep_cbox_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = hswep_cbox_hw_config, + .get_constraint = hswep_cbox_get_constraint, + .put_constraint = snbep_cbox_put_constraint, +}; + +static struct intel_uncore_type hswep_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 18, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_C0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = hswep_uncore_cbox_constraints, + .ops = &hswep_uncore_cbox_ops, + .format_group = &hswep_uncore_cbox_format_group, +}; + +/* + * Write SBOX Initialization register bit by bit to avoid spurious #GPs + */ +static void hswep_uncore_sbox_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + + if (msr) { + u64 init = SNBEP_PMON_BOX_CTL_INT; + u64 flags = 0; + int i; + + for_each_set_bit(i, (unsigned long *)&init, 64) { + flags |= (1ULL << i); + wrmsrl(msr, flags); + } + } +} + +static struct intel_uncore_ops hswep_uncore_sbox_msr_ops = { + __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .init_box = hswep_uncore_sbox_msr_init_box +}; + +static struct attribute *hswep_uncore_sbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute_group hswep_uncore_sbox_format_group = { + .name = "format", + .attrs = hswep_uncore_sbox_formats_attr, +}; + +static struct intel_uncore_type hswep_uncore_sbox = { + .name = "sbox", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 44, + .event_ctl = HSWEP_S0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_S0_MSR_PMON_CTR0, + .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_SBOX_MSR_OFFSET, + .ops = &hswep_uncore_sbox_msr_ops, + .format_group = &hswep_uncore_sbox_format_group, +}; + +static int hswep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK; + + if (ev_sel >= 0xb && ev_sel <= 0xe) { + reg1->reg = 
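A recurring pattern in these ops is that 64-bit PMON values are accessed as two 32-bit halves: hswep_cbox_enable_event() writes the shared filter value as a low dword to reg1->reg and a high dword to reg1->reg + 1, and the PCI read_counter helpers in this file rebuild a 64-bit count from two pci_read_config_dword() calls. The standalone sketch below shows just that split/reassemble arithmetic, using plain variables instead of MSRs or config space.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t filter = 0x0000123400abcdefULL;
        uint32_t lo = (uint32_t)(filter & 0xffffffff);  /* low-half write */
        uint32_t hi = (uint32_t)(filter >> 32);         /* high-half write */

        /* read side: two 32-bit reads glued back into one counter value */
        uint64_t count = (uint64_t)hi << 32 | lo;

        printf("lo=%#x hi=%#x reassembled=%#llx\n",
               lo, hi, (unsigned long long)count);
        return 0;
    }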
HSWEP_PCU_MSR_PMON_BOX_FILTER; + reg1->idx = ev_sel - 0xb; + reg1->config = event->attr.config1 & (0xff << reg1->idx); + } + return 0; +} + +static struct intel_uncore_ops hswep_uncore_pcu_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = hswep_pcu_hw_config, + .get_constraint = snbep_pcu_get_constraint, + .put_constraint = snbep_pcu_put_constraint, +}; + +static struct intel_uncore_type hswep_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0, + .event_ctl = HSWEP_PCU_MSR_PMON_CTL0, + .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &hswep_uncore_pcu_ops, + .format_group = &snbep_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *hswep_msr_uncores[] = { + &hswep_uncore_ubox, + &hswep_uncore_cbox, + &hswep_uncore_sbox, + &hswep_uncore_pcu, + NULL, +}; + +void hswep_uncore_cpu_init(void) +{ + if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + + /* Detect 6-8 core systems with only two SBOXes */ + if (uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3]) { + u32 capid4; + + pci_read_config_dword(uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3], + 0x94, &capid4); + if (((capid4 >> 6) & 0x3) == 0) + hswep_uncore_sbox.num_boxes = 2; + } + + uncore_msr_uncores = hswep_msr_uncores; +} + +static struct intel_uncore_type hswep_uncore_ha = { + .name = "ha", + .num_counters = 5, + .num_boxes = 2, + .perf_ctr_bits = 48, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct uncore_event_desc hswep_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x00,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type hswep_uncore_imc = { + .name = "imc", + .num_counters = 5, + .num_boxes = 8, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = hswep_uncore_imc_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static unsigned hswep_uncore_irp_ctrs[] = {0xa0, 0xa8, 0xb0, 0xb8}; + +static u64 hswep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx], (u32 *)&count); + pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); + + return count; +} + +static struct intel_uncore_ops hswep_uncore_irp_ops = { + .init_box = snbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = ivbep_uncore_irp_disable_event, + .enable_event = ivbep_uncore_irp_enable_event, + .read_counter = hswep_uncore_irp_read_counter, +}; + +static struct intel_uncore_type hswep_uncore_irp = { + .name = "irp", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &hswep_uncore_irp_ops, + .format_group = 
&snbep_uncore_format_group, +}; + +static struct intel_uncore_type hswep_uncore_qpi = { + .name = "qpi", + .num_counters = 5, + .num_boxes = 3, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_qpi_ops, + .format_group = &snbep_uncore_qpi_format_group, +}; + +static struct event_constraint hswep_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x1), + UNCORE_EVENT_CONSTRAINT(0x24, 0x1), + UNCORE_EVENT_CONSTRAINT(0x25, 0x1), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x27, 0x1), + UNCORE_EVENT_CONSTRAINT(0x28, 0x3), + UNCORE_EVENT_CONSTRAINT(0x29, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2a, 0x1), + UNCORE_EVENT_CONSTRAINT(0x2b, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x35, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type hswep_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .constraints = hswep_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct event_constraint hswep_uncore_r3qpi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x3), + UNCORE_EVENT_CONSTRAINT(0x07, 0x7), + UNCORE_EVENT_CONSTRAINT(0x08, 0x7), + UNCORE_EVENT_CONSTRAINT(0x09, 0x7), + UNCORE_EVENT_CONSTRAINT(0x0a, 0x7), + UNCORE_EVENT_CONSTRAINT(0x0e, 0x7), + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x14, 0x3), + UNCORE_EVENT_CONSTRAINT(0x15, 0x3), + UNCORE_EVENT_CONSTRAINT(0x1f, 0x3), + UNCORE_EVENT_CONSTRAINT(0x20, 0x3), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x22, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x28, 0x3), + UNCORE_EVENT_CONSTRAINT(0x29, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x3), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type hswep_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 4, + .num_boxes = 3, + .perf_ctr_bits = 44, + .constraints = hswep_uncore_r3qpi_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +enum { + HSWEP_PCI_UNCORE_HA, + HSWEP_PCI_UNCORE_IMC, + HSWEP_PCI_UNCORE_IRP, + HSWEP_PCI_UNCORE_QPI, + HSWEP_PCI_UNCORE_R2PCIE, + HSWEP_PCI_UNCORE_R3QPI, +}; + +static struct intel_uncore_type *hswep_pci_uncores[] = { + [HSWEP_PCI_UNCORE_HA] = &hswep_uncore_ha, + [HSWEP_PCI_UNCORE_IMC] = &hswep_uncore_imc, + [HSWEP_PCI_UNCORE_IRP] = &hswep_uncore_irp, + [HSWEP_PCI_UNCORE_QPI] = &hswep_uncore_qpi, + [HSWEP_PCI_UNCORE_R2PCIE] = &hswep_uncore_r2pcie, + [HSWEP_PCI_UNCORE_R3QPI] = &hswep_uncore_r3qpi, + NULL, +}; + +static 
const struct pci_device_id hswep_uncore_pci_ids[] = { + { /* Home Agent 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0), + }, + { /* Home Agent 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f38), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 1), + }, + { /* MC0 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb0), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 0), + }, + { /* MC0 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb1), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 1), + }, + { /* MC0 Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb4), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 2), + }, + { /* MC0 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb5), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 3), + }, + { /* MC1 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd0), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 4), + }, + { /* MC1 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd1), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 5), + }, + { /* MC1 Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd4), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 6), + }, + { /* MC1 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd5), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 7), + }, + { /* IRP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f39), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IRP, 0), + }, + { /* QPI0 Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f32), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 0), + }, + { /* QPI0 Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f33), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 1), + }, + { /* QPI1 Port 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3a), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 2), + }, + { /* R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f34), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R2PCIE, 0), + }, + { /* R3QPI0 Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f36), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 0), + }, + { /* R3QPI0 Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f37), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 1), + }, + { /* R3QPI1 Link 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3e), + .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 2), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), + }, + { /* QPI Port 1 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), + }, + { /* PCU.3 (for Capability registers) */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + HSWEP_PCI_PCU_3), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver hswep_uncore_pci_driver = { + .name = "hswep_uncore", + .id_table = hswep_uncore_pci_ids, +}; + +int hswep_uncore_pci_init(void) +{ + int ret = snbep_pci2phy_map_init(0x2f1e); + if (ret) + return ret; + uncore_pci_uncores = hswep_pci_uncores; + uncore_pci_driver = &hswep_uncore_pci_driver; + return 0; +} +/* end of Haswell-EP uncore support */ + +/* BDX uncore support */ + +static struct intel_uncore_type bdx_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits 
= 48, + .perf_ctr = HSWEP_U_MSR_PMON_CTR0, + .event_ctl = HSWEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, + .num_shared_regs = 1, + .ops = &ivbep_uncore_msr_ops, + .format_group = &ivbep_uncore_ubox_format_group, +}; + +static struct event_constraint bdx_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x09, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + UNCORE_EVENT_CONSTRAINT(0x3e, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type bdx_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 24, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_C0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = bdx_uncore_cbox_constraints, + .ops = &hswep_uncore_cbox_ops, + .format_group = &hswep_uncore_cbox_format_group, +}; + +static struct intel_uncore_type bdx_uncore_sbox = { + .name = "sbox", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_S0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_S0_MSR_PMON_CTR0, + .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_SBOX_MSR_OFFSET, + .ops = &hswep_uncore_sbox_msr_ops, + .format_group = &hswep_uncore_sbox_format_group, +}; + +static struct intel_uncore_type *bdx_msr_uncores[] = { + &bdx_uncore_ubox, + &bdx_uncore_cbox, + &bdx_uncore_sbox, + &hswep_uncore_pcu, + NULL, +}; + +void bdx_uncore_cpu_init(void) +{ + if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) + bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + uncore_msr_uncores = bdx_msr_uncores; +} + +static struct intel_uncore_type bdx_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type bdx_uncore_imc = { + .name = "imc", + .num_counters = 5, + .num_boxes = 8, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = hswep_uncore_imc_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type bdx_uncore_irp = { + .name = "irp", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &hswep_uncore_irp_ops, + .format_group = &snbep_uncore_format_group, +}; + +static struct intel_uncore_type bdx_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 3, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_qpi_ops, + .format_group = &snbep_uncore_qpi_format_group, +}; + +static struct event_constraint bdx_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x1), + UNCORE_EVENT_CONSTRAINT(0x25, 0x1), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x28, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type 
bdx_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .constraints = bdx_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct event_constraint bdx_uncore_r3qpi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x7), + UNCORE_EVENT_CONSTRAINT(0x07, 0x7), + UNCORE_EVENT_CONSTRAINT(0x08, 0x7), + UNCORE_EVENT_CONSTRAINT(0x09, 0x7), + UNCORE_EVENT_CONSTRAINT(0x0a, 0x7), + UNCORE_EVENT_CONSTRAINT(0x0e, 0x7), + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x14, 0x3), + UNCORE_EVENT_CONSTRAINT(0x15, 0x3), + UNCORE_EVENT_CONSTRAINT(0x1f, 0x3), + UNCORE_EVENT_CONSTRAINT(0x20, 0x3), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x22, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x28, 0x3), + UNCORE_EVENT_CONSTRAINT(0x29, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x3), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type bdx_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes = 3, + .perf_ctr_bits = 48, + .constraints = bdx_uncore_r3qpi_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +enum { + BDX_PCI_UNCORE_HA, + BDX_PCI_UNCORE_IMC, + BDX_PCI_UNCORE_IRP, + BDX_PCI_UNCORE_QPI, + BDX_PCI_UNCORE_R2PCIE, + BDX_PCI_UNCORE_R3QPI, +}; + +static struct intel_uncore_type *bdx_pci_uncores[] = { + [BDX_PCI_UNCORE_HA] = &bdx_uncore_ha, + [BDX_PCI_UNCORE_IMC] = &bdx_uncore_imc, + [BDX_PCI_UNCORE_IRP] = &bdx_uncore_irp, + [BDX_PCI_UNCORE_QPI] = &bdx_uncore_qpi, + [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie, + [BDX_PCI_UNCORE_R3QPI] = &bdx_uncore_r3qpi, + NULL, +}; + +static const struct pci_device_id bdx_uncore_pci_ids[] = { + { /* Home Agent 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0), + }, + { /* Home Agent 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1), + }, + { /* MC0 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0), + }, + { /* MC0 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1), + }, + { /* MC0 Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb4), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 2), + }, + { /* MC0 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb5), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 3), + }, + { /* MC1 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd0), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 4), + }, + { /* MC1 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd1), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 5), + }, + { /* MC1 Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd4), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 6), + }, + { /* MC1 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd5), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 7), + }, + { /* IRP */ + 
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0), + }, + { /* QPI0 Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f32), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 0), + }, + { /* QPI0 Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f33), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 1), + }, + { /* QPI1 Port 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3a), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 2), + }, + { /* R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0), + }, + { /* R3QPI0 Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f36), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 0), + }, + { /* R3QPI0 Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f37), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 1), + }, + { /* R3QPI1 Link 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3e), + .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 2), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0), + }, + { /* QPI Port 1 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1), + }, + { /* QPI Port 2 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver bdx_uncore_pci_driver = { + .name = "bdx_uncore", + .id_table = bdx_uncore_pci_ids, +}; + +int bdx_uncore_pci_init(void) +{ + int ret = snbep_pci2phy_map_init(0x6f1e); + + if (ret) + return ret; + uncore_pci_uncores = bdx_pci_uncores; + uncore_pci_driver = &bdx_uncore_pci_driver; + return 0; +} + +/* end of BDX uncore support */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 696d106..973b77b 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,7 +33,6 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o -obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore_snbep.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o endif diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c deleted file mode 100644 index 188e18a..0000000 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ /dev/null @@ -1,3125 +0,0 @@ -/* SandyBridge-EP/IvyTown uncore support */ -#include "../../events/intel/uncore.h" - -/* SNB-EP Box level control */ -#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0) -#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1) -#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8) -#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16) -#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ - SNBEP_PMON_BOX_CTL_RST_CTRS | \ - SNBEP_PMON_BOX_CTL_FRZ_EN) -/* SNB-EP event control */ -#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff -#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00 -#define SNBEP_PMON_CTL_RST (1 << 17) -#define SNBEP_PMON_CTL_EDGE_DET (1 << 18) -#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21) -#define SNBEP_PMON_CTL_EN (1 << 22) -#define SNBEP_PMON_CTL_INVERT (1 << 23) -#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000 -#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_UMASK_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - 
SNBEP_PMON_CTL_INVERT | \ - SNBEP_PMON_CTL_TRESH_MASK) - -/* SNB-EP Ubox event control */ -#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000 -#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_UMASK_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PMON_CTL_INVERT | \ - SNBEP_U_MSR_PMON_CTL_TRESH_MASK) - -#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19) -#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ - SNBEP_CBO_PMON_CTL_TID_EN) - -/* SNB-EP PCU event control */ -#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000 -#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000 -#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30) -#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31) -#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ - SNBEP_PMON_CTL_INVERT | \ - SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) - -#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_RAW_EVENT_MASK | \ - SNBEP_PMON_CTL_EV_SEL_EXT) - -/* SNB-EP pci control register */ -#define SNBEP_PCI_PMON_BOX_CTL 0xf4 -#define SNBEP_PCI_PMON_CTL0 0xd8 -/* SNB-EP pci counter register */ -#define SNBEP_PCI_PMON_CTR0 0xa0 - -/* SNB-EP home agent register */ -#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40 -#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44 -#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48 -/* SNB-EP memory controller register */ -#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0 -#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0 -/* SNB-EP QPI register */ -#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228 -#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c -#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238 -#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c - -/* SNB-EP Ubox register */ -#define SNBEP_U_MSR_PMON_CTR0 0xc16 -#define SNBEP_U_MSR_PMON_CTL0 0xc10 - -#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08 -#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09 - -/* SNB-EP Cbo register */ -#define SNBEP_C0_MSR_PMON_CTR0 0xd16 -#define SNBEP_C0_MSR_PMON_CTL0 0xd10 -#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04 -#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14 -#define SNBEP_CBO_MSR_OFFSET 0x20 - -#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID 0x1f -#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID 0x3fc00 -#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE 0x7c0000 -#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC 0xff800000 - -#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) { \ - .event = (e), \ - .msr = SNBEP_C0_MSR_PMON_BOX_FILTER, \ - .config_mask = (m), \ - .idx = (i) \ -} - -/* SNB-EP PCU register */ -#define SNBEP_PCU_MSR_PMON_CTR0 0xc36 -#define SNBEP_PCU_MSR_PMON_CTL0 0xc30 -#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24 -#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34 -#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff -#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc -#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd - -/* IVBEP event control */ -#define IVBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ - SNBEP_PMON_BOX_CTL_RST_CTRS) -#define IVBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_UMASK_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PMON_CTL_TRESH_MASK) -/* IVBEP Ubox */ -#define IVBEP_U_MSR_PMON_GLOBAL_CTL 0xc00 -#define IVBEP_U_PMON_GLOBAL_FRZ_ALL (1 << 31) -#define IVBEP_U_PMON_GLOBAL_UNFRZ_ALL (1 << 29) - -#define IVBEP_U_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_UMASK_MASK | \ - 
SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_U_MSR_PMON_CTL_TRESH_MASK) -/* IVBEP Cbo */ -#define IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK (IVBEP_PMON_RAW_EVENT_MASK | \ - SNBEP_CBO_PMON_CTL_TID_EN) - -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_TID (0x1fULL << 0) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 5) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE (0x3fULL << 17) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62) -#define IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63) - -/* IVBEP home agent */ -#define IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST (1 << 16) -#define IVBEP_HA_PCI_PMON_RAW_EVENT_MASK \ - (IVBEP_PMON_RAW_EVENT_MASK | \ - IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST) -/* IVBEP PCU */ -#define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_PMON_CTL_EV_SEL_MASK | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) -/* IVBEP QPI */ -#define IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK \ - (IVBEP_PMON_RAW_EVENT_MASK | \ - SNBEP_PMON_CTL_EV_SEL_EXT) - -#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \ - ((1ULL << (n)) - 1))) - -/* Haswell-EP Ubox */ -#define HSWEP_U_MSR_PMON_CTR0 0x709 -#define HSWEP_U_MSR_PMON_CTL0 0x705 -#define HSWEP_U_MSR_PMON_FILTER 0x707 - -#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL 0x703 -#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTR 0x704 - -#define HSWEP_U_MSR_PMON_BOX_FILTER_TID (0x1 << 0) -#define HSWEP_U_MSR_PMON_BOX_FILTER_CID (0x1fULL << 1) -#define HSWEP_U_MSR_PMON_BOX_FILTER_MASK \ - (HSWEP_U_MSR_PMON_BOX_FILTER_TID | \ - HSWEP_U_MSR_PMON_BOX_FILTER_CID) - -/* Haswell-EP CBo */ -#define HSWEP_C0_MSR_PMON_CTR0 0xe08 -#define HSWEP_C0_MSR_PMON_CTL0 0xe01 -#define HSWEP_C0_MSR_PMON_BOX_CTL 0xe00 -#define HSWEP_C0_MSR_PMON_BOX_FILTER0 0xe05 -#define HSWEP_CBO_MSR_OFFSET 0x10 - - -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_TID (0x3fULL << 0) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 6) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE (0x7fULL << 17) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62) -#define HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63) - - -/* Haswell-EP Sbox */ -#define HSWEP_S0_MSR_PMON_CTR0 0x726 -#define HSWEP_S0_MSR_PMON_CTL0 0x721 -#define HSWEP_S0_MSR_PMON_BOX_CTL 0x720 -#define HSWEP_SBOX_MSR_OFFSET 0xa -#define HSWEP_S_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ - SNBEP_CBO_PMON_CTL_TID_EN) - -/* Haswell-EP PCU */ -#define HSWEP_PCU_MSR_PMON_CTR0 0x717 -#define HSWEP_PCU_MSR_PMON_CTL0 0x711 -#define HSWEP_PCU_MSR_PMON_BOX_CTL 0x710 -#define HSWEP_PCU_MSR_PMON_BOX_FILTER 0x715 - -/* KNL Ubox */ -#define KNL_U_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_U_MSR_PMON_RAW_EVENT_MASK | \ - SNBEP_CBO_PMON_CTL_TID_EN) -/* KNL CHA */ -#define KNL_CHA_MSR_OFFSET 0xc -#define KNL_CHA_MSR_PMON_CTL_QOR (1 << 16) -#define KNL_CHA_MSR_PMON_RAW_EVENT_MASK \ - (SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK | \ - KNL_CHA_MSR_PMON_CTL_QOR) -#define KNL_CHA_MSR_PMON_BOX_FILTER_TID 0x1ff -#define KNL_CHA_MSR_PMON_BOX_FILTER_STATE (7 << 18) -#define KNL_CHA_MSR_PMON_BOX_FILTER_OP (0xfffffe2aULL << 32) - -/* KNL EDC/MC UCLK */ -#define 
KNL_UCLK_MSR_PMON_CTR0_LOW 0x400 -#define KNL_UCLK_MSR_PMON_CTL0 0x420 -#define KNL_UCLK_MSR_PMON_BOX_CTL 0x430 -#define KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW 0x44c -#define KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL 0x454 -#define KNL_PMON_FIXED_CTL_EN 0x1 - -/* KNL EDC */ -#define KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW 0xa00 -#define KNL_EDC0_ECLK_MSR_PMON_CTL0 0xa20 -#define KNL_EDC0_ECLK_MSR_PMON_BOX_CTL 0xa30 -#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW 0xa3c -#define KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL 0xa44 - -/* KNL MC */ -#define KNL_MC0_CH0_MSR_PMON_CTR0_LOW 0xb00 -#define KNL_MC0_CH0_MSR_PMON_CTL0 0xb20 -#define KNL_MC0_CH0_MSR_PMON_BOX_CTL 0xb30 -#define KNL_MC0_CH0_MSR_PMON_FIXED_LOW 0xb3c -#define KNL_MC0_CH0_MSR_PMON_FIXED_CTL 0xb44 - -/* KNL IRP */ -#define KNL_IRP_PCI_PMON_BOX_CTL 0xf0 -#define KNL_IRP_PCI_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ - KNL_CHA_MSR_PMON_CTL_QOR) -/* KNL PCU */ -#define KNL_PCU_PMON_CTL_EV_SEL_MASK 0x0000007f -#define KNL_PCU_PMON_CTL_USE_OCC_CTR (1 << 7) -#define KNL_PCU_MSR_PMON_CTL_TRESH_MASK 0x3f000000 -#define KNL_PCU_MSR_PMON_RAW_EVENT_MASK \ - (KNL_PCU_PMON_CTL_EV_SEL_MASK | \ - KNL_PCU_PMON_CTL_USE_OCC_CTR | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ - SNBEP_PMON_CTL_EDGE_DET | \ - SNBEP_CBO_PMON_CTL_TID_EN | \ - SNBEP_PMON_CTL_EV_SEL_EXT | \ - SNBEP_PMON_CTL_INVERT | \ - KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ - SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) - -DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); -DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6"); -DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); -DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7"); -DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); -DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16"); -DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); -DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); -DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); -DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); -DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29"); -DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); -DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); -DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); -DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); -DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31"); -DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); -DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0"); -DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5"); -DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8"); -DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5"); -DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8"); -DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8"); -DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12"); -DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); -DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47"); -DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); -DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22"); -DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23"); -DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20"); -DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33"); -DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35"); 
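The DEFINE_UNCORE_FORMAT_ATTR() lines here only publish strings such as "config:24-31" or "config1:52-60" through sysfs; userspace perf tooling parses the bit range and shifts the user's field value into place when it assembles the raw event config. The sketch below shows that bit-range arithmetic in isolation; it is not the kernel's or perf's actual parser.

    #include <stdint.h>
    #include <stdio.h>

    /* Place 'value' into bits lo..hi (inclusive) of a config word. */
    static uint64_t demo_place_field(uint64_t config, int lo, int hi, uint64_t value)
    {
        uint64_t width_mask = (hi - lo == 63) ? ~0ULL : ((1ULL << (hi - lo + 1)) - 1);

        return (config & ~(width_mask << lo)) | ((value & width_mask) << lo);
    }

    int main(void)
    {
        uint64_t config = 0;

        config = demo_place_field(config, 0, 7, 0x37);    /* "event"  -> config:0-7   */
        config = demo_place_field(config, 8, 15, 0x01);   /* "umask"  -> config:8-15  */
        config = demo_place_field(config, 24, 31, 0x10);  /* "thresh" -> config:24-31 */

        printf("config=%#llx\n", (unsigned long long)config);
        return 0;
    }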
-DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37"); -DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); -DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60"); -DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60"); -DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62"); -DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61"); -DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63"); -DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7"); -DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15"); -DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23"); -DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31"); -DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51"); -DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35"); -DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31"); -DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17"); -DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12"); -DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8"); -DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4"); -DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63"); -DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51"); -DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35"); -DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31"); -DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17"); -DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12"); -DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8"); -DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4"); -DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31"); -DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63"); - -static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - int box_ctl = uncore_pci_box_ctl(box); - u32 config = 0; - - if (!pci_read_config_dword(pdev, box_ctl, &config)) { - config |= SNBEP_PMON_BOX_CTL_FRZ; - pci_write_config_dword(pdev, box_ctl, config); - } -} - -static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - int box_ctl = uncore_pci_box_ctl(box); - u32 config = 0; - - if (!pci_read_config_dword(pdev, box_ctl, &config)) { - config &= ~SNBEP_PMON_BOX_CTL_FRZ; - pci_write_config_dword(pdev, box_ctl, config); - } -} - -static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - - pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); -} - -static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - - pci_write_config_dword(pdev, hwc->config_base, hwc->config); -} - -static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - u64 count = 0; - - pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); - pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); - - return count; -} - -static void snbep_uncore_pci_init_box(struct intel_uncore_box *box) -{ - struct pci_dev 
*pdev = box->pci_dev; - int box_ctl = uncore_pci_box_ctl(box); - - pci_write_config_dword(pdev, box_ctl, SNBEP_PMON_BOX_CTL_INT); -} - -static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box) -{ - u64 config; - unsigned msr; - - msr = uncore_msr_box_ctl(box); - if (msr) { - rdmsrl(msr, config); - config |= SNBEP_PMON_BOX_CTL_FRZ; - wrmsrl(msr, config); - } -} - -static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box) -{ - u64 config; - unsigned msr; - - msr = uncore_msr_box_ctl(box); - if (msr) { - rdmsrl(msr, config); - config &= ~SNBEP_PMON_BOX_CTL_FRZ; - wrmsrl(msr, config); - } -} - -static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - - if (reg1->idx != EXTRA_REG_NONE) - wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0)); - - wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); -} - -static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box, - struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - wrmsrl(hwc->config_base, hwc->config); -} - -static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) -{ - unsigned msr = uncore_msr_box_ctl(box); - - if (msr) - wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT); -} - -static struct attribute *snbep_uncore_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - NULL, -}; - -static struct attribute *snbep_uncore_ubox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh5.attr, - NULL, -}; - -static struct attribute *snbep_uncore_cbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - &format_attr_filter_tid.attr, - &format_attr_filter_nid.attr, - &format_attr_filter_state.attr, - &format_attr_filter_opc.attr, - NULL, -}; - -static struct attribute *snbep_uncore_pcu_formats_attr[] = { - &format_attr_event_ext.attr, - &format_attr_occ_sel.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh5.attr, - &format_attr_occ_invert.attr, - &format_attr_occ_edge.attr, - &format_attr_filter_band0.attr, - &format_attr_filter_band1.attr, - &format_attr_filter_band2.attr, - &format_attr_filter_band3.attr, - NULL, -}; - -static struct attribute *snbep_uncore_qpi_formats_attr[] = { - &format_attr_event_ext.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - &format_attr_match_rds.attr, - &format_attr_match_rnid30.attr, - &format_attr_match_rnid4.attr, - &format_attr_match_dnid.attr, - &format_attr_match_mc.attr, - &format_attr_match_opc.attr, - &format_attr_match_vnw.attr, - &format_attr_match0.attr, - &format_attr_match1.attr, - &format_attr_mask_rds.attr, - &format_attr_mask_rnid30.attr, - &format_attr_mask_rnid4.attr, - &format_attr_mask_dnid.attr, - &format_attr_mask_mc.attr, - &format_attr_mask_opc.attr, - &format_attr_mask_vnw.attr, - &format_attr_mask0.attr, - &format_attr_mask1.attr, - NULL, -}; - -static struct uncore_event_desc snbep_uncore_imc_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), - INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), - INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, 
"6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), - INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), - { /* end: all zeroes */ }, -}; - -static struct uncore_event_desc snbep_uncore_qpi_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"), - INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), - INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"), - INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"), - { /* end: all zeroes */ }, -}; - -static struct attribute_group snbep_uncore_format_group = { - .name = "format", - .attrs = snbep_uncore_formats_attr, -}; - -static struct attribute_group snbep_uncore_ubox_format_group = { - .name = "format", - .attrs = snbep_uncore_ubox_formats_attr, -}; - -static struct attribute_group snbep_uncore_cbox_format_group = { - .name = "format", - .attrs = snbep_uncore_cbox_formats_attr, -}; - -static struct attribute_group snbep_uncore_pcu_format_group = { - .name = "format", - .attrs = snbep_uncore_pcu_formats_attr, -}; - -static struct attribute_group snbep_uncore_qpi_format_group = { - .name = "format", - .attrs = snbep_uncore_qpi_formats_attr, -}; - -#define __SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ - .disable_box = snbep_uncore_msr_disable_box, \ - .enable_box = snbep_uncore_msr_enable_box, \ - .disable_event = snbep_uncore_msr_disable_event, \ - .enable_event = snbep_uncore_msr_enable_event, \ - .read_counter = uncore_msr_read_counter - -#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ - __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), \ - .init_box = snbep_uncore_msr_init_box \ - -static struct intel_uncore_ops snbep_uncore_msr_ops = { - SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), -}; - -#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \ - .init_box = snbep_uncore_pci_init_box, \ - .disable_box = snbep_uncore_pci_disable_box, \ - .enable_box = snbep_uncore_pci_enable_box, \ - .disable_event = snbep_uncore_pci_disable_event, \ - .read_counter = snbep_uncore_pci_read_counter - -static struct intel_uncore_ops snbep_uncore_pci_ops = { - SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), - .enable_event = snbep_uncore_pci_enable_event, \ -}; - -static struct event_constraint snbep_uncore_cbox_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x01, 0x1), - UNCORE_EVENT_CONSTRAINT(0x02, 0x3), - UNCORE_EVENT_CONSTRAINT(0x04, 0x3), - UNCORE_EVENT_CONSTRAINT(0x05, 0x3), - UNCORE_EVENT_CONSTRAINT(0x07, 0x3), - UNCORE_EVENT_CONSTRAINT(0x09, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x1), - UNCORE_EVENT_CONSTRAINT(0x12, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x3), - UNCORE_EVENT_CONSTRAINT(0x1b, 0xc), - UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), - UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), - UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), - EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff), - UNCORE_EVENT_CONSTRAINT(0x21, 0x3), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x31, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x35, 0x3), - UNCORE_EVENT_CONSTRAINT(0x36, 0x1), - UNCORE_EVENT_CONSTRAINT(0x37, 0x3), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x39, 0x3), - UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), - EVENT_CONSTRAINT_END -}; - -static struct event_constraint snbep_uncore_r2pcie_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x12, 0x1), - 
UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x24, 0x3), - UNCORE_EVENT_CONSTRAINT(0x25, 0x3), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct event_constraint snbep_uncore_r3qpi_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x12, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x20, 0x3), - UNCORE_EVENT_CONSTRAINT(0x21, 0x3), - UNCORE_EVENT_CONSTRAINT(0x22, 0x3), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x24, 0x3), - UNCORE_EVENT_CONSTRAINT(0x25, 0x3), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x29, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2a, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2b, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x30, 0x3), - UNCORE_EVENT_CONSTRAINT(0x31, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x36, 0x3), - UNCORE_EVENT_CONSTRAINT(0x37, 0x3), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x39, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type snbep_uncore_ubox = { - .name = "ubox", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 44, - .fixed_ctr_bits = 48, - .perf_ctr = SNBEP_U_MSR_PMON_CTR0, - .event_ctl = SNBEP_U_MSR_PMON_CTL0, - .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, - .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, - .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, - .ops = &snbep_uncore_msr_ops, - .format_group = &snbep_uncore_ubox_format_group, -}; - -static struct extra_reg snbep_uncore_cbox_extra_regs[] = { - SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, - SNBEP_CBO_PMON_CTL_TID_EN, 0x1), - SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6), - SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6), - SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6), - SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6), - SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa), - SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa), - SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa), - SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa), - SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2), - EVENT_EXTRA_END -}; - -static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct intel_uncore_extra_reg *er = &box->shared_regs[0]; - int i; - - if (uncore_box_is_fake(box)) - return; - - for (i = 0; i < 5; i++) { - if 
(reg1->alloc & (0x1 << i)) - atomic_sub(1 << (i * 6), &er->ref); - } - reg1->alloc = 0; -} - -static struct event_constraint * -__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event, - u64 (*cbox_filter_mask)(int fields)) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct intel_uncore_extra_reg *er = &box->shared_regs[0]; - int i, alloc = 0; - unsigned long flags; - u64 mask; - - if (reg1->idx == EXTRA_REG_NONE) - return NULL; - - raw_spin_lock_irqsave(&er->lock, flags); - for (i = 0; i < 5; i++) { - if (!(reg1->idx & (0x1 << i))) - continue; - if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) - continue; - - mask = cbox_filter_mask(0x1 << i); - if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) || - !((reg1->config ^ er->config) & mask)) { - atomic_add(1 << (i * 6), &er->ref); - er->config &= ~mask; - er->config |= reg1->config & mask; - alloc |= (0x1 << i); - } else { - break; - } - } - raw_spin_unlock_irqrestore(&er->lock, flags); - if (i < 5) - goto fail; - - if (!uncore_box_is_fake(box)) - reg1->alloc |= alloc; - - return NULL; -fail: - for (; i >= 0; i--) { - if (alloc & (0x1 << i)) - atomic_sub(1 << (i * 6), &er->ref); - } - return &uncore_constraint_empty; -} - -static u64 snbep_cbox_filter_mask(int fields) -{ - u64 mask = 0; - - if (fields & 0x1) - mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID; - if (fields & 0x2) - mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID; - if (fields & 0x4) - mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE; - if (fields & 0x8) - mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC; - - return mask; -} - -static struct event_constraint * -snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask); -} - -static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct extra_reg *er; - int idx = 0; - - for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) { - if (er->event != (event->hw.config & er->config_mask)) - continue; - idx |= er->idx; - } - - if (idx) { - reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + - SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; - reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx); - reg1->idx = idx; - } - return 0; -} - -static struct intel_uncore_ops snbep_uncore_cbox_ops = { - SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), - .hw_config = snbep_cbox_hw_config, - .get_constraint = snbep_cbox_get_constraint, - .put_constraint = snbep_cbox_put_constraint, -}; - -static struct intel_uncore_type snbep_uncore_cbox = { - .name = "cbox", - .num_counters = 4, - .num_boxes = 8, - .perf_ctr_bits = 44, - .event_ctl = SNBEP_C0_MSR_PMON_CTL0, - .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, - .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, - .msr_offset = SNBEP_CBO_MSR_OFFSET, - .num_shared_regs = 1, - .constraints = snbep_uncore_cbox_constraints, - .ops = &snbep_uncore_cbox_ops, - .format_group = &snbep_uncore_cbox_format_group, -}; - -static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - u64 config = reg1->config; - - if (new_idx > reg1->idx) - config <<= 8 * (new_idx - reg1->idx); - else - config >>= 8 * (reg1->idx - new_idx); - - if (modify) { - hwc->config += new_idx - reg1->idx; - reg1->config = config; - reg1->idx = new_idx; - } - return config; -} - -static 
struct event_constraint * -snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct intel_uncore_extra_reg *er = &box->shared_regs[0]; - unsigned long flags; - int idx = reg1->idx; - u64 mask, config1 = reg1->config; - bool ok = false; - - if (reg1->idx == EXTRA_REG_NONE || - (!uncore_box_is_fake(box) && reg1->alloc)) - return NULL; -again: - mask = 0xffULL << (idx * 8); - raw_spin_lock_irqsave(&er->lock, flags); - if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) || - !((config1 ^ er->config) & mask)) { - atomic_add(1 << (idx * 8), &er->ref); - er->config &= ~mask; - er->config |= config1 & mask; - ok = true; - } - raw_spin_unlock_irqrestore(&er->lock, flags); - - if (!ok) { - idx = (idx + 1) % 4; - if (idx != reg1->idx) { - config1 = snbep_pcu_alter_er(event, idx, false); - goto again; - } - return &uncore_constraint_empty; - } - - if (!uncore_box_is_fake(box)) { - if (idx != reg1->idx) - snbep_pcu_alter_er(event, idx, true); - reg1->alloc = 1; - } - return NULL; -} - -static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct intel_uncore_extra_reg *er = &box->shared_regs[0]; - - if (uncore_box_is_fake(box) || !reg1->alloc) - return; - - atomic_sub(1 << (reg1->idx * 8), &er->ref); - reg1->alloc = 0; -} - -static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK; - - if (ev_sel >= 0xb && ev_sel <= 0xe) { - reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER; - reg1->idx = ev_sel - 0xb; - reg1->config = event->attr.config1 & (0xff << (reg1->idx * 8)); - } - return 0; -} - -static struct intel_uncore_ops snbep_uncore_pcu_ops = { - SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), - .hw_config = snbep_pcu_hw_config, - .get_constraint = snbep_pcu_get_constraint, - .put_constraint = snbep_pcu_put_constraint, -}; - -static struct intel_uncore_type snbep_uncore_pcu = { - .name = "pcu", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, - .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, - .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &snbep_uncore_pcu_ops, - .format_group = &snbep_uncore_pcu_format_group, -}; - -static struct intel_uncore_type *snbep_msr_uncores[] = { - &snbep_uncore_ubox, - &snbep_uncore_cbox, - &snbep_uncore_pcu, - NULL, -}; - -void snbep_uncore_cpu_init(void) -{ - if (snbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - snbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; - uncore_msr_uncores = snbep_msr_uncores; -} - -enum { - SNBEP_PCI_QPI_PORT0_FILTER, - SNBEP_PCI_QPI_PORT1_FILTER, - HSWEP_PCI_PCU_3, -}; - -static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) { - reg1->idx = 0; - reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0; - reg1->config = event->attr.config1; - reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0; - reg2->config = event->attr.config2; - } - return 0; -} - -static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event 
*event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - struct hw_perf_event_extra *reg2 = &hwc->branch_reg; - - if (reg1->idx != EXTRA_REG_NONE) { - int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER; - struct pci_dev *filter_pdev = uncore_extra_pci_dev[box->phys_id][idx]; - if (filter_pdev) { - pci_write_config_dword(filter_pdev, reg1->reg, - (u32)reg1->config); - pci_write_config_dword(filter_pdev, reg1->reg + 4, - (u32)(reg1->config >> 32)); - pci_write_config_dword(filter_pdev, reg2->reg, - (u32)reg2->config); - pci_write_config_dword(filter_pdev, reg2->reg + 4, - (u32)(reg2->config >> 32)); - } - } - - pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); -} - -static struct intel_uncore_ops snbep_uncore_qpi_ops = { - SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), - .enable_event = snbep_qpi_enable_event, - .hw_config = snbep_qpi_hw_config, - .get_constraint = uncore_get_constraint, - .put_constraint = uncore_put_constraint, -}; - -#define SNBEP_UNCORE_PCI_COMMON_INIT() \ - .perf_ctr = SNBEP_PCI_PMON_CTR0, \ - .event_ctl = SNBEP_PCI_PMON_CTL0, \ - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \ - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ - .ops = &snbep_uncore_pci_ops, \ - .format_group = &snbep_uncore_format_group - -static struct intel_uncore_type snbep_uncore_ha = { - .name = "ha", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type snbep_uncore_imc = { - .name = "imc", - .num_counters = 4, - .num_boxes = 4, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, - .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, - .event_descs = snbep_uncore_imc_events, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type snbep_uncore_qpi = { - .name = "qpi", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &snbep_uncore_qpi_ops, - .event_descs = snbep_uncore_qpi_events, - .format_group = &snbep_uncore_qpi_format_group, -}; - - -static struct intel_uncore_type snbep_uncore_r2pcie = { - .name = "r2pcie", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 44, - .constraints = snbep_uncore_r2pcie_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type snbep_uncore_r3qpi = { - .name = "r3qpi", - .num_counters = 3, - .num_boxes = 2, - .perf_ctr_bits = 44, - .constraints = snbep_uncore_r3qpi_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -enum { - SNBEP_PCI_UNCORE_HA, - SNBEP_PCI_UNCORE_IMC, - SNBEP_PCI_UNCORE_QPI, - SNBEP_PCI_UNCORE_R2PCIE, - SNBEP_PCI_UNCORE_R3QPI, -}; - -static struct intel_uncore_type *snbep_pci_uncores[] = { - [SNBEP_PCI_UNCORE_HA] = &snbep_uncore_ha, - [SNBEP_PCI_UNCORE_IMC] = &snbep_uncore_imc, - [SNBEP_PCI_UNCORE_QPI] = &snbep_uncore_qpi, - [SNBEP_PCI_UNCORE_R2PCIE] = &snbep_uncore_r2pcie, - [SNBEP_PCI_UNCORE_R3QPI] = &snbep_uncore_r3qpi, - NULL, -}; - -static const struct pci_device_id snbep_uncore_pci_ids[] = { - { /* Home Agent */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0), - }, - { /* MC Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), - .driver_data = 
UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0), - }, - { /* MC Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1), - }, - { /* MC Channel 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2), - }, - { /* MC Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3), - }, - { /* QPI Port 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0), - }, - { /* QPI Port 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1), - }, - { /* R2PCIe */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0), - }, - { /* R3QPI Link 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0), - }, - { /* R3QPI Link 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), - .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT0_FILTER), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT1_FILTER), - }, - { /* end: all zeroes */ } -}; - -static struct pci_driver snbep_uncore_pci_driver = { - .name = "snbep_uncore", - .id_table = snbep_uncore_pci_ids, -}; - -/* - * build pci bus to socket mapping - */ -static int snbep_pci2phy_map_init(int devid) -{ - struct pci_dev *ubox_dev = NULL; - int i, bus, nodeid, segment; - struct pci2phy_map *map; - int err = 0; - u32 config = 0; - - while (1) { - /* find the UBOX device */ - ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev); - if (!ubox_dev) - break; - bus = ubox_dev->bus->number; - /* get the Node ID of the local register */ - err = pci_read_config_dword(ubox_dev, 0x40, &config); - if (err) - break; - nodeid = config; - /* get the Node ID mapping */ - err = pci_read_config_dword(ubox_dev, 0x54, &config); - if (err) - break; - - segment = pci_domain_nr(ubox_dev->bus); - raw_spin_lock(&pci2phy_map_lock); - map = __find_pci2phy_map(segment); - if (!map) { - raw_spin_unlock(&pci2phy_map_lock); - err = -ENOMEM; - break; - } - - /* - * every three bits in the Node ID mapping register maps - * to a particular node. - */ - for (i = 0; i < 8; i++) { - if (nodeid == ((config >> (3 * i)) & 0x7)) { - map->pbus_to_physid[bus] = i; - break; - } - } - raw_spin_unlock(&pci2phy_map_lock); - } - - if (!err) { - /* - * For PCI bus with no UBOX device, find the next bus - * that has UBOX device and use its mapping. - */ - raw_spin_lock(&pci2phy_map_lock); - list_for_each_entry(map, &pci2phy_map_head, list) { - i = -1; - for (bus = 255; bus >= 0; bus--) { - if (map->pbus_to_physid[bus] >= 0) - i = map->pbus_to_physid[bus]; - else - map->pbus_to_physid[bus] = i; - } - } - raw_spin_unlock(&pci2phy_map_lock); - } - - pci_dev_put(ubox_dev); - - return err ? 
pcibios_err_to_errno(err) : 0; -} - -int snbep_uncore_pci_init(void) -{ - int ret = snbep_pci2phy_map_init(0x3ce0); - if (ret) - return ret; - uncore_pci_uncores = snbep_pci_uncores; - uncore_pci_driver = &snbep_uncore_pci_driver; - return 0; -} -/* end of Sandy Bridge-EP uncore support */ - -/* IvyTown uncore support */ -static void ivbep_uncore_msr_init_box(struct intel_uncore_box *box) -{ - unsigned msr = uncore_msr_box_ctl(box); - if (msr) - wrmsrl(msr, IVBEP_PMON_BOX_CTL_INT); -} - -static void ivbep_uncore_pci_init_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - - pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT); -} - -#define IVBEP_UNCORE_MSR_OPS_COMMON_INIT() \ - .init_box = ivbep_uncore_msr_init_box, \ - .disable_box = snbep_uncore_msr_disable_box, \ - .enable_box = snbep_uncore_msr_enable_box, \ - .disable_event = snbep_uncore_msr_disable_event, \ - .enable_event = snbep_uncore_msr_enable_event, \ - .read_counter = uncore_msr_read_counter - -static struct intel_uncore_ops ivbep_uncore_msr_ops = { - IVBEP_UNCORE_MSR_OPS_COMMON_INIT(), -}; - -static struct intel_uncore_ops ivbep_uncore_pci_ops = { - .init_box = ivbep_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = snbep_uncore_pci_enable_box, - .disable_event = snbep_uncore_pci_disable_event, - .enable_event = snbep_uncore_pci_enable_event, - .read_counter = snbep_uncore_pci_read_counter, -}; - -#define IVBEP_UNCORE_PCI_COMMON_INIT() \ - .perf_ctr = SNBEP_PCI_PMON_CTR0, \ - .event_ctl = SNBEP_PCI_PMON_CTL0, \ - .event_mask = IVBEP_PMON_RAW_EVENT_MASK, \ - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ - .ops = &ivbep_uncore_pci_ops, \ - .format_group = &ivbep_uncore_format_group - -static struct attribute *ivbep_uncore_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - NULL, -}; - -static struct attribute *ivbep_uncore_ubox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_thresh5.attr, - NULL, -}; - -static struct attribute *ivbep_uncore_cbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_thresh8.attr, - &format_attr_filter_tid.attr, - &format_attr_filter_link.attr, - &format_attr_filter_state2.attr, - &format_attr_filter_nid2.attr, - &format_attr_filter_opc2.attr, - &format_attr_filter_nc.attr, - &format_attr_filter_c6.attr, - &format_attr_filter_isoc.attr, - NULL, -}; - -static struct attribute *ivbep_uncore_pcu_formats_attr[] = { - &format_attr_event_ext.attr, - &format_attr_occ_sel.attr, - &format_attr_edge.attr, - &format_attr_thresh5.attr, - &format_attr_occ_invert.attr, - &format_attr_occ_edge.attr, - &format_attr_filter_band0.attr, - &format_attr_filter_band1.attr, - &format_attr_filter_band2.attr, - &format_attr_filter_band3.attr, - NULL, -}; - -static struct attribute *ivbep_uncore_qpi_formats_attr[] = { - &format_attr_event_ext.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_thresh8.attr, - &format_attr_match_rds.attr, - &format_attr_match_rnid30.attr, - &format_attr_match_rnid4.attr, - &format_attr_match_dnid.attr, - &format_attr_match_mc.attr, - &format_attr_match_opc.attr, - &format_attr_match_vnw.attr, - &format_attr_match0.attr, - &format_attr_match1.attr, - &format_attr_mask_rds.attr, - &format_attr_mask_rnid30.attr, - 
&format_attr_mask_rnid4.attr, - &format_attr_mask_dnid.attr, - &format_attr_mask_mc.attr, - &format_attr_mask_opc.attr, - &format_attr_mask_vnw.attr, - &format_attr_mask0.attr, - &format_attr_mask1.attr, - NULL, -}; - -static struct attribute_group ivbep_uncore_format_group = { - .name = "format", - .attrs = ivbep_uncore_formats_attr, -}; - -static struct attribute_group ivbep_uncore_ubox_format_group = { - .name = "format", - .attrs = ivbep_uncore_ubox_formats_attr, -}; - -static struct attribute_group ivbep_uncore_cbox_format_group = { - .name = "format", - .attrs = ivbep_uncore_cbox_formats_attr, -}; - -static struct attribute_group ivbep_uncore_pcu_format_group = { - .name = "format", - .attrs = ivbep_uncore_pcu_formats_attr, -}; - -static struct attribute_group ivbep_uncore_qpi_format_group = { - .name = "format", - .attrs = ivbep_uncore_qpi_formats_attr, -}; - -static struct intel_uncore_type ivbep_uncore_ubox = { - .name = "ubox", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 44, - .fixed_ctr_bits = 48, - .perf_ctr = SNBEP_U_MSR_PMON_CTR0, - .event_ctl = SNBEP_U_MSR_PMON_CTL0, - .event_mask = IVBEP_U_MSR_PMON_RAW_EVENT_MASK, - .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, - .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, - .ops = &ivbep_uncore_msr_ops, - .format_group = &ivbep_uncore_ubox_format_group, -}; - -static struct extra_reg ivbep_uncore_cbox_extra_regs[] = { - SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, - SNBEP_CBO_PMON_CTL_TID_EN, 0x1), - SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8), - EVENT_EXTRA_END -}; - -static u64 ivbep_cbox_filter_mask(int fields) -{ - u64 mask = 0; - - if (fields & 0x1) - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_TID; - if (fields & 0x2) - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK; - if (fields & 0x4) - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE; - 
if (fields & 0x8) - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NID; - if (fields & 0x10) { - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC; - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_NC; - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_C6; - mask |= IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC; - } - - return mask; -} - -static struct event_constraint * -ivbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - return __snbep_cbox_get_constraint(box, event, ivbep_cbox_filter_mask); -} - -static int ivbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct extra_reg *er; - int idx = 0; - - for (er = ivbep_uncore_cbox_extra_regs; er->msr; er++) { - if (er->event != (event->hw.config & er->config_mask)) - continue; - idx |= er->idx; - } - - if (idx) { - reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + - SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; - reg1->config = event->attr.config1 & ivbep_cbox_filter_mask(idx); - reg1->idx = idx; - } - return 0; -} - -static void ivbep_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - - if (reg1->idx != EXTRA_REG_NONE) { - u64 filter = uncore_shared_reg_config(box, 0); - wrmsrl(reg1->reg, filter & 0xffffffff); - wrmsrl(reg1->reg + 6, filter >> 32); - } - - wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); -} - -static struct intel_uncore_ops ivbep_uncore_cbox_ops = { - .init_box = ivbep_uncore_msr_init_box, - .disable_box = snbep_uncore_msr_disable_box, - .enable_box = snbep_uncore_msr_enable_box, - .disable_event = snbep_uncore_msr_disable_event, - .enable_event = ivbep_cbox_enable_event, - .read_counter = uncore_msr_read_counter, - .hw_config = ivbep_cbox_hw_config, - .get_constraint = ivbep_cbox_get_constraint, - .put_constraint = snbep_cbox_put_constraint, -}; - -static struct intel_uncore_type ivbep_uncore_cbox = { - .name = "cbox", - .num_counters = 4, - .num_boxes = 15, - .perf_ctr_bits = 44, - .event_ctl = SNBEP_C0_MSR_PMON_CTL0, - .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, - .event_mask = IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, - .msr_offset = SNBEP_CBO_MSR_OFFSET, - .num_shared_regs = 1, - .constraints = snbep_uncore_cbox_constraints, - .ops = &ivbep_uncore_cbox_ops, - .format_group = &ivbep_uncore_cbox_format_group, -}; - -static struct intel_uncore_ops ivbep_uncore_pcu_ops = { - IVBEP_UNCORE_MSR_OPS_COMMON_INIT(), - .hw_config = snbep_pcu_hw_config, - .get_constraint = snbep_pcu_get_constraint, - .put_constraint = snbep_pcu_put_constraint, -}; - -static struct intel_uncore_type ivbep_uncore_pcu = { - .name = "pcu", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, - .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, - .event_mask = IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &ivbep_uncore_pcu_ops, - .format_group = &ivbep_uncore_pcu_format_group, -}; - -static struct intel_uncore_type *ivbep_msr_uncores[] = { - &ivbep_uncore_ubox, - &ivbep_uncore_cbox, - &ivbep_uncore_pcu, - NULL, -}; - -void ivbep_uncore_cpu_init(void) -{ - if (ivbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - ivbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; - uncore_msr_uncores = ivbep_msr_uncores; -} - -static struct intel_uncore_type ivbep_uncore_ha = { - .name = "ha", - .num_counters = 4, - .num_boxes = 2, - 
.perf_ctr_bits = 48, - IVBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type ivbep_uncore_imc = { - .name = "imc", - .num_counters = 4, - .num_boxes = 8, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, - .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, - .event_descs = snbep_uncore_imc_events, - IVBEP_UNCORE_PCI_COMMON_INIT(), -}; - -/* registers in IRP boxes are not properly aligned */ -static unsigned ivbep_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4}; -static unsigned ivbep_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0}; - -static void ivbep_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - - pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], - hwc->config | SNBEP_PMON_CTL_EN); -} - -static void ivbep_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - - pci_write_config_dword(pdev, ivbep_uncore_irp_ctls[hwc->idx], hwc->config); -} - -static u64 ivbep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - u64 count = 0; - - pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx], (u32 *)&count); - pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); - - return count; -} - -static struct intel_uncore_ops ivbep_uncore_irp_ops = { - .init_box = ivbep_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = snbep_uncore_pci_enable_box, - .disable_event = ivbep_uncore_irp_disable_event, - .enable_event = ivbep_uncore_irp_enable_event, - .read_counter = ivbep_uncore_irp_read_counter, -}; - -static struct intel_uncore_type ivbep_uncore_irp = { - .name = "irp", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .event_mask = IVBEP_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .ops = &ivbep_uncore_irp_ops, - .format_group = &ivbep_uncore_format_group, -}; - -static struct intel_uncore_ops ivbep_uncore_qpi_ops = { - .init_box = ivbep_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = snbep_uncore_pci_enable_box, - .disable_event = snbep_uncore_pci_disable_event, - .enable_event = snbep_qpi_enable_event, - .read_counter = snbep_uncore_pci_read_counter, - .hw_config = snbep_qpi_hw_config, - .get_constraint = uncore_get_constraint, - .put_constraint = uncore_put_constraint, -}; - -static struct intel_uncore_type ivbep_uncore_qpi = { - .name = "qpi", - .num_counters = 4, - .num_boxes = 3, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &ivbep_uncore_qpi_ops, - .format_group = &ivbep_uncore_qpi_format_group, -}; - -static struct intel_uncore_type ivbep_uncore_r2pcie = { - .name = "r2pcie", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 44, - .constraints = snbep_uncore_r2pcie_constraints, - IVBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type ivbep_uncore_r3qpi = { - .name = "r3qpi", - .num_counters = 3, - .num_boxes = 2, - .perf_ctr_bits = 44, - .constraints = snbep_uncore_r3qpi_constraints, - IVBEP_UNCORE_PCI_COMMON_INIT(), -}; - -enum { - IVBEP_PCI_UNCORE_HA, - 
IVBEP_PCI_UNCORE_IMC, - IVBEP_PCI_UNCORE_IRP, - IVBEP_PCI_UNCORE_QPI, - IVBEP_PCI_UNCORE_R2PCIE, - IVBEP_PCI_UNCORE_R3QPI, -}; - -static struct intel_uncore_type *ivbep_pci_uncores[] = { - [IVBEP_PCI_UNCORE_HA] = &ivbep_uncore_ha, - [IVBEP_PCI_UNCORE_IMC] = &ivbep_uncore_imc, - [IVBEP_PCI_UNCORE_IRP] = &ivbep_uncore_irp, - [IVBEP_PCI_UNCORE_QPI] = &ivbep_uncore_qpi, - [IVBEP_PCI_UNCORE_R2PCIE] = &ivbep_uncore_r2pcie, - [IVBEP_PCI_UNCORE_R3QPI] = &ivbep_uncore_r3qpi, - NULL, -}; - -static const struct pci_device_id ivbep_uncore_pci_ids[] = { - { /* Home Agent 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 0), - }, - { /* Home Agent 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 1), - }, - { /* MC0 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 0), - }, - { /* MC0 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 1), - }, - { /* MC0 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 2), - }, - { /* MC0 Channel 4 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 3), - }, - { /* MC1 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 4), - }, - { /* MC1 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 5), - }, - { /* MC1 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 6), - }, - { /* MC1 Channel 4 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 7), - }, - { /* IRP */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IRP, 0), - }, - { /* QPI0 Port 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 0), - }, - { /* QPI0 Port 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 1), - }, - { /* QPI1 Port 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 2), - }, - { /* R2PCIe */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R2PCIE, 0), - }, - { /* R3QPI0 Link 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 0), - }, - { /* R3QPI0 Link 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 1), - }, - { /* R3QPI1 Link 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e), - .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 2), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT0_FILTER), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT1_FILTER), - }, - { /* end: all zeroes */ } -}; - -static struct pci_driver ivbep_uncore_pci_driver = { - .name = "ivbep_uncore", - .id_table = ivbep_uncore_pci_ids, -}; - -int ivbep_uncore_pci_init(void) -{ - int ret = snbep_pci2phy_map_init(0x0e1e); - if (ret) - return ret; - uncore_pci_uncores = 
ivbep_pci_uncores; - uncore_pci_driver = &ivbep_uncore_pci_driver; - return 0; -} -/* end of IvyTown uncore support */ - -/* KNL uncore support */ -static struct attribute *knl_uncore_ubox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_inv.attr, - &format_attr_thresh5.attr, - NULL, -}; - -static struct attribute_group knl_uncore_ubox_format_group = { - .name = "format", - .attrs = knl_uncore_ubox_formats_attr, -}; - -static struct intel_uncore_type knl_uncore_ubox = { - .name = "ubox", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .perf_ctr = HSWEP_U_MSR_PMON_CTR0, - .event_ctl = HSWEP_U_MSR_PMON_CTL0, - .event_mask = KNL_U_MSR_PMON_RAW_EVENT_MASK, - .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, - .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, - .ops = &snbep_uncore_msr_ops, - .format_group = &knl_uncore_ubox_format_group, -}; - -static struct attribute *knl_uncore_cha_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_qor.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - &format_attr_filter_tid4.attr, - &format_attr_filter_link3.attr, - &format_attr_filter_state4.attr, - &format_attr_filter_local.attr, - &format_attr_filter_all_op.attr, - &format_attr_filter_nnm.attr, - &format_attr_filter_opc3.attr, - &format_attr_filter_nc.attr, - &format_attr_filter_isoc.attr, - NULL, -}; - -static struct attribute_group knl_uncore_cha_format_group = { - .name = "format", - .attrs = knl_uncore_cha_formats_attr, -}; - -static struct event_constraint knl_uncore_cha_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x11, 0x1), - UNCORE_EVENT_CONSTRAINT(0x1f, 0x1), - UNCORE_EVENT_CONSTRAINT(0x36, 0x1), - EVENT_CONSTRAINT_END -}; - -static struct extra_reg knl_uncore_cha_extra_regs[] = { - SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, - SNBEP_CBO_PMON_CTL_TID_EN, 0x1), - SNBEP_CBO_EVENT_EXTRA_REG(0x3d, 0xff, 0x2), - SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x4), - EVENT_EXTRA_END -}; - -static u64 knl_cha_filter_mask(int fields) -{ - u64 mask = 0; - - if (fields & 0x1) - mask |= KNL_CHA_MSR_PMON_BOX_FILTER_TID; - if (fields & 0x2) - mask |= KNL_CHA_MSR_PMON_BOX_FILTER_STATE; - if (fields & 0x4) - mask |= KNL_CHA_MSR_PMON_BOX_FILTER_OP; - return mask; -} - -static struct event_constraint * -knl_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - return __snbep_cbox_get_constraint(box, event, knl_cha_filter_mask); -} - -static int knl_cha_hw_config(struct intel_uncore_box *box, - struct perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct extra_reg *er; - int idx = 0; - - for (er = knl_uncore_cha_extra_regs; er->msr; er++) { - if (er->event != (event->hw.config & er->config_mask)) - continue; - idx |= er->idx; - } - - if (idx) { - reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 + - KNL_CHA_MSR_OFFSET * box->pmu->pmu_idx; - reg1->config = event->attr.config1 & knl_cha_filter_mask(idx); - reg1->idx = idx; - } - return 0; -} - -static void hswep_cbox_enable_event(struct intel_uncore_box *box, - struct perf_event *event); - -static struct intel_uncore_ops knl_uncore_cha_ops = { - .init_box = snbep_uncore_msr_init_box, - .disable_box = snbep_uncore_msr_disable_box, - .enable_box = snbep_uncore_msr_enable_box, - .disable_event = snbep_uncore_msr_disable_event, - .enable_event = 
hswep_cbox_enable_event, - .read_counter = uncore_msr_read_counter, - .hw_config = knl_cha_hw_config, - .get_constraint = knl_cha_get_constraint, - .put_constraint = snbep_cbox_put_constraint, -}; - -static struct intel_uncore_type knl_uncore_cha = { - .name = "cha", - .num_counters = 4, - .num_boxes = 38, - .perf_ctr_bits = 48, - .event_ctl = HSWEP_C0_MSR_PMON_CTL0, - .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, - .event_mask = KNL_CHA_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, - .msr_offset = KNL_CHA_MSR_OFFSET, - .num_shared_regs = 1, - .constraints = knl_uncore_cha_constraints, - .ops = &knl_uncore_cha_ops, - .format_group = &knl_uncore_cha_format_group, -}; - -static struct attribute *knl_uncore_pcu_formats_attr[] = { - &format_attr_event2.attr, - &format_attr_use_occ_ctr.attr, - &format_attr_occ_sel.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_inv.attr, - &format_attr_thresh6.attr, - &format_attr_occ_invert.attr, - &format_attr_occ_edge_det.attr, - NULL, -}; - -static struct attribute_group knl_uncore_pcu_format_group = { - .name = "format", - .attrs = knl_uncore_pcu_formats_attr, -}; - -static struct intel_uncore_type knl_uncore_pcu = { - .name = "pcu", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0, - .event_ctl = HSWEP_PCU_MSR_PMON_CTL0, - .event_mask = KNL_PCU_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL, - .ops = &snbep_uncore_msr_ops, - .format_group = &knl_uncore_pcu_format_group, -}; - -static struct intel_uncore_type *knl_msr_uncores[] = { - &knl_uncore_ubox, - &knl_uncore_cha, - &knl_uncore_pcu, - NULL, -}; - -void knl_uncore_cpu_init(void) -{ - uncore_msr_uncores = knl_msr_uncores; -} - -static void knl_uncore_imc_enable_box(struct intel_uncore_box *box) -{ - struct pci_dev *pdev = box->pci_dev; - int box_ctl = uncore_pci_box_ctl(box); - - pci_write_config_dword(pdev, box_ctl, 0); -} - -static void knl_uncore_imc_enable_event(struct intel_uncore_box *box, - struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - - if ((event->attr.config & SNBEP_PMON_CTL_EV_SEL_MASK) - == UNCORE_FIXED_EVENT) - pci_write_config_dword(pdev, hwc->config_base, - hwc->config | KNL_PMON_FIXED_CTL_EN); - else - pci_write_config_dword(pdev, hwc->config_base, - hwc->config | SNBEP_PMON_CTL_EN); -} - -static struct intel_uncore_ops knl_uncore_imc_ops = { - .init_box = snbep_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = knl_uncore_imc_enable_box, - .read_counter = snbep_uncore_pci_read_counter, - .enable_event = knl_uncore_imc_enable_event, - .disable_event = snbep_uncore_pci_disable_event, -}; - -static struct intel_uncore_type knl_uncore_imc_uclk = { - .name = "imc_uclk", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW, - .event_ctl = KNL_UCLK_MSR_PMON_CTL0, - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, - .fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW, - .fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL, - .box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL, - .ops = &knl_uncore_imc_ops, - .format_group = &snbep_uncore_format_group, -}; - -static struct intel_uncore_type knl_uncore_imc_dclk = { - .name = "imc", - .num_counters = 4, - .num_boxes = 6, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .perf_ctr = KNL_MC0_CH0_MSR_PMON_CTR0_LOW, - .event_ctl = KNL_MC0_CH0_MSR_PMON_CTL0, - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, - .fixed_ctr = 
KNL_MC0_CH0_MSR_PMON_FIXED_LOW,
-	.fixed_ctl = KNL_MC0_CH0_MSR_PMON_FIXED_CTL,
-	.box_ctl = KNL_MC0_CH0_MSR_PMON_BOX_CTL,
-	.ops = &knl_uncore_imc_ops,
-	.format_group = &snbep_uncore_format_group,
-};
-
-static struct intel_uncore_type knl_uncore_edc_uclk = {
-	.name = "edc_uclk",
-	.num_counters = 4,
-	.num_boxes = 8,
-	.perf_ctr_bits = 48,
-	.fixed_ctr_bits = 48,
-	.perf_ctr = KNL_UCLK_MSR_PMON_CTR0_LOW,
-	.event_ctl = KNL_UCLK_MSR_PMON_CTL0,
-	.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
-	.fixed_ctr = KNL_UCLK_MSR_PMON_UCLK_FIXED_LOW,
-	.fixed_ctl = KNL_UCLK_MSR_PMON_UCLK_FIXED_CTL,
-	.box_ctl = KNL_UCLK_MSR_PMON_BOX_CTL,
-	.ops = &knl_uncore_imc_ops,
-	.format_group = &snbep_uncore_format_group,
-};
-
-static struct intel_uncore_type knl_uncore_edc_eclk = {
-	.name = "edc_eclk",
-	.num_counters = 4,
-	.num_boxes = 8,
-	.perf_ctr_bits = 48,
-	.fixed_ctr_bits = 48,
-	.perf_ctr = KNL_EDC0_ECLK_MSR_PMON_CTR0_LOW,
-	.event_ctl = KNL_EDC0_ECLK_MSR_PMON_CTL0,
-	.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
-	.fixed_ctr = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_LOW,
-	.fixed_ctl = KNL_EDC0_ECLK_MSR_PMON_ECLK_FIXED_CTL,
-	.box_ctl = KNL_EDC0_ECLK_MSR_PMON_BOX_CTL,
-	.ops = &knl_uncore_imc_ops,
-	.format_group = &snbep_uncore_format_group,
-};
-
-static struct event_constraint knl_uncore_m2pcie_constraints[] = {
-	UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
-	EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type knl_uncore_m2pcie = {
-	.name = "m2pcie",
-	.num_counters = 4,
-	.num_boxes = 1,
-	.perf_ctr_bits = 48,
-	.constraints = knl_uncore_m2pcie_constraints,
-	SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct attribute *knl_uncore_irp_formats_attr[] = {
-	&format_attr_event.attr,
-	&format_attr_umask.attr,
-	&format_attr_qor.attr,
-	&format_attr_edge.attr,
-	&format_attr_inv.attr,
-	&format_attr_thresh8.attr,
-	NULL,
-};
-
-static struct attribute_group knl_uncore_irp_format_group = {
-	.name = "format",
-	.attrs = knl_uncore_irp_formats_attr,
-};
-
-static struct intel_uncore_type knl_uncore_irp = {
-	.name = "irp",
-	.num_counters = 2,
-	.num_boxes = 1,
-	.perf_ctr_bits = 48,
-	.perf_ctr = SNBEP_PCI_PMON_CTR0,
-	.event_ctl = SNBEP_PCI_PMON_CTL0,
-	.event_mask = KNL_IRP_PCI_PMON_RAW_EVENT_MASK,
-	.box_ctl = KNL_IRP_PCI_PMON_BOX_CTL,
-	.ops = &snbep_uncore_pci_ops,
-	.format_group = &knl_uncore_irp_format_group,
-};
-
-enum {
-	KNL_PCI_UNCORE_MC_UCLK,
-	KNL_PCI_UNCORE_MC_DCLK,
-	KNL_PCI_UNCORE_EDC_UCLK,
-	KNL_PCI_UNCORE_EDC_ECLK,
-	KNL_PCI_UNCORE_M2PCIE,
-	KNL_PCI_UNCORE_IRP,
-};
-
-static struct intel_uncore_type *knl_pci_uncores[] = {
-	[KNL_PCI_UNCORE_MC_UCLK] = &knl_uncore_imc_uclk,
-	[KNL_PCI_UNCORE_MC_DCLK] = &knl_uncore_imc_dclk,
-	[KNL_PCI_UNCORE_EDC_UCLK] = &knl_uncore_edc_uclk,
-	[KNL_PCI_UNCORE_EDC_ECLK] = &knl_uncore_edc_eclk,
-	[KNL_PCI_UNCORE_M2PCIE] = &knl_uncore_m2pcie,
-	[KNL_PCI_UNCORE_IRP] = &knl_uncore_irp,
-	NULL,
-};
-
-/*
- * KNL uses a common PCI device ID for multiple instances of an Uncore PMU
- * device type. prior to KNL, each instance of a PMU device type had a unique
- * device ID.
- *
- *	PCI Device ID	Uncore PMU Devices
- *	----------------------------------
- *	0x7841		MC0 UClk, MC1 UClk
- *	0x7843		MC0 DClk CH 0, MC0 DClk CH 1, MC0 DClk CH 2,
- *			MC1 DClk CH 0, MC1 DClk CH 1, MC1 DClk CH 2
- *	0x7833		EDC0 UClk, EDC1 UClk, EDC2 UClk, EDC3 UClk,
- *			EDC4 UClk, EDC5 UClk, EDC6 UClk, EDC7 UClk
- *	0x7835		EDC0 EClk, EDC1 EClk, EDC2 EClk, EDC3 EClk,
- *			EDC4 EClk, EDC5 EClk, EDC6 EClk, EDC7 EClk
- *	0x7817		M2PCIe
- *	0x7814		IRP
-*/
-
-static const struct pci_device_id knl_uncore_pci_ids[] = {
-	{ /* MC UClk */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7841),
-		.driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_UCLK, 0),
-	},
-	{ /* MC DClk Channel */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7843),
-		.driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_MC_DCLK, 0),
-	},
-	{ /* EDC UClk */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7833),
-		.driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_UCLK, 0),
-	},
-	{ /* EDC EClk */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7835),
-		.driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_EDC_ECLK, 0),
-	},
-	{ /* M2PCIe */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7817),
-		.driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_M2PCIE, 0),
-	},
-	{ /* IRP */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7814),
-		.driver_data = UNCORE_PCI_DEV_DATA(KNL_PCI_UNCORE_IRP, 0),
-	},
-	{ /* end: all zeroes */ }
-};
-
-static struct pci_driver knl_uncore_pci_driver = {
-	.name = "knl_uncore",
-	.id_table = knl_uncore_pci_ids,
-};
-
-int knl_uncore_pci_init(void)
-{
-	int ret;
-
-	/* All KNL PCI based PMON units are on the same PCI bus except IRP */
-	ret = snb_pci2phy_map_init(0x7814); /* IRP */
-	if (ret)
-		return ret;
-	ret = snb_pci2phy_map_init(0x7817); /* M2PCIe */
-	if (ret)
-		return ret;
-	uncore_pci_uncores = knl_pci_uncores;
-	uncore_pci_driver = &knl_uncore_pci_driver;
-	return 0;
-}
-
-/* end of KNL uncore support */
-
-/* Haswell-EP uncore support */
-static struct attribute *hswep_uncore_ubox_formats_attr[] = {
-	&format_attr_event.attr,
-	&format_attr_umask.attr,
-	&format_attr_edge.attr,
-	&format_attr_inv.attr,
-	&format_attr_thresh5.attr,
-	&format_attr_filter_tid2.attr,
-	&format_attr_filter_cid.attr,
-	NULL,
-};
-
-static struct attribute_group hswep_uncore_ubox_format_group = {
-	.name = "format",
-	.attrs = hswep_uncore_ubox_formats_attr,
-};
-
-static int hswep_ubox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
-	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
-	reg1->reg = HSWEP_U_MSR_PMON_FILTER;
-	reg1->config = event->attr.config1 & HSWEP_U_MSR_PMON_BOX_FILTER_MASK;
-	reg1->idx = 0;
-	return 0;
-}
-
-static struct intel_uncore_ops hswep_uncore_ubox_ops = {
-	SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-	.hw_config = hswep_ubox_hw_config,
-	.get_constraint = uncore_get_constraint,
-	.put_constraint = uncore_put_constraint,
-};
-
-static struct intel_uncore_type hswep_uncore_ubox = {
-	.name = "ubox",
-	.num_counters = 2,
-	.num_boxes = 1,
-	.perf_ctr_bits = 44,
-	.fixed_ctr_bits = 48,
-	.perf_ctr = HSWEP_U_MSR_PMON_CTR0,
-	.event_ctl = HSWEP_U_MSR_PMON_CTL0,
-	.event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
-	.fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
-	.fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
-	.num_shared_regs = 1,
-	.ops = &hswep_uncore_ubox_ops,
-	.format_group = &hswep_uncore_ubox_format_group,
-};
-
-static struct attribute *hswep_uncore_cbox_formats_attr[] = {
-	&format_attr_event.attr,
-	&format_attr_umask.attr,
-	&format_attr_edge.attr,
-	&format_attr_tid_en.attr,
-	&format_attr_thresh8.attr,
-	&format_attr_filter_tid3.attr,
&format_attr_filter_link2.attr, - &format_attr_filter_state3.attr, - &format_attr_filter_nid2.attr, - &format_attr_filter_opc2.attr, - &format_attr_filter_nc.attr, - &format_attr_filter_c6.attr, - &format_attr_filter_isoc.attr, - NULL, -}; - -static struct attribute_group hswep_uncore_cbox_format_group = { - .name = "format", - .attrs = hswep_uncore_cbox_formats_attr, -}; - -static struct event_constraint hswep_uncore_cbox_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x01, 0x1), - UNCORE_EVENT_CONSTRAINT(0x09, 0x1), - UNCORE_EVENT_CONSTRAINT(0x11, 0x1), - UNCORE_EVENT_CONSTRAINT(0x36, 0x1), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), - UNCORE_EVENT_CONSTRAINT(0x3e, 0x1), - EVENT_CONSTRAINT_END -}; - -static struct extra_reg hswep_uncore_cbox_extra_regs[] = { - SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, - SNBEP_CBO_PMON_CTL_TID_EN, 0x1), - SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x4), - SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4028, 0x40ff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4032, 0x40ff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4029, 0x40ff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4033, 0x40ff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x402A, 0x40ff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x12), - SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18), - SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8), - EVENT_EXTRA_END -}; - -static u64 hswep_cbox_filter_mask(int fields) -{ - u64 mask = 0; - if (fields & 0x1) - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_TID; - if (fields & 0x2) - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK; - if (fields & 0x4) - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE; - if (fields & 0x8) - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NID; - if (fields & 0x10) { - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC; - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_NC; - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_C6; - mask |= HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC; - } - return mask; -} - -static struct event_constraint * -hswep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) -{ - return __snbep_cbox_get_constraint(box, event, hswep_cbox_filter_mask); -} - -static int hswep_cbox_hw_config(struct intel_uncore_box *box, struct 
perf_event *event) -{ - struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; - struct extra_reg *er; - int idx = 0; - - for (er = hswep_uncore_cbox_extra_regs; er->msr; er++) { - if (er->event != (event->hw.config & er->config_mask)) - continue; - idx |= er->idx; - } - - if (idx) { - reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 + - HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; - reg1->config = event->attr.config1 & hswep_cbox_filter_mask(idx); - reg1->idx = idx; - } - return 0; -} - -static void hswep_cbox_enable_event(struct intel_uncore_box *box, - struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - - if (reg1->idx != EXTRA_REG_NONE) { - u64 filter = uncore_shared_reg_config(box, 0); - wrmsrl(reg1->reg, filter & 0xffffffff); - wrmsrl(reg1->reg + 1, filter >> 32); - } - - wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); -} - -static struct intel_uncore_ops hswep_uncore_cbox_ops = { - .init_box = snbep_uncore_msr_init_box, - .disable_box = snbep_uncore_msr_disable_box, - .enable_box = snbep_uncore_msr_enable_box, - .disable_event = snbep_uncore_msr_disable_event, - .enable_event = hswep_cbox_enable_event, - .read_counter = uncore_msr_read_counter, - .hw_config = hswep_cbox_hw_config, - .get_constraint = hswep_cbox_get_constraint, - .put_constraint = snbep_cbox_put_constraint, -}; - -static struct intel_uncore_type hswep_uncore_cbox = { - .name = "cbox", - .num_counters = 4, - .num_boxes = 18, - .perf_ctr_bits = 48, - .event_ctl = HSWEP_C0_MSR_PMON_CTL0, - .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, - .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, - .msr_offset = HSWEP_CBO_MSR_OFFSET, - .num_shared_regs = 1, - .constraints = hswep_uncore_cbox_constraints, - .ops = &hswep_uncore_cbox_ops, - .format_group = &hswep_uncore_cbox_format_group, -}; - -/* - * Write SBOX Initialization register bit by bit to avoid spurious #GPs - */ -static void hswep_uncore_sbox_msr_init_box(struct intel_uncore_box *box) -{ - unsigned msr = uncore_msr_box_ctl(box); - - if (msr) { - u64 init = SNBEP_PMON_BOX_CTL_INT; - u64 flags = 0; - int i; - - for_each_set_bit(i, (unsigned long *)&init, 64) { - flags |= (1ULL << i); - wrmsrl(msr, flags); - } - } -} - -static struct intel_uncore_ops hswep_uncore_sbox_msr_ops = { - __SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), - .init_box = hswep_uncore_sbox_msr_init_box -}; - -static struct attribute *hswep_uncore_sbox_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_tid_en.attr, - &format_attr_inv.attr, - &format_attr_thresh8.attr, - NULL, -}; - -static struct attribute_group hswep_uncore_sbox_format_group = { - .name = "format", - .attrs = hswep_uncore_sbox_formats_attr, -}; - -static struct intel_uncore_type hswep_uncore_sbox = { - .name = "sbox", - .num_counters = 4, - .num_boxes = 4, - .perf_ctr_bits = 44, - .event_ctl = HSWEP_S0_MSR_PMON_CTL0, - .perf_ctr = HSWEP_S0_MSR_PMON_CTR0, - .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL, - .msr_offset = HSWEP_SBOX_MSR_OFFSET, - .ops = &hswep_uncore_sbox_msr_ops, - .format_group = &hswep_uncore_sbox_format_group, -}; - -static int hswep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK; - - if (ev_sel >= 0xb && ev_sel <= 0xe) { - reg1->reg = 
HSWEP_PCU_MSR_PMON_BOX_FILTER; - reg1->idx = ev_sel - 0xb; - reg1->config = event->attr.config1 & (0xff << reg1->idx); - } - return 0; -} - -static struct intel_uncore_ops hswep_uncore_pcu_ops = { - SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), - .hw_config = hswep_pcu_hw_config, - .get_constraint = snbep_pcu_get_constraint, - .put_constraint = snbep_pcu_put_constraint, -}; - -static struct intel_uncore_type hswep_uncore_pcu = { - .name = "pcu", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0, - .event_ctl = HSWEP_PCU_MSR_PMON_CTL0, - .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &hswep_uncore_pcu_ops, - .format_group = &snbep_uncore_pcu_format_group, -}; - -static struct intel_uncore_type *hswep_msr_uncores[] = { - &hswep_uncore_ubox, - &hswep_uncore_cbox, - &hswep_uncore_sbox, - &hswep_uncore_pcu, - NULL, -}; - -void hswep_uncore_cpu_init(void) -{ - if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; - - /* Detect 6-8 core systems with only two SBOXes */ - if (uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3]) { - u32 capid4; - - pci_read_config_dword(uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3], - 0x94, &capid4); - if (((capid4 >> 6) & 0x3) == 0) - hswep_uncore_sbox.num_boxes = 2; - } - - uncore_msr_uncores = hswep_msr_uncores; -} - -static struct intel_uncore_type hswep_uncore_ha = { - .name = "ha", - .num_counters = 5, - .num_boxes = 2, - .perf_ctr_bits = 48, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct uncore_event_desc hswep_uncore_imc_events[] = { - INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x00,umask=0x00"), - INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), - INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), - INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), - INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), - INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), - { /* end: all zeroes */ }, -}; - -static struct intel_uncore_type hswep_uncore_imc = { - .name = "imc", - .num_counters = 5, - .num_boxes = 8, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, - .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, - .event_descs = hswep_uncore_imc_events, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static unsigned hswep_uncore_irp_ctrs[] = {0xa0, 0xa8, 0xb0, 0xb8}; - -static u64 hswep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - struct pci_dev *pdev = box->pci_dev; - struct hw_perf_event *hwc = &event->hw; - u64 count = 0; - - pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx], (u32 *)&count); - pci_read_config_dword(pdev, hswep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); - - return count; -} - -static struct intel_uncore_ops hswep_uncore_irp_ops = { - .init_box = snbep_uncore_pci_init_box, - .disable_box = snbep_uncore_pci_disable_box, - .enable_box = snbep_uncore_pci_enable_box, - .disable_event = ivbep_uncore_irp_disable_event, - .enable_event = ivbep_uncore_irp_enable_event, - .read_counter = hswep_uncore_irp_read_counter, -}; - -static struct intel_uncore_type hswep_uncore_irp = { - .name = "irp", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .ops = &hswep_uncore_irp_ops, - .format_group = 
&snbep_uncore_format_group, -}; - -static struct intel_uncore_type hswep_uncore_qpi = { - .name = "qpi", - .num_counters = 5, - .num_boxes = 3, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &snbep_uncore_qpi_ops, - .format_group = &snbep_uncore_qpi_format_group, -}; - -static struct event_constraint hswep_uncore_r2pcie_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x23, 0x1), - UNCORE_EVENT_CONSTRAINT(0x24, 0x1), - UNCORE_EVENT_CONSTRAINT(0x25, 0x1), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x27, 0x1), - UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x29, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2a, 0x1), - UNCORE_EVENT_CONSTRAINT(0x2b, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x35, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type hswep_uncore_r2pcie = { - .name = "r2pcie", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .constraints = hswep_uncore_r2pcie_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct event_constraint hswep_uncore_r3qpi_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x01, 0x3), - UNCORE_EVENT_CONSTRAINT(0x07, 0x7), - UNCORE_EVENT_CONSTRAINT(0x08, 0x7), - UNCORE_EVENT_CONSTRAINT(0x09, 0x7), - UNCORE_EVENT_CONSTRAINT(0x0a, 0x7), - UNCORE_EVENT_CONSTRAINT(0x0e, 0x7), - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x12, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x14, 0x3), - UNCORE_EVENT_CONSTRAINT(0x15, 0x3), - UNCORE_EVENT_CONSTRAINT(0x1f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x20, 0x3), - UNCORE_EVENT_CONSTRAINT(0x21, 0x3), - UNCORE_EVENT_CONSTRAINT(0x22, 0x3), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x25, 0x3), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x29, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x31, 0x3), - UNCORE_EVENT_CONSTRAINT(0x32, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x36, 0x3), - UNCORE_EVENT_CONSTRAINT(0x37, 0x3), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x39, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type hswep_uncore_r3qpi = { - .name = "r3qpi", - .num_counters = 4, - .num_boxes = 3, - .perf_ctr_bits = 44, - .constraints = hswep_uncore_r3qpi_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -enum { - HSWEP_PCI_UNCORE_HA, - HSWEP_PCI_UNCORE_IMC, - HSWEP_PCI_UNCORE_IRP, - HSWEP_PCI_UNCORE_QPI, - HSWEP_PCI_UNCORE_R2PCIE, - HSWEP_PCI_UNCORE_R3QPI, -}; - -static struct intel_uncore_type *hswep_pci_uncores[] = { - [HSWEP_PCI_UNCORE_HA] = &hswep_uncore_ha, - [HSWEP_PCI_UNCORE_IMC] = &hswep_uncore_imc, - [HSWEP_PCI_UNCORE_IRP] = &hswep_uncore_irp, - [HSWEP_PCI_UNCORE_QPI] = &hswep_uncore_qpi, - [HSWEP_PCI_UNCORE_R2PCIE] = &hswep_uncore_r2pcie, - [HSWEP_PCI_UNCORE_R3QPI] = &hswep_uncore_r3qpi, - NULL, -}; - -static 
const struct pci_device_id hswep_uncore_pci_ids[] = { - { /* Home Agent 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0), - }, - { /* Home Agent 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f38), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 1), - }, - { /* MC0 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb0), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 0), - }, - { /* MC0 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb1), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 1), - }, - { /* MC0 Channel 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb4), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 2), - }, - { /* MC0 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb5), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 3), - }, - { /* MC1 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd0), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 4), - }, - { /* MC1 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd1), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 5), - }, - { /* MC1 Channel 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd4), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 6), - }, - { /* MC1 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd5), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 7), - }, - { /* IRP */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f39), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IRP, 0), - }, - { /* QPI0 Port 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f32), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 0), - }, - { /* QPI0 Port 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f33), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 1), - }, - { /* QPI1 Port 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3a), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 2), - }, - { /* R2PCIe */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f34), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R2PCIE, 0), - }, - { /* R3QPI0 Link 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f36), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 0), - }, - { /* R3QPI0 Link 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f37), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 1), - }, - { /* R3QPI1 Link 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3e), - .driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 2), - }, - { /* QPI Port 0 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f86), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT0_FILTER), - }, - { /* QPI Port 1 filter */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f96), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - SNBEP_PCI_QPI_PORT1_FILTER), - }, - { /* PCU.3 (for Capability registers) */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - HSWEP_PCI_PCU_3), - }, - { /* end: all zeroes */ } -}; - -static struct pci_driver hswep_uncore_pci_driver = { - .name = "hswep_uncore", - .id_table = hswep_uncore_pci_ids, -}; - -int hswep_uncore_pci_init(void) -{ - int ret = snbep_pci2phy_map_init(0x2f1e); - if (ret) - return ret; - uncore_pci_uncores = hswep_pci_uncores; - uncore_pci_driver = &hswep_uncore_pci_driver; - return 0; -} -/* end of Haswell-EP uncore support */ - -/* BDX uncore support */ - -static struct intel_uncore_type bdx_uncore_ubox = { - .name = "ubox", - .num_counters = 2, - .num_boxes = 1, - .perf_ctr_bits = 48, - .fixed_ctr_bits 
= 48, - .perf_ctr = HSWEP_U_MSR_PMON_CTR0, - .event_ctl = HSWEP_U_MSR_PMON_CTL0, - .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, - .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, - .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, - .num_shared_regs = 1, - .ops = &ivbep_uncore_msr_ops, - .format_group = &ivbep_uncore_ubox_format_group, -}; - -static struct event_constraint bdx_uncore_cbox_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x09, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x1), - UNCORE_EVENT_CONSTRAINT(0x36, 0x1), - UNCORE_EVENT_CONSTRAINT(0x3e, 0x1), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type bdx_uncore_cbox = { - .name = "cbox", - .num_counters = 4, - .num_boxes = 24, - .perf_ctr_bits = 48, - .event_ctl = HSWEP_C0_MSR_PMON_CTL0, - .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, - .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, - .msr_offset = HSWEP_CBO_MSR_OFFSET, - .num_shared_regs = 1, - .constraints = bdx_uncore_cbox_constraints, - .ops = &hswep_uncore_cbox_ops, - .format_group = &hswep_uncore_cbox_format_group, -}; - -static struct intel_uncore_type bdx_uncore_sbox = { - .name = "sbox", - .num_counters = 4, - .num_boxes = 4, - .perf_ctr_bits = 48, - .event_ctl = HSWEP_S0_MSR_PMON_CTL0, - .perf_ctr = HSWEP_S0_MSR_PMON_CTR0, - .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, - .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL, - .msr_offset = HSWEP_SBOX_MSR_OFFSET, - .ops = &hswep_uncore_sbox_msr_ops, - .format_group = &hswep_uncore_sbox_format_group, -}; - -static struct intel_uncore_type *bdx_msr_uncores[] = { - &bdx_uncore_ubox, - &bdx_uncore_cbox, - &bdx_uncore_sbox, - &hswep_uncore_pcu, - NULL, -}; - -void bdx_uncore_cpu_init(void) -{ - if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) - bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; - uncore_msr_uncores = bdx_msr_uncores; -} - -static struct intel_uncore_type bdx_uncore_ha = { - .name = "ha", - .num_counters = 4, - .num_boxes = 2, - .perf_ctr_bits = 48, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type bdx_uncore_imc = { - .name = "imc", - .num_counters = 5, - .num_boxes = 8, - .perf_ctr_bits = 48, - .fixed_ctr_bits = 48, - .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, - .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, - .event_descs = hswep_uncore_imc_events, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct intel_uncore_type bdx_uncore_irp = { - .name = "irp", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .event_mask = SNBEP_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .ops = &hswep_uncore_irp_ops, - .format_group = &snbep_uncore_format_group, -}; - -static struct intel_uncore_type bdx_uncore_qpi = { - .name = "qpi", - .num_counters = 4, - .num_boxes = 3, - .perf_ctr_bits = 48, - .perf_ctr = SNBEP_PCI_PMON_CTR0, - .event_ctl = SNBEP_PCI_PMON_CTL0, - .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, - .box_ctl = SNBEP_PCI_PMON_BOX_CTL, - .num_shared_regs = 1, - .ops = &snbep_uncore_qpi_ops, - .format_group = &snbep_uncore_qpi_format_group, -}; - -static struct event_constraint bdx_uncore_r2pcie_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x23, 0x1), - UNCORE_EVENT_CONSTRAINT(0x25, 0x1), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type 
bdx_uncore_r2pcie = { - .name = "r2pcie", - .num_counters = 4, - .num_boxes = 1, - .perf_ctr_bits = 48, - .constraints = bdx_uncore_r2pcie_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -static struct event_constraint bdx_uncore_r3qpi_constraints[] = { - UNCORE_EVENT_CONSTRAINT(0x01, 0x7), - UNCORE_EVENT_CONSTRAINT(0x07, 0x7), - UNCORE_EVENT_CONSTRAINT(0x08, 0x7), - UNCORE_EVENT_CONSTRAINT(0x09, 0x7), - UNCORE_EVENT_CONSTRAINT(0x0a, 0x7), - UNCORE_EVENT_CONSTRAINT(0x0e, 0x7), - UNCORE_EVENT_CONSTRAINT(0x10, 0x3), - UNCORE_EVENT_CONSTRAINT(0x11, 0x3), - UNCORE_EVENT_CONSTRAINT(0x13, 0x1), - UNCORE_EVENT_CONSTRAINT(0x14, 0x3), - UNCORE_EVENT_CONSTRAINT(0x15, 0x3), - UNCORE_EVENT_CONSTRAINT(0x1f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x20, 0x3), - UNCORE_EVENT_CONSTRAINT(0x21, 0x3), - UNCORE_EVENT_CONSTRAINT(0x22, 0x3), - UNCORE_EVENT_CONSTRAINT(0x23, 0x3), - UNCORE_EVENT_CONSTRAINT(0x25, 0x3), - UNCORE_EVENT_CONSTRAINT(0x26, 0x3), - UNCORE_EVENT_CONSTRAINT(0x28, 0x3), - UNCORE_EVENT_CONSTRAINT(0x29, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), - UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), - UNCORE_EVENT_CONSTRAINT(0x33, 0x3), - UNCORE_EVENT_CONSTRAINT(0x34, 0x3), - UNCORE_EVENT_CONSTRAINT(0x36, 0x3), - UNCORE_EVENT_CONSTRAINT(0x37, 0x3), - UNCORE_EVENT_CONSTRAINT(0x38, 0x3), - UNCORE_EVENT_CONSTRAINT(0x39, 0x3), - EVENT_CONSTRAINT_END -}; - -static struct intel_uncore_type bdx_uncore_r3qpi = { - .name = "r3qpi", - .num_counters = 3, - .num_boxes = 3, - .perf_ctr_bits = 48, - .constraints = bdx_uncore_r3qpi_constraints, - SNBEP_UNCORE_PCI_COMMON_INIT(), -}; - -enum { - BDX_PCI_UNCORE_HA, - BDX_PCI_UNCORE_IMC, - BDX_PCI_UNCORE_IRP, - BDX_PCI_UNCORE_QPI, - BDX_PCI_UNCORE_R2PCIE, - BDX_PCI_UNCORE_R3QPI, -}; - -static struct intel_uncore_type *bdx_pci_uncores[] = { - [BDX_PCI_UNCORE_HA] = &bdx_uncore_ha, - [BDX_PCI_UNCORE_IMC] = &bdx_uncore_imc, - [BDX_PCI_UNCORE_IRP] = &bdx_uncore_irp, - [BDX_PCI_UNCORE_QPI] = &bdx_uncore_qpi, - [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie, - [BDX_PCI_UNCORE_R3QPI] = &bdx_uncore_r3qpi, - NULL, -}; - -static const struct pci_device_id bdx_uncore_pci_ids[] = { - { /* Home Agent 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0), - }, - { /* Home Agent 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f38), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 1), - }, - { /* MC0 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0), - }, - { /* MC0 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1), - }, - { /* MC0 Channel 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb4), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 2), - }, - { /* MC0 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb5), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 3), - }, - { /* MC1 Channel 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd0), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 4), - }, - { /* MC1 Channel 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd1), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 5), - }, - { /* MC1 Channel 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd4), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 6), - }, - { /* MC1 Channel 3 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fd5), - .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 7), - }, - { /* IRP */ - 
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39),
-		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0),
-	},
-	{ /* QPI0 Port 0 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f32),
-		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 0),
-	},
-	{ /* QPI0 Port 1 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f33),
-		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 1),
-	},
-	{ /* QPI1 Port 2 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3a),
-		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_QPI, 2),
-	},
-	{ /* R2PCIe */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34),
-		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0),
-	},
-	{ /* R3QPI0 Link 0 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f36),
-		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 0),
-	},
-	{ /* R3QPI0 Link 1 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f37),
-		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 1),
-	},
-	{ /* R3QPI1 Link 2 */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f3e),
-		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R3QPI, 2),
-	},
-	{ /* QPI Port 0 filter */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f86),
-		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 0),
-	},
-	{ /* QPI Port 1 filter */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f96),
-		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 1),
-	},
-	{ /* QPI Port 2 filter */
-		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f46),
-		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, 2),
-	},
-	{ /* end: all zeroes */ }
-};
-
-static struct pci_driver bdx_uncore_pci_driver = {
-	.name = "bdx_uncore",
-	.id_table = bdx_uncore_pci_ids,
-};
-
-int bdx_uncore_pci_init(void)
-{
-	int ret = snbep_pci2phy_map_init(0x6f1e);
-
-	if (ret)
-		return ret;
-	uncore_pci_uncores = bdx_pci_uncores;
-	uncore_pci_driver = &bdx_uncore_pci_driver;
-	return 0;
-}
-
-/* end of BDX uncore support */
--
cgit v0.10.2

From edbb591870dc8d1fd855014ea5360475f7bc46af Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Wed, 10 Feb 2016 10:55:19 +0100
Subject: perf/x86: Move perf_event_knc.c ..............
=> x86/events/intel/knc.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-14-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 8c939ca..c1647a6 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -6,7 +6,7 @@ ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/lbr.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/pt.o intel/rapl.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/knc.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/lbr.o intel/pt.o intel/rapl.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o diff --git a/arch/x86/events/intel/knc.c b/arch/x86/events/intel/knc.c new file mode 100644 index 0000000..630bcba --- /dev/null +++ b/arch/x86/events/intel/knc.c @@ -0,0 +1,319 @@ +/* Driver for Intel Xeon Phi "Knights Corner" PMU */ + +#include +#include + +#include + +#include "../../kernel/cpu/perf_event.h" + +static const u64 knc_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x002a, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x0016, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0028, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0029, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0012, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x002b, +}; + +static const u64 __initconst knc_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + /* On Xeon Phi event "0" is a valid DATA_READ */ + /* (L1 Data Cache Reads) Instruction. */ + /* We code this as ARCH_PERFMON_EVENTSEL_INT as this */ + /* bit will always be set in x86_pmu_hw_config(). 
*/ + [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT, + /* DATA_READ */ + [ C(RESULT_MISS) ] = 0x0003, /* DATA_READ_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */ + [ C(RESULT_MISS) ] = 0x0004, /* DATA_WRITE_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0011, /* L1_DATA_PF1 */ + [ C(RESULT_MISS) ] = 0x001c, /* L1_DATA_PF1_MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */ + [ C(RESULT_MISS) ] = 0x000e, /* CODE_CACHE_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x10cb, /* L2_READ_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x10cc, /* L2_WRITE_HIT */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x10fc, /* L2_DATA_PF2 */ + [ C(RESULT_MISS) ] = 0x10fe, /* L2_DATA_PF2_MISS */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT, + /* DATA_READ */ + /* see note on L1 OP_READ */ + [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */ + [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */ + [ C(RESULT_MISS) ] = 0x000d, /* CODE_PAGE_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0012, /* BRANCHES */ + [ C(RESULT_MISS) ] = 0x002b, /* BRANCHES_MISPREDICTED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + + +static u64 knc_pmu_event_map(int hw_event) +{ + return knc_perfmon_event_map[hw_event]; +} + +static struct event_constraint knc_event_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0xc3, 0x1), /* HWP_L2HIT */ + INTEL_EVENT_CONSTRAINT(0xc4, 0x1), /* HWP_L2MISS */ + INTEL_EVENT_CONSTRAINT(0xc8, 0x1), /* L2_READ_HIT_E */ + INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* L2_READ_HIT_M */ + INTEL_EVENT_CONSTRAINT(0xca, 0x1), /* L2_READ_HIT_S */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* L2_READ_MISS */ + INTEL_EVENT_CONSTRAINT(0xcc, 0x1), /* L2_WRITE_HIT */ + INTEL_EVENT_CONSTRAINT(0xce, 0x1), /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */ + INTEL_EVENT_CONSTRAINT(0xcf, 0x1), /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */ + INTEL_EVENT_CONSTRAINT(0xd7, 0x1), /* L2_VICTIM_REQ_WITH_DATA */ + INTEL_EVENT_CONSTRAINT(0xe3, 0x1), /* SNP_HITM_BUNIT */ + INTEL_EVENT_CONSTRAINT(0xe6, 0x1), /* SNP_HIT_L2 */ + INTEL_EVENT_CONSTRAINT(0xe7, 0x1), /* SNP_HITM_L2 */ + INTEL_EVENT_CONSTRAINT(0xf1, 0x1), /* L2_DATA_READ_MISS_CACHE_FILL */ + INTEL_EVENT_CONSTRAINT(0xf2, 0x1), /* L2_DATA_WRITE_MISS_CACHE_FILL */ + INTEL_EVENT_CONSTRAINT(0xf6, 0x1), /* L2_DATA_READ_MISS_MEM_FILL */ + INTEL_EVENT_CONSTRAINT(0xf7, 0x1), /* L2_DATA_WRITE_MISS_MEM_FILL */ + INTEL_EVENT_CONSTRAINT(0xfc, 0x1), /* L2_DATA_PF2 */ + INTEL_EVENT_CONSTRAINT(0xfd, 0x1), /* L2_DATA_PF2_DROP */ + INTEL_EVENT_CONSTRAINT(0xfe, 0x1), /* 
L2_DATA_PF2_MISS */ + INTEL_EVENT_CONSTRAINT(0xff, 0x1), /* L2_DATA_HIT_INFLIGHT_PF2 */ + EVENT_CONSTRAINT_END +}; + +#define MSR_KNC_IA32_PERF_GLOBAL_STATUS 0x0000002d +#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL 0x0000002e +#define MSR_KNC_IA32_PERF_GLOBAL_CTRL 0x0000002f + +#define KNC_ENABLE_COUNTER0 0x00000001 +#define KNC_ENABLE_COUNTER1 0x00000002 + +static void knc_pmu_disable_all(void) +{ + u64 val; + + rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); + val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1); + wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); +} + +static void knc_pmu_enable_all(int added) +{ + u64 val; + + rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); + val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1); + wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); +} + +static inline void +knc_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 val; + + val = hwc->config; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + + (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); +} + +static void knc_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 val; + + val = hwc->config; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + + (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); +} + +static inline u64 knc_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void knc_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack); +} + +static int knc_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int handled = 0; + int bit, loops; + u64 status; + + cpuc = this_cpu_ptr(&cpu_hw_events); + + knc_pmu_disable_all(); + + status = knc_pmu_get_status(); + if (!status) { + knc_pmu_enable_all(0); + return handled; + } + + loops = 0; +again: + knc_pmu_ack_status(status); + if (++loops > 100) { + WARN_ONCE(1, "perf: irq loop stuck!\n"); + perf_event_print_debug(); + goto done; + } + + inc_irq_stat(apic_perf_irqs); + + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + handled++; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + /* + * Repeat if there is more work to be done: + */ + status = knc_pmu_get_status(); + if (status) + goto again; + +done: + knc_pmu_enable_all(0); + + return handled; +} + + +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *intel_knc_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +static const struct x86_pmu knc_pmu __initconst = { + .name = "knc", + .handle_irq = knc_pmu_handle_irq, + .disable_all = knc_pmu_disable_all, + .enable_all = knc_pmu_enable_all, + .enable = knc_pmu_enable_event, + .disable = knc_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_KNC_EVNTSEL0, + .perfctr = MSR_KNC_PERFCTR0, + .event_map = knc_pmu_event_map, + .max_events = ARRAY_SIZE(knc_perfmon_event_map), + .apic = 1, + .max_period = (1ULL << 39) - 1, + .version = 0, + 
.num_counters = 2, + .cntval_bits = 40, + .cntval_mask = (1ULL << 40) - 1, + .get_event_constraints = x86_get_event_constraints, + .event_constraints = knc_event_constraints, + .format_attrs = intel_knc_formats_attr, +}; + +__init int knc_pmu_init(void) +{ + x86_pmu = knc_pmu; + + memcpy(hw_cache_event_ids, knc_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; +} diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 973b77b..3195f7d 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -31,7 +31,7 @@ obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c deleted file mode 100644 index 5b0c232..0000000 --- a/arch/x86/kernel/cpu/perf_event_knc.c +++ /dev/null @@ -1,319 +0,0 @@ -/* Driver for Intel Xeon Phi "Knights Corner" PMU */ - -#include -#include - -#include - -#include "perf_event.h" - -static const u64 knc_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x002a, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x0016, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0028, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0029, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0012, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x002b, -}; - -static const u64 __initconst knc_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - /* On Xeon Phi event "0" is a valid DATA_READ */ - /* (L1 Data Cache Reads) Instruction. */ - /* We code this as ARCH_PERFMON_EVENTSEL_INT as this */ - /* bit will always be set in x86_pmu_hw_config(). 
*/ - [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT, - /* DATA_READ */ - [ C(RESULT_MISS) ] = 0x0003, /* DATA_READ_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */ - [ C(RESULT_MISS) ] = 0x0004, /* DATA_WRITE_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0011, /* L1_DATA_PF1 */ - [ C(RESULT_MISS) ] = 0x001c, /* L1_DATA_PF1_MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */ - [ C(RESULT_MISS) ] = 0x000e, /* CODE_CACHE_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0x10cb, /* L2_READ_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x10cc, /* L2_WRITE_HIT */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x10fc, /* L2_DATA_PF2 */ - [ C(RESULT_MISS) ] = 0x10fe, /* L2_DATA_PF2_MISS */ - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT, - /* DATA_READ */ - /* see note on L1 OP_READ */ - [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */ - [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */ - [ C(RESULT_MISS) ] = 0x000d, /* CODE_PAGE_WALK */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0012, /* BRANCHES */ - [ C(RESULT_MISS) ] = 0x002b, /* BRANCHES_MISPREDICTED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - - -static u64 knc_pmu_event_map(int hw_event) -{ - return knc_perfmon_event_map[hw_event]; -} - -static struct event_constraint knc_event_constraints[] = -{ - INTEL_EVENT_CONSTRAINT(0xc3, 0x1), /* HWP_L2HIT */ - INTEL_EVENT_CONSTRAINT(0xc4, 0x1), /* HWP_L2MISS */ - INTEL_EVENT_CONSTRAINT(0xc8, 0x1), /* L2_READ_HIT_E */ - INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* L2_READ_HIT_M */ - INTEL_EVENT_CONSTRAINT(0xca, 0x1), /* L2_READ_HIT_S */ - INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* L2_READ_MISS */ - INTEL_EVENT_CONSTRAINT(0xcc, 0x1), /* L2_WRITE_HIT */ - INTEL_EVENT_CONSTRAINT(0xce, 0x1), /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */ - INTEL_EVENT_CONSTRAINT(0xcf, 0x1), /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */ - INTEL_EVENT_CONSTRAINT(0xd7, 0x1), /* L2_VICTIM_REQ_WITH_DATA */ - INTEL_EVENT_CONSTRAINT(0xe3, 0x1), /* SNP_HITM_BUNIT */ - INTEL_EVENT_CONSTRAINT(0xe6, 0x1), /* SNP_HIT_L2 */ - INTEL_EVENT_CONSTRAINT(0xe7, 0x1), /* SNP_HITM_L2 */ - INTEL_EVENT_CONSTRAINT(0xf1, 0x1), /* L2_DATA_READ_MISS_CACHE_FILL */ - INTEL_EVENT_CONSTRAINT(0xf2, 0x1), /* L2_DATA_WRITE_MISS_CACHE_FILL */ - INTEL_EVENT_CONSTRAINT(0xf6, 0x1), /* L2_DATA_READ_MISS_MEM_FILL */ - INTEL_EVENT_CONSTRAINT(0xf7, 0x1), /* L2_DATA_WRITE_MISS_MEM_FILL */ - INTEL_EVENT_CONSTRAINT(0xfc, 0x1), /* L2_DATA_PF2 */ - INTEL_EVENT_CONSTRAINT(0xfd, 0x1), /* L2_DATA_PF2_DROP */ - INTEL_EVENT_CONSTRAINT(0xfe, 0x1), /* 
L2_DATA_PF2_MISS */ - INTEL_EVENT_CONSTRAINT(0xff, 0x1), /* L2_DATA_HIT_INFLIGHT_PF2 */ - EVENT_CONSTRAINT_END -}; - -#define MSR_KNC_IA32_PERF_GLOBAL_STATUS 0x0000002d -#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL 0x0000002e -#define MSR_KNC_IA32_PERF_GLOBAL_CTRL 0x0000002f - -#define KNC_ENABLE_COUNTER0 0x00000001 -#define KNC_ENABLE_COUNTER1 0x00000002 - -static void knc_pmu_disable_all(void) -{ - u64 val; - - rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); - val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1); - wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); -} - -static void knc_pmu_enable_all(int added) -{ - u64 val; - - rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); - val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1); - wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); -} - -static inline void -knc_pmu_disable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 val; - - val = hwc->config; - val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - - (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); -} - -static void knc_pmu_enable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 val; - - val = hwc->config; - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - - (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); -} - -static inline u64 knc_pmu_get_status(void) -{ - u64 status; - - rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status); - - return status; -} - -static inline void knc_pmu_ack_status(u64 ack) -{ - wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack); -} - -static int knc_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - int handled = 0; - int bit, loops; - u64 status; - - cpuc = this_cpu_ptr(&cpu_hw_events); - - knc_pmu_disable_all(); - - status = knc_pmu_get_status(); - if (!status) { - knc_pmu_enable_all(0); - return handled; - } - - loops = 0; -again: - knc_pmu_ack_status(status); - if (++loops > 100) { - WARN_ONCE(1, "perf: irq loop stuck!\n"); - perf_event_print_debug(); - goto done; - } - - inc_irq_stat(apic_perf_irqs); - - for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { - struct perf_event *event = cpuc->events[bit]; - - handled++; - - if (!test_bit(bit, cpuc->active_mask)) - continue; - - if (!intel_pmu_save_and_restart(event)) - continue; - - perf_sample_data_init(&data, 0, event->hw.last_period); - - if (perf_event_overflow(event, &data, regs)) - x86_pmu_stop(event, 0); - } - - /* - * Repeat if there is more work to be done: - */ - status = knc_pmu_get_status(); - if (status) - goto again; - -done: - knc_pmu_enable_all(0); - - return handled; -} - - -PMU_FORMAT_ATTR(event, "config:0-7" ); -PMU_FORMAT_ATTR(umask, "config:8-15" ); -PMU_FORMAT_ATTR(edge, "config:18" ); -PMU_FORMAT_ATTR(inv, "config:23" ); -PMU_FORMAT_ATTR(cmask, "config:24-31" ); - -static struct attribute *intel_knc_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_inv.attr, - &format_attr_cmask.attr, - NULL, -}; - -static const struct x86_pmu knc_pmu __initconst = { - .name = "knc", - .handle_irq = knc_pmu_handle_irq, - .disable_all = knc_pmu_disable_all, - .enable_all = knc_pmu_enable_all, - .enable = knc_pmu_enable_event, - .disable = knc_pmu_disable_event, - .hw_config = x86_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_KNC_EVNTSEL0, - .perfctr = MSR_KNC_PERFCTR0, - .event_map = knc_pmu_event_map, - .max_events = ARRAY_SIZE(knc_perfmon_event_map), - .apic = 1, - .max_period = (1ULL << 39) - 1, - .version = 0, - 
.num_counters = 2,
-	.cntval_bits = 40,
-	.cntval_mask = (1ULL << 40) - 1,
-	.get_event_constraints = x86_get_event_constraints,
-	.event_constraints = knc_event_constraints,
-	.format_attrs = intel_knc_formats_attr,
-};
-
-__init int knc_pmu_init(void)
-{
-	x86_pmu = knc_pmu;
-
-	memcpy(hw_cache_event_ids, knc_hw_cache_event_ids,
-		sizeof(hw_cache_event_ids));
-
-	return 0;
-}
--
cgit v0.10.2

From f03e97dbd2bb7d9436400fb6946502268ab33542 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Wed, 10 Feb 2016 10:55:20 +0100
Subject: perf/x86: Move perf_event_p4.c ............... => x86/events/intel/p4.c

Signed-off-by: Borislav Petkov
Cc: Arnaldo Carvalho de Melo
Cc: Jiri Olsa
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Peter Zijlstra
Cc: Stephane Eranian
Cc: Thomas Gleixner
Cc: Vince Weaver
Link: http://lkml.kernel.org/r/1455098123-11740-15-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar

diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile
index c1647a6..5a6f20d 100644
--- a/arch/x86/events/Makefile
+++ b/arch/x86/events/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o
 endif
 obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o
 obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/knc.o
-obj-$(CONFIG_CPU_SUP_INTEL) += intel/lbr.o intel/pt.o intel/rapl.o
+obj-$(CONFIG_CPU_SUP_INTEL) += intel/lbr.o intel/p4.o intel/pt.o
+obj-$(CONFIG_CPU_SUP_INTEL) += intel/rapl.o
 obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o
 obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
new file mode 100644
index 0000000..1c72fed
--- /dev/null
+++ b/arch/x86/events/intel/p4.c
@@ -0,0 +1,1376 @@
+/*
+ * Netburst Performance Events (P4, old Xeon)
+ *
+ * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov
+ * Copyright (C) 2010 Intel Corporation, Lin Ming
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#include
+
+#include
+#include
+#include
+
+#include "../../kernel/cpu/perf_event.h"
+
+#define P4_CNTR_LIMIT 3
+/*
+ * array indices: 0,1 - HT threads, used with HT enabled cpu
+ */
+struct p4_event_bind {
+	unsigned int opcode; /* Event code and ESCR selector */
+	unsigned int escr_msr[2]; /* ESCR MSR for this event */
+	unsigned int escr_emask; /* valid ESCR EventMask bits */
+	unsigned int shared; /* event is shared across threads */
+	char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */
+};
+
+struct p4_pebs_bind {
+	unsigned int metric_pebs;
+	unsigned int metric_vert;
+};
+
+/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
+#define P4_GEN_PEBS_BIND(name, pebs, vert) \
+	[P4_PEBS_METRIC__##name] = { \
+		.metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \
+		.metric_vert = vert, \
+	}
+
+/*
+ * note we have P4_PEBS_ENABLE_UOP_TAG always set here
+ *
+ * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
+ * event configuration to find out which values are to be
+ * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
+ * resgisters
+ */
+static struct p4_pebs_bind p4_pebs_bind_map[] = {
+	P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001),
+	P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001),
+	P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001),
+	P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002),
+	P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003),
+	P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010),
P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001), + P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001), + P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002), +}; + +/* + * Note that we don't use CCCR1 here, there is an + * exception for P4_BSQ_ALLOCATION but we just have + * no workaround + * + * consider this binding as resources which particular + * event may borrow, it doesn't contain EventMask, + * Tags and friends -- they are left to a caller + */ +static struct p4_event_bind p4_event_bind_map[] = { + [P4_EVENT_TC_DELIVER_MODE] = { + .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE), + .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID), + .shared = 1, + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_BPU_FETCH_REQUEST] = { + .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST), + .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_ITLB_REFERENCE] = { + .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE), + .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) | + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_MEMORY_CANCEL] = { + .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL), + .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) | + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF), + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_MEMORY_COMPLETE] = { + .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE), + .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) | + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC), + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_LOAD_PORT_REPLAY] = { + .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY), + .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD), + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_STORE_PORT_REPLAY] = { + .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY), + .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST), + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_MOB_LOAD_REPLAY] = { + .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY), + .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_PAGE_WALK_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE), + .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS), + 
.shared = 1, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_BSQ_CACHE_REFERENCE] = { + .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE), + .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_IOQ_ALLOCATION] = { + .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */ + .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES), + .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH), + .cntr = { {2, -1, -1}, {3, -1, -1} }, + }, + [P4_EVENT_FSB_DATA_ACTIVITY] = { + .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER), + .shared = 1, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */ + .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION), + .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) | + 
P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2), + .cntr = { {0, -1, -1}, {1, -1, -1} }, + }, + [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */ + .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES), + .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2), + .cntr = { {2, -1, -1}, {3, -1, -1} }, + }, + [P4_EVENT_SSE_INPUT_ASSIST] = { + .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_PACKED_SP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_PACKED_DP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_SCALAR_SP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_SCALAR_DP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_64BIT_MMX_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_128BIT_MMX_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_X87_FP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP), + .escr_msr = { 
MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_TC_MISC] = { + .opcode = P4_OPCODE(P4_EVENT_TC_MISC), + .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_GLOBAL_POWER_EVENTS] = { + .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_TC_MS_XFER] = { + .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER), + .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_UOP_QUEUE_WRITES] = { + .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES), + .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE), + .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_RETIRED_BRANCH_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE), + .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_RESOURCE_STALL] = { + .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL), + .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_WC_BUFFER] = { + .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER), + .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_B2B_CYCLES] = { + .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_BNR] = { + .opcode = P4_OPCODE(P4_EVENT_BNR), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_SNOOP] = { + .opcode = P4_OPCODE(P4_EVENT_SNOOP), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_RESPONSE] = { + .opcode = P4_OPCODE(P4_EVENT_RESPONSE), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_FRONT_END_EVENT] = { + .opcode = 
P4_OPCODE(P4_EVENT_FRONT_END_EVENT), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_EXECUTION_EVENT] = { + .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_REPLAY_EVENT] = { + .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_INSTR_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_UOPS_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_UOP_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE), + .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_BRANCH_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_MISPRED_BRANCH_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_X87_ASSIST] = { + .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_MACHINE_CLEAR] = { + .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) | + 
P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) | + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_INSTR_COMPLETED] = { + .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, +}; + +#define P4_GEN_CACHE_EVENT(event, bit, metric) \ + p4_config_pack_escr(P4_ESCR_EVENT(event) | \ + P4_ESCR_EMASK_BIT(event, bit)) | \ + p4_config_pack_cccr(metric | \ + P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) + +static __initconst const u64 p4_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__1stl_cache_load_miss_retired), + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__2ndl_cache_load_miss_retired), + }, +}, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__dtlb_load_miss_retired), + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__dtlb_store_miss_retired), + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, + P4_PEBS_METRIC__none), + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, + P4_PEBS_METRIC__none), + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +/* + * Because of Netburst being quite restricted in how many + * identical events may run simultaneously, we introduce event aliases, + * ie the different events which have the same functionality but + * utilize non-intersected resources (ESCR/CCCR/counter registers). + * + * This allow us to relax restrictions a bit and run two or more + * identical events together. + * + * Never set any custom internal bits such as P4_CONFIG_HT, + * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are + * either up to date automatically or not applicable at all. + */ +struct p4_event_alias { + u64 original; + u64 alternative; +} p4_event_aliases[] = { + { + /* + * Non-halted cycles can be substituted with non-sleeping cycles (see + * Intel SDM Vol3b for details). We need this alias to be able + * to run nmi-watchdog and 'perf top' (or any other user space tool + * which is interested in running PERF_COUNT_HW_CPU_CYCLES) + * simultaneously. 
+ */ + .original = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), + .alternative = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))| + p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | + P4_CCCR_COMPARE), + }, +}; + +static u64 p4_get_alias_event(u64 config) +{ + u64 config_match; + int i; + + /* + * Only event with special mark is allowed, + * we're to be sure it didn't come as malformed + * RAW event. + */ + if (!(config & P4_CONFIG_ALIASABLE)) + return 0; + + config_match = config & P4_CONFIG_EVENT_ALIAS_MASK; + + for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) { + if (config_match == p4_event_aliases[i].original) { + config_match = p4_event_aliases[i].alternative; + break; + } else if (config_match == p4_event_aliases[i].alternative) { + config_match = p4_event_aliases[i].original; + break; + } + } + + if (i >= ARRAY_SIZE(p4_event_aliases)) + return 0; + + return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS); +} + +static u64 p4_general_events[PERF_COUNT_HW_MAX] = { + /* non-halted CPU clocks */ + [PERF_COUNT_HW_CPU_CYCLES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) | + P4_CONFIG_ALIASABLE, + + /* + * retired instructions + * in a sake of simplicity we don't use the FSB tagging + */ + [PERF_COUNT_HW_INSTRUCTIONS] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)), + + /* cache hits */ + [PERF_COUNT_HW_CACHE_REFERENCES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)), + + /* cache misses */ + [PERF_COUNT_HW_CACHE_MISSES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)), + + /* branch instructions retired */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)), + + /* mispredicted branches retired */ + [PERF_COUNT_HW_BRANCH_MISSES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) | + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)), + + /* bus ready clocks (cpu 
is driving #DRDY_DRV\#DRDY_OWN): */ + [PERF_COUNT_HW_BUS_CYCLES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) | + p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE), +}; + +static struct p4_event_bind *p4_config_get_bind(u64 config) +{ + unsigned int evnt = p4_config_unpack_event(config); + struct p4_event_bind *bind = NULL; + + if (evnt < ARRAY_SIZE(p4_event_bind_map)) + bind = &p4_event_bind_map[evnt]; + + return bind; +} + +static u64 p4_pmu_event_map(int hw_event) +{ + struct p4_event_bind *bind; + unsigned int esel; + u64 config; + + config = p4_general_events[hw_event]; + bind = p4_config_get_bind(config); + esel = P4_OPCODE_ESEL(bind->opcode); + config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel)); + + return config; +} + +/* check cpu model specifics */ +static bool p4_event_match_cpu_model(unsigned int event_idx) +{ + /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */ + if (event_idx == P4_EVENT_INSTR_COMPLETED) { + if (boot_cpu_data.x86_model != 3 && + boot_cpu_data.x86_model != 4 && + boot_cpu_data.x86_model != 6) + return false; + } + + /* + * For info + * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2 + */ + + return true; +} + +static int p4_validate_raw_event(struct perf_event *event) +{ + unsigned int v, emask; + + /* User data may have out-of-bound event index */ + v = p4_config_unpack_event(event->attr.config); + if (v >= ARRAY_SIZE(p4_event_bind_map)) + return -EINVAL; + + /* It may be unsupported: */ + if (!p4_event_match_cpu_model(v)) + return -EINVAL; + + /* + * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as + * in Architectural Performance Monitoring, it means not + * on _which_ logical cpu to count but rather _when_, ie it + * depends on logical cpu state -- count event if one cpu active, + * none, both or any, so we just allow user to pass any value + * desired. 
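As a side note on p4_get_alias_event() above: it walks a small table of {original, alternative} pairs and hands back whichever partner config is not the one currently requested, so that two logically identical events can land on disjoint ESCR/CCCR/counter resources. A minimal standalone sketch of that two-way lookup, with made-up config values and without the immutable-bits handling of the real helper:

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

struct alias {
	uint64_t original;
	uint64_t alternative;
};

/* hypothetical pair: two configs that count the same thing */
static const struct alias aliases[] = {
	{ .original = 0x1300000ULL, .alternative = 0x2700000ULL },
};

/* return the partner config, or 0 if the config has no alias */
static uint64_t get_alias(uint64_t config)
{
	size_t i;

	for (i = 0; i < sizeof(aliases) / sizeof(aliases[0]); i++) {
		if (config == aliases[i].original)
			return aliases[i].alternative;
		if (config == aliases[i].alternative)
			return aliases[i].original;
	}
	return 0;
}

int main(void)
{
	printf("alias of 0x1300000 -> %#llx\n",
	       (unsigned long long)get_alias(0x1300000ULL));
	printf("alias of 0x2700000 -> %#llx\n",
	       (unsigned long long)get_alias(0x2700000ULL));
	return 0;
}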
+ * + * In turn we always set Tx_OS/Tx_USR bits bound to logical + * cpu without their propagation to another cpu + */ + + /* + * if an event is shared across the logical threads + * the user needs special permissions to be able to use it + */ + if (p4_ht_active() && p4_event_bind_map[v].shared) { + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + } + + /* ESCR EventMask bits may be invalid */ + emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK; + if (emask & ~p4_event_bind_map[v].escr_emask) + return -EINVAL; + + /* + * it may have some invalid PEBS bits + */ + if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) + return -EINVAL; + + v = p4_config_unpack_metric(event->attr.config); + if (v >= ARRAY_SIZE(p4_pebs_bind_map)) + return -EINVAL; + + return 0; +} + +static int p4_hw_config(struct perf_event *event) +{ + int cpu = get_cpu(); + int rc = 0; + u32 escr, cccr; + + /* + * the reason we use cpu that early is that: if we get scheduled + * first time on the same cpu -- we will not need swap thread + * specific flags in config (and will save some cpu cycles) + */ + + cccr = p4_default_cccr_conf(cpu); + escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel, + event->attr.exclude_user); + event->hw.config = p4_config_pack_escr(escr) | + p4_config_pack_cccr(cccr); + + if (p4_ht_active() && p4_ht_thread(cpu)) + event->hw.config = p4_set_ht_bit(event->hw.config); + + if (event->attr.type == PERF_TYPE_RAW) { + struct p4_event_bind *bind; + unsigned int esel; + /* + * Clear bits we reserve to be managed by kernel itself + * and never allowed from a user space + */ + event->attr.config &= P4_CONFIG_MASK; + + rc = p4_validate_raw_event(event); + if (rc) + goto out; + + /* + * Note that for RAW events we allow user to use P4_CCCR_RESERVED + * bits since we keep additional info here (for cache events and etc) + */ + event->hw.config |= event->attr.config; + bind = p4_config_get_bind(event->attr.config); + if (!bind) { + rc = -EINVAL; + goto out; + } + esel = P4_OPCODE_ESEL(bind->opcode); + event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel)); + } + + rc = x86_setup_perfctr(event); +out: + put_cpu(); + return rc; +} + +static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) +{ + u64 v; + + /* an official way for overflow indication */ + rdmsrl(hwc->config_base, v); + if (v & P4_CCCR_OVF) { + wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF); + return 1; + } + + /* + * In some circumstances the overflow might issue an NMI but did + * not set P4_CCCR_OVF bit. Because a counter holds a negative value + * we simply check for high bit being set, if it's cleared it means + * the counter has reached zero value and continued counting before + * real NMI signal was received: + */ + rdmsrl(hwc->event_base, v); + if (!(v & ARCH_P4_UNFLAGGED_BIT)) + return 1; + + return 0; +} + +static void p4_pmu_disable_pebs(void) +{ + /* + * FIXME + * + * It's still allowed that two threads setup same cache + * events so we can't simply clear metrics until we knew + * no one is depending on us, so we need kind of counter + * for "ReplayEvent" users. + * + * What is more complex -- RAW events, if user (for some + * reason) will pass some cache event metric with improper + * event opcode -- it's fine from hardware point of view + * but completely nonsense from "meaning" of such action. + * + * So at moment let leave metrics turned on forever -- it's + * ok for now but need to be revisited! 
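The unflagged-overflow check in p4_pmu_clear_cccr_ovf() above relies on the counter being armed with a negative start value: while it has not yet wrapped, its top bit is still set, so a cleared top bit means the counter crossed zero even though P4_CCCR_OVF was never latched. A small standalone illustration of that sign-bit test; the 40-bit counter width is an assumption made only for the example:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define CNTVAL_BITS	40			/* assumed counter width */
#define TOP_BIT		(1ULL << (CNTVAL_BITS - 1))

/* true if a counter that started at a negative value has crossed zero */
static bool counter_overflowed(uint64_t raw)
{
	return !(raw & TOP_BIT);
}

int main(void)
{
	uint64_t start = (1ULL << CNTVAL_BITS) - 1000;	/* i.e. -1000 in 40 bits */

	printf("just armed: overflowed=%d\n", counter_overflowed(start));
	printf("after wrap: overflowed=%d\n",
	       counter_overflowed((start + 2000) & ((1ULL << CNTVAL_BITS) - 1)));
	return 0;
}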
+ * + * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, 0); + * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, 0); + */ +} + +static inline void p4_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* + * If event gets disabled while counter is in overflowed + * state we need to clear P4_CCCR_OVF, otherwise interrupt get + * asserted again and again + */ + (void)wrmsrl_safe(hwc->config_base, + p4_config_unpack_cccr(hwc->config) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); +} + +static void p4_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_event *event = cpuc->events[idx]; + if (!test_bit(idx, cpuc->active_mask)) + continue; + p4_pmu_disable_event(event); + } + + p4_pmu_disable_pebs(); +} + +/* configuration must be valid */ +static void p4_pmu_enable_pebs(u64 config) +{ + struct p4_pebs_bind *bind; + unsigned int idx; + + BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK); + + idx = p4_config_unpack_metric(config); + if (idx == P4_PEBS_METRIC__none) + return; + + bind = &p4_pebs_bind_map[idx]; + + (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); + (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); +} + +static void p4_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int thread = p4_ht_config_thread(hwc->config); + u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); + unsigned int idx = p4_config_unpack_event(hwc->config); + struct p4_event_bind *bind; + u64 escr_addr, cccr; + + bind = &p4_event_bind_map[idx]; + escr_addr = bind->escr_msr[thread]; + + /* + * - we dont support cascaded counters yet + * - and counter 1 is broken (erratum) + */ + WARN_ON_ONCE(p4_is_event_cascaded(hwc->config)); + WARN_ON_ONCE(hwc->idx == 1); + + /* we need a real Event value */ + escr_conf &= ~P4_ESCR_EVENT_MASK; + escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode)); + + cccr = p4_config_unpack_cccr(hwc->config); + + /* + * it could be Cache event so we need to write metrics + * into additional MSRs + */ + p4_pmu_enable_pebs(hwc->config); + + (void)wrmsrl_safe(escr_addr, escr_conf); + (void)wrmsrl_safe(hwc->config_base, + (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); +} + +static void p4_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_event *event = cpuc->events[idx]; + if (!test_bit(idx, cpuc->active_mask)) + continue; + p4_pmu_enable_event(event); + } +} + +static int p4_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + struct perf_event *event; + struct hw_perf_event *hwc; + int idx, handled = 0; + u64 val; + + cpuc = this_cpu_ptr(&cpu_hw_events); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + int overflow; + + if (!test_bit(idx, cpuc->active_mask)) { + /* catch in-flight IRQs */ + if (__test_and_clear_bit(idx, cpuc->running)) + handled++; + continue; + } + + event = cpuc->events[idx]; + hwc = &event->hw; + + WARN_ON_ONCE(hwc->idx != idx); + + /* it might be unflagged overflow */ + overflow = p4_pmu_clear_cccr_ovf(hwc); + + val = x86_perf_event_update(event); + if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) + continue; + + handled += overflow; + + /* event overflow for sure */ + perf_sample_data_init(&data, 0, hwc->last_period); + + if 
(!x86_perf_event_set_period(event)) + continue; + + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + if (handled) + inc_irq_stat(apic_perf_irqs); + + /* + * When dealing with the unmasking of the LVTPC on P4 perf hw, it has + * been observed that the OVF bit flag has to be cleared first _before_ + * the LVTPC can be unmasked. + * + * The reason is the NMI line will continue to be asserted while the OVF + * bit is set. This causes a second NMI to generate if the LVTPC is + * unmasked before the OVF bit is cleared, leading to unknown NMI + * messages. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + + return handled; +} + +/* + * swap thread specific fields according to a thread + * we are going to run on + */ +static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu) +{ + u32 escr, cccr; + + /* + * we either lucky and continue on same cpu or no HT support + */ + if (!p4_should_swap_ts(hwc->config, cpu)) + return; + + /* + * the event is migrated from an another logical + * cpu, so we need to swap thread specific flags + */ + + escr = p4_config_unpack_escr(hwc->config); + cccr = p4_config_unpack_cccr(hwc->config); + + if (p4_ht_thread(cpu)) { + cccr &= ~P4_CCCR_OVF_PMI_T0; + cccr |= P4_CCCR_OVF_PMI_T1; + if (escr & P4_ESCR_T0_OS) { + escr &= ~P4_ESCR_T0_OS; + escr |= P4_ESCR_T1_OS; + } + if (escr & P4_ESCR_T0_USR) { + escr &= ~P4_ESCR_T0_USR; + escr |= P4_ESCR_T1_USR; + } + hwc->config = p4_config_pack_escr(escr); + hwc->config |= p4_config_pack_cccr(cccr); + hwc->config |= P4_CONFIG_HT; + } else { + cccr &= ~P4_CCCR_OVF_PMI_T1; + cccr |= P4_CCCR_OVF_PMI_T0; + if (escr & P4_ESCR_T1_OS) { + escr &= ~P4_ESCR_T1_OS; + escr |= P4_ESCR_T0_OS; + } + if (escr & P4_ESCR_T1_USR) { + escr &= ~P4_ESCR_T1_USR; + escr |= P4_ESCR_T0_USR; + } + hwc->config = p4_config_pack_escr(escr); + hwc->config |= p4_config_pack_cccr(cccr); + hwc->config &= ~P4_CONFIG_HT; + } +} + +/* + * ESCR address hashing is tricky, ESCRs are not sequential + * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and + * the metric between any ESCRs is laid in range [0xa0,0xe1] + * + * so we make ~70% filled hashtable + */ + +#define P4_ESCR_MSR_BASE 0x000003a0 +#define P4_ESCR_MSR_MAX 0x000003e1 +#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1) +#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE) +#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr + +static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = { + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0), + 
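For the thread swap performed by p4_pmu_swap_config_ts() above: when an event migrates between the two hyper-threads, every T0-specific flag is cleared and its T1 counterpart set (or the other way round), and a marker bit records which thread the config currently targets. A compact standalone sketch of that bit mirroring, with flag values invented purely for the example:

#include <stdint.h>
#include <stdio.h>

/* made-up per-thread flag bits, two per thread */
#define T0_OS	(1u << 0)
#define T0_USR	(1u << 1)
#define T1_OS	(1u << 2)
#define T1_USR	(1u << 3)

/* move every T0 flag to its T1 counterpart, or the other way round */
static uint32_t swap_thread_flags(uint32_t conf, int to_thread1)
{
	uint32_t t0 = conf & (T0_OS | T0_USR);
	uint32_t t1 = conf & (T1_OS | T1_USR);

	conf &= ~(T0_OS | T0_USR | T1_OS | T1_USR);
	if (to_thread1)
		conf |= t0 << 2;	/* T0_* -> T1_* */
	else
		conf |= t1 >> 2;	/* T1_* -> T0_* */
	return conf;
}

int main(void)
{
	uint32_t conf = T0_OS | T0_USR;

	conf = swap_thread_flags(conf, 1);
	printf("after swap to thread1: %#x (T1_OS|T1_USR = %#x)\n",
	       conf, T1_OS | T1_USR);
	return 0;
}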
P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1), +}; + +static int p4_get_escr_idx(unsigned int addr) +{ + unsigned int idx = P4_ESCR_MSR_IDX(addr); + + if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE || + !p4_escr_table[idx] || + p4_escr_table[idx] != addr)) { + WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr); + return -1; + } + + return idx; +} + +static int p4_next_cntr(int thread, unsigned long *used_mask, + struct p4_event_bind *bind) +{ + int i, j; + + for (i = 0; i < P4_CNTR_LIMIT; i++) { + j = bind->cntr[thread][i]; + if (j != -1 && !test_bit(j, used_mask)) + return j; + } + + return -1; +} + +static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) +{ + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)]; + int cpu = smp_processor_id(); + struct hw_perf_event *hwc; + struct p4_event_bind *bind; + unsigned int i, thread, num; + int cntr_idx, escr_idx; + u64 config_alias; + int pass; + + bitmap_zero(used_mask, X86_PMC_IDX_MAX); + bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); + + for (i = 0, num = n; i < n; i++, num--) { + + hwc = &cpuc->event_list[i]->hw; + thread = p4_ht_thread(cpu); + pass = 0; + +again: + /* + * It's possible to hit a circular lock + * between original and alternative events + * if both are scheduled already. + */ + if (pass > 2) + goto done; + + bind = p4_config_get_bind(hwc->config); + escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); + if (unlikely(escr_idx == -1)) + goto done; + + if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) { + cntr_idx = hwc->idx; + if (assign) + assign[i] = hwc->idx; + goto reserve; + } + + cntr_idx = p4_next_cntr(thread, used_mask, bind); + if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) { + /* + * Check whether an event alias is still available. + */ + config_alias = p4_get_alias_event(hwc->config); + if (!config_alias) + goto done; + hwc->config = config_alias; + pass++; + goto again; + } + /* + * Perf does test runs to see if a whole group can be assigned + * together succesfully. There can be multiple rounds of this. + * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config + * bits, such that the next round of group assignments will + * cause the above p4_should_swap_ts to pass instead of fail. + * This leads to counters exclusive to thread0 being used by + * thread1. + * + * Solve this with a cheap hack, reset the idx back to -1 to + * force a new lookup (p4_next_cntr) to get the right counter + * for the right thread. 
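The p4_escr_table[] and p4_get_escr_idx() above form a sparse, direct-mapped lookup: the ESCR MSR address minus the base gives the slot, and each valid slot stores its own address back so that holes in the range can be rejected. A standalone sketch of that validation pattern over the same 0x3a0..0x3e1 range, populated with only a few sample entries:

#include <stdio.h>

#define REG_BASE	0x3a0
#define REG_MAX		0x3e1
#define TABLE_SIZE	(REG_MAX - REG_BASE + 1)
#define REG_IDX(r)	((r) - REG_BASE)

/* only a couple of sample entries; the gaps stay zero and act as "invalid" */
static const unsigned int reg_table[TABLE_SIZE] = {
	[REG_IDX(0x3a0)] = 0x3a0,
	[REG_IDX(0x3a2)] = 0x3a2,
	[REG_IDX(0x3b8)] = 0x3b8,
};

/* map an MSR address to its table index, or -1 for unknown addresses */
static int get_reg_idx(unsigned int addr)
{
	unsigned int idx = REG_IDX(addr);

	if (idx >= TABLE_SIZE || reg_table[idx] != addr)
		return -1;
	return (int)idx;
}

int main(void)
{
	printf("0x3a2 -> %d\n", get_reg_idx(0x3a2));	/* valid slot */
	printf("0x3a1 -> %d\n", get_reg_idx(0x3a1));	/* hole, rejected */
	return 0;
}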
+ * + * This probably doesn't comply with the general spirit of how + * perf wants to work, but P4 is special. :-( + */ + if (p4_should_swap_ts(hwc->config, cpu)) + hwc->idx = -1; + p4_pmu_swap_config_ts(hwc, cpu); + if (assign) + assign[i] = cntr_idx; +reserve: + set_bit(cntr_idx, used_mask); + set_bit(escr_idx, escr_mask); + } + +done: + return num ? -EINVAL : 0; +} + +PMU_FORMAT_ATTR(cccr, "config:0-31" ); +PMU_FORMAT_ATTR(escr, "config:32-62"); +PMU_FORMAT_ATTR(ht, "config:63" ); + +static struct attribute *intel_p4_formats_attr[] = { + &format_attr_cccr.attr, + &format_attr_escr.attr, + &format_attr_ht.attr, + NULL, +}; + +static __initconst const struct x86_pmu p4_pmu = { + .name = "Netburst P4/Xeon", + .handle_irq = p4_pmu_handle_irq, + .disable_all = p4_pmu_disable_all, + .enable_all = p4_pmu_enable_all, + .enable = p4_pmu_enable_event, + .disable = p4_pmu_disable_event, + .eventsel = MSR_P4_BPU_CCCR0, + .perfctr = MSR_P4_BPU_PERFCTR0, + .event_map = p4_pmu_event_map, + .max_events = ARRAY_SIZE(p4_general_events), + .get_event_constraints = x86_get_event_constraints, + /* + * IF HT disabled we may need to use all + * ARCH_P4_MAX_CCCR counters simulaneously + * though leave it restricted at moment assuming + * HT is on + */ + .num_counters = ARCH_P4_MAX_CCCR, + .apic = 1, + .cntval_bits = ARCH_P4_CNTRVAL_BITS, + .cntval_mask = ARCH_P4_CNTRVAL_MASK, + .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, + .hw_config = p4_hw_config, + .schedule_events = p4_pmu_schedule_events, + /* + * This handles erratum N15 in intel doc 249199-029, + * the counter may not be updated correctly on write + * so we need a second write operation to do the trick + * (the official workaround didn't work) + * + * the former idea is taken from OProfile code + */ + .perfctr_second_write = 1, + + .format_attrs = intel_p4_formats_attr, +}; + +__init int p4_pmu_init(void) +{ + unsigned int low, high; + int i, reg; + + /* If we get stripped -- indexing fails */ + BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC); + + rdmsr(MSR_IA32_MISC_ENABLE, low, high); + if (!(low & (1 << 7))) { + pr_cont("unsupported Netburst CPU model %d ", + boot_cpu_data.x86_model); + return -ENODEV; + } + + memcpy(hw_cache_event_ids, p4_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Netburst events, "); + + x86_pmu = p4_pmu; + + /* + * Even though the counters are configured to interrupt a particular + * logical processor when an overflow happens, testing has shown that + * on kdump kernels (which uses a single cpu), thread1's counter + * continues to run and will report an NMI on thread0. Due to the + * overflow bug, this leads to a stream of unknown NMIs. + * + * Solve this by zero'ing out the registers to mimic a reset. 
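The three format attributes above describe how a raw perf config is split for this PMU: the CCCR value in config:0-31, the ESCR value in config:32-62 and the HT marker in bit 63. A minimal pack/unpack sketch of that layout; the field masks are simplified and the real p4_config_pack_*/unpack_* helpers additionally strip reserved bits:

#include <stdint.h>
#include <stdio.h>

#define HT_BIT		(1ULL << 63)		/* config:63    */
#define ESCR_SHIFT	32			/* config:32-62 */
#define ESCR_MASK	0x7fffffffULL
#define CCCR_MASK	0xffffffffULL		/* config:0-31  */

static uint64_t pack_config(uint32_t escr, uint32_t cccr, int ht)
{
	return (((uint64_t)escr & ESCR_MASK) << ESCR_SHIFT) |
	       ((uint64_t)cccr & CCCR_MASK) |
	       (ht ? HT_BIT : 0);
}

int main(void)
{
	/* example field values chosen arbitrarily */
	uint64_t cfg = pack_config(0x0c00020e, 0x00039000, 1);

	printf("escr=%#x cccr=%#x ht=%d\n",
	       (uint32_t)((cfg >> ESCR_SHIFT) & ESCR_MASK),
	       (uint32_t)(cfg & CCCR_MASK),
	       !!(cfg & HT_BIT));
	return 0;
}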
+ */ + for (i = 0; i < x86_pmu.num_counters; i++) { + reg = x86_pmu_config_addr(i); + wrmsrl_safe(reg, 0ULL); + } + + return 0; +} diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3195f7d..2bb41d0 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -31,7 +31,7 @@ obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c deleted file mode 100644 index f2e5678..0000000 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ /dev/null @@ -1,1376 +0,0 @@ -/* - * Netburst Performance Events (P4, old Xeon) - * - * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov - * Copyright (C) 2010 Intel Corporation, Lin Ming - * - * For licencing details see kernel-base/COPYING - */ - -#include - -#include -#include -#include - -#include "perf_event.h" - -#define P4_CNTR_LIMIT 3 -/* - * array indices: 0,1 - HT threads, used with HT enabled cpu - */ -struct p4_event_bind { - unsigned int opcode; /* Event code and ESCR selector */ - unsigned int escr_msr[2]; /* ESCR MSR for this event */ - unsigned int escr_emask; /* valid ESCR EventMask bits */ - unsigned int shared; /* event is shared across threads */ - char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ -}; - -struct p4_pebs_bind { - unsigned int metric_pebs; - unsigned int metric_vert; -}; - -/* it sets P4_PEBS_ENABLE_UOP_TAG as well */ -#define P4_GEN_PEBS_BIND(name, pebs, vert) \ - [P4_PEBS_METRIC__##name] = { \ - .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \ - .metric_vert = vert, \ - } - -/* - * note we have P4_PEBS_ENABLE_UOP_TAG always set here - * - * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of - * event configuration to find out which values are to be - * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT - * resgisters - */ -static struct p4_pebs_bind p4_pebs_bind_map[] = { - P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001), - P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001), - P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001), - P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002), - P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003), - P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010), - P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001), - P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001), - P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002), -}; - -/* - * Note that we don't use CCCR1 here, there is an - * exception for P4_BSQ_ALLOCATION but we just have - * no workaround - * - * consider this binding as resources which particular - * event may borrow, it doesn't contain EventMask, - * Tags and friends -- they are left to a caller - */ -static struct p4_event_bind p4_event_bind_map[] = { - [P4_EVENT_TC_DELIVER_MODE] = { - .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE), - .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) | - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) | - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) | - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) | - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) 
| - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) | - P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID), - .shared = 1, - .cntr = { {4, 5, -1}, {6, 7, -1} }, - }, - [P4_EVENT_BPU_FETCH_REQUEST] = { - .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST), - .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS), - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_ITLB_REFERENCE] = { - .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE), - .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) | - P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) | - P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK), - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_MEMORY_CANCEL] = { - .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL), - .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) | - P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF), - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_MEMORY_COMPLETE] = { - .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE), - .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) | - P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC), - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_LOAD_PORT_REPLAY] = { - .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY), - .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD), - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_STORE_PORT_REPLAY] = { - .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY), - .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST), - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_MOB_LOAD_REPLAY] = { - .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY), - .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) | - P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) | - P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) | - P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR), - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_PAGE_WALK_TYPE] = { - .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE), - .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) | - P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS), - .shared = 1, - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_BSQ_CACHE_REFERENCE] = { - .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE), - .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS), - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_IOQ_ALLOCATION] = { - .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION), - .escr_msr = { 
MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH), - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */ - .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES), - .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) | - P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH), - .cntr = { {2, -1, -1}, {3, -1, -1} }, - }, - [P4_EVENT_FSB_DATA_ACTIVITY] = { - .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER), - .shared = 1, - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */ - .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION), - .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2), - .cntr = { {0, -1, -1}, {1, -1, -1} }, - }, - [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */ - .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES), - .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) | - 
P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2), - .cntr = { {2, -1, -1}, {3, -1, -1} }, - }, - [P4_EVENT_SSE_INPUT_ASSIST] = { - .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_PACKED_SP_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_PACKED_DP_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_SCALAR_SP_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_SCALAR_DP_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_64BIT_MMX_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_128BIT_MMX_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_X87_FP_UOP] = { - .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP), - .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_TC_MISC] = { - .opcode = P4_OPCODE(P4_EVENT_TC_MISC), - .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH), - .cntr = { {4, 5, -1}, {6, 7, -1} }, - }, - [P4_EVENT_GLOBAL_POWER_EVENTS] = { - .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING), - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_TC_MS_XFER] = { - .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER), - .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC), - .cntr = { {4, 5, -1}, {6, 7, -1} }, - }, - [P4_EVENT_UOP_QUEUE_WRITES] = { - .opcode = 
P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES), - .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) | - P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) | - P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM), - .cntr = { {4, 5, -1}, {6, 7, -1} }, - }, - [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = { - .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE), - .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT), - .cntr = { {4, 5, -1}, {6, 7, -1} }, - }, - [P4_EVENT_RETIRED_BRANCH_TYPE] = { - .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE), - .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT), - .cntr = { {4, 5, -1}, {6, 7, -1} }, - }, - [P4_EVENT_RESOURCE_STALL] = { - .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL), - .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_WC_BUFFER] = { - .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER), - .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) | - P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS), - .shared = 1, - .cntr = { {8, 9, -1}, {10, 11, -1} }, - }, - [P4_EVENT_B2B_CYCLES] = { - .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = 0, - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_BNR] = { - .opcode = P4_OPCODE(P4_EVENT_BNR), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = 0, - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_SNOOP] = { - .opcode = P4_OPCODE(P4_EVENT_SNOOP), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = 0, - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_RESPONSE] = { - .opcode = P4_OPCODE(P4_EVENT_RESPONSE), - .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, - .escr_emask = 0, - .cntr = { {0, -1, -1}, {2, -1, -1} }, - }, - [P4_EVENT_FRONT_END_EVENT] = { - .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT), - .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) | - P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_EXECUTION_EVENT] = { - .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT), - .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - 
[P4_EVENT_REPLAY_EVENT] = { - .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT), - .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) | - P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_INSTR_RETIRED] = { - .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED), - .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) | - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) | - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_UOPS_RETIRED] = { - .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED), - .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) | - P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_UOP_TYPE] = { - .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE), - .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) | - P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_BRANCH_RETIRED] = { - .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED), - .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) | - P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) | - P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) | - P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_MISPRED_BRANCH_RETIRED] = { - .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), - .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_X87_ASSIST] = { - .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST), - .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) | - P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) | - P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) | - P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) | - P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_MACHINE_CLEAR] = { - .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR), - .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) | - P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) | - P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, - [P4_EVENT_INSTR_COMPLETED] = { - .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED), - .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, - .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) | - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS), - .cntr = { {12, 13, 16}, {14, 15, 17} }, - }, -}; - -#define P4_GEN_CACHE_EVENT(event, bit, metric) \ - p4_config_pack_escr(P4_ESCR_EVENT(event) | \ - P4_ESCR_EMASK_BIT(event, bit)) | \ - p4_config_pack_cccr(metric | \ - P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) - -static __initconst const u64 p4_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 
P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_PEBS_METRIC__1stl_cache_load_miss_retired), - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_PEBS_METRIC__2ndl_cache_load_miss_retired), - }, -}, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_PEBS_METRIC__dtlb_load_miss_retired), - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, - P4_PEBS_METRIC__dtlb_store_miss_retired), - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, - P4_PEBS_METRIC__none), - [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, - P4_PEBS_METRIC__none), - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(NODE) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -/* - * Because of Netburst being quite restricted in how many - * identical events may run simultaneously, we introduce event aliases, - * ie the different events which have the same functionality but - * utilize non-intersected resources (ESCR/CCCR/counter registers). - * - * This allow us to relax restrictions a bit and run two or more - * identical events together. - * - * Never set any custom internal bits such as P4_CONFIG_HT, - * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are - * either up to date automatically or not applicable at all. - */ -struct p4_event_alias { - u64 original; - u64 alternative; -} p4_event_aliases[] = { - { - /* - * Non-halted cycles can be substituted with non-sleeping cycles (see - * Intel SDM Vol3b for details). We need this alias to be able - * to run nmi-watchdog and 'perf top' (or any other user space tool - * which is interested in running PERF_COUNT_HW_CPU_CYCLES) - * simultaneously. - */ - .original = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | - P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), - .alternative = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)| - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)| - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)| - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)| - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | - P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))| - p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | - P4_CCCR_COMPARE), - }, -}; - -static u64 p4_get_alias_event(u64 config) -{ - u64 config_match; - int i; - - /* - * Only event with special mark is allowed, - * we're to be sure it didn't come as malformed - * RAW event. 
- */ - if (!(config & P4_CONFIG_ALIASABLE)) - return 0; - - config_match = config & P4_CONFIG_EVENT_ALIAS_MASK; - - for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) { - if (config_match == p4_event_aliases[i].original) { - config_match = p4_event_aliases[i].alternative; - break; - } else if (config_match == p4_event_aliases[i].alternative) { - config_match = p4_event_aliases[i].original; - break; - } - } - - if (i >= ARRAY_SIZE(p4_event_aliases)) - return 0; - - return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS); -} - -static u64 p4_general_events[PERF_COUNT_HW_MAX] = { - /* non-halted CPU clocks */ - [PERF_COUNT_HW_CPU_CYCLES] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | - P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) | - P4_CONFIG_ALIASABLE, - - /* - * retired instructions - * in a sake of simplicity we don't use the FSB tagging - */ - [PERF_COUNT_HW_INSTRUCTIONS] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) | - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | - P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)), - - /* cache hits */ - [PERF_COUNT_HW_CACHE_REFERENCES] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)), - - /* cache misses */ - [PERF_COUNT_HW_CACHE_MISSES] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | - P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)), - - /* branch instructions retired */ - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | - P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)), - - /* mispredicted branches retired */ - [PERF_COUNT_HW_BRANCH_MISSES] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) | - P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)), - - /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN): */ - [PERF_COUNT_HW_BUS_CYCLES] = - p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | - P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) | - p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE), -}; - -static struct p4_event_bind *p4_config_get_bind(u64 config) -{ - unsigned int evnt = p4_config_unpack_event(config); - struct p4_event_bind *bind = NULL; - - if (evnt < ARRAY_SIZE(p4_event_bind_map)) - bind = &p4_event_bind_map[evnt]; - - return bind; -} - -static u64 p4_pmu_event_map(int hw_event) -{ - struct p4_event_bind *bind; - unsigned int esel; - u64 config; - - config = p4_general_events[hw_event]; - bind = p4_config_get_bind(config); - esel = P4_OPCODE_ESEL(bind->opcode); - config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel)); - - return config; -} - -/* check cpu model specifics */ -static bool p4_event_match_cpu_model(unsigned int 
event_idx) -{ - /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */ - if (event_idx == P4_EVENT_INSTR_COMPLETED) { - if (boot_cpu_data.x86_model != 3 && - boot_cpu_data.x86_model != 4 && - boot_cpu_data.x86_model != 6) - return false; - } - - /* - * For info - * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2 - */ - - return true; -} - -static int p4_validate_raw_event(struct perf_event *event) -{ - unsigned int v, emask; - - /* User data may have out-of-bound event index */ - v = p4_config_unpack_event(event->attr.config); - if (v >= ARRAY_SIZE(p4_event_bind_map)) - return -EINVAL; - - /* It may be unsupported: */ - if (!p4_event_match_cpu_model(v)) - return -EINVAL; - - /* - * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as - * in Architectural Performance Monitoring, it means not - * on _which_ logical cpu to count but rather _when_, ie it - * depends on logical cpu state -- count event if one cpu active, - * none, both or any, so we just allow user to pass any value - * desired. - * - * In turn we always set Tx_OS/Tx_USR bits bound to logical - * cpu without their propagation to another cpu - */ - - /* - * if an event is shared across the logical threads - * the user needs special permissions to be able to use it - */ - if (p4_ht_active() && p4_event_bind_map[v].shared) { - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return -EACCES; - } - - /* ESCR EventMask bits may be invalid */ - emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK; - if (emask & ~p4_event_bind_map[v].escr_emask) - return -EINVAL; - - /* - * it may have some invalid PEBS bits - */ - if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) - return -EINVAL; - - v = p4_config_unpack_metric(event->attr.config); - if (v >= ARRAY_SIZE(p4_pebs_bind_map)) - return -EINVAL; - - return 0; -} - -static int p4_hw_config(struct perf_event *event) -{ - int cpu = get_cpu(); - int rc = 0; - u32 escr, cccr; - - /* - * the reason we use cpu that early is that: if we get scheduled - * first time on the same cpu -- we will not need swap thread - * specific flags in config (and will save some cpu cycles) - */ - - cccr = p4_default_cccr_conf(cpu); - escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel, - event->attr.exclude_user); - event->hw.config = p4_config_pack_escr(escr) | - p4_config_pack_cccr(cccr); - - if (p4_ht_active() && p4_ht_thread(cpu)) - event->hw.config = p4_set_ht_bit(event->hw.config); - - if (event->attr.type == PERF_TYPE_RAW) { - struct p4_event_bind *bind; - unsigned int esel; - /* - * Clear bits we reserve to be managed by kernel itself - * and never allowed from a user space - */ - event->attr.config &= P4_CONFIG_MASK; - - rc = p4_validate_raw_event(event); - if (rc) - goto out; - - /* - * Note that for RAW events we allow user to use P4_CCCR_RESERVED - * bits since we keep additional info here (for cache events and etc) - */ - event->hw.config |= event->attr.config; - bind = p4_config_get_bind(event->attr.config); - if (!bind) { - rc = -EINVAL; - goto out; - } - esel = P4_OPCODE_ESEL(bind->opcode); - event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel)); - } - - rc = x86_setup_perfctr(event); -out: - put_cpu(); - return rc; -} - -static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) -{ - u64 v; - - /* an official way for overflow indication */ - rdmsrl(hwc->config_base, v); - if (v & P4_CCCR_OVF) { - wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF); - return 1; - } - - /* - * In some circumstances the overflow might issue an NMI but 
did - * not set P4_CCCR_OVF bit. Because a counter holds a negative value - * we simply check for high bit being set, if it's cleared it means - * the counter has reached zero value and continued counting before - * real NMI signal was received: - */ - rdmsrl(hwc->event_base, v); - if (!(v & ARCH_P4_UNFLAGGED_BIT)) - return 1; - - return 0; -} - -static void p4_pmu_disable_pebs(void) -{ - /* - * FIXME - * - * It's still allowed that two threads setup same cache - * events so we can't simply clear metrics until we knew - * no one is depending on us, so we need kind of counter - * for "ReplayEvent" users. - * - * What is more complex -- RAW events, if user (for some - * reason) will pass some cache event metric with improper - * event opcode -- it's fine from hardware point of view - * but completely nonsense from "meaning" of such action. - * - * So at moment let leave metrics turned on forever -- it's - * ok for now but need to be revisited! - * - * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, 0); - * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, 0); - */ -} - -static inline void p4_pmu_disable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - /* - * If event gets disabled while counter is in overflowed - * state we need to clear P4_CCCR_OVF, otherwise interrupt get - * asserted again and again - */ - (void)wrmsrl_safe(hwc->config_base, - p4_config_unpack_cccr(hwc->config) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); -} - -static void p4_pmu_disable_all(void) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int idx; - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_event *event = cpuc->events[idx]; - if (!test_bit(idx, cpuc->active_mask)) - continue; - p4_pmu_disable_event(event); - } - - p4_pmu_disable_pebs(); -} - -/* configuration must be valid */ -static void p4_pmu_enable_pebs(u64 config) -{ - struct p4_pebs_bind *bind; - unsigned int idx; - - BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK); - - idx = p4_config_unpack_metric(config); - if (idx == P4_PEBS_METRIC__none) - return; - - bind = &p4_pebs_bind_map[idx]; - - (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); - (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); -} - -static void p4_pmu_enable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - int thread = p4_ht_config_thread(hwc->config); - u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); - unsigned int idx = p4_config_unpack_event(hwc->config); - struct p4_event_bind *bind; - u64 escr_addr, cccr; - - bind = &p4_event_bind_map[idx]; - escr_addr = bind->escr_msr[thread]; - - /* - * - we dont support cascaded counters yet - * - and counter 1 is broken (erratum) - */ - WARN_ON_ONCE(p4_is_event_cascaded(hwc->config)); - WARN_ON_ONCE(hwc->idx == 1); - - /* we need a real Event value */ - escr_conf &= ~P4_ESCR_EVENT_MASK; - escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode)); - - cccr = p4_config_unpack_cccr(hwc->config); - - /* - * it could be Cache event so we need to write metrics - * into additional MSRs - */ - p4_pmu_enable_pebs(hwc->config); - - (void)wrmsrl_safe(escr_addr, escr_conf); - (void)wrmsrl_safe(hwc->config_base, - (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); -} - -static void p4_pmu_enable_all(int added) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int idx; - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_event *event = cpuc->events[idx]; - if (!test_bit(idx, 
cpuc->active_mask)) - continue; - p4_pmu_enable_event(event); - } -} - -static int p4_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - struct perf_event *event; - struct hw_perf_event *hwc; - int idx, handled = 0; - u64 val; - - cpuc = this_cpu_ptr(&cpu_hw_events); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - int overflow; - - if (!test_bit(idx, cpuc->active_mask)) { - /* catch in-flight IRQs */ - if (__test_and_clear_bit(idx, cpuc->running)) - handled++; - continue; - } - - event = cpuc->events[idx]; - hwc = &event->hw; - - WARN_ON_ONCE(hwc->idx != idx); - - /* it might be unflagged overflow */ - overflow = p4_pmu_clear_cccr_ovf(hwc); - - val = x86_perf_event_update(event); - if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) - continue; - - handled += overflow; - - /* event overflow for sure */ - perf_sample_data_init(&data, 0, hwc->last_period); - - if (!x86_perf_event_set_period(event)) - continue; - - - if (perf_event_overflow(event, &data, regs)) - x86_pmu_stop(event, 0); - } - - if (handled) - inc_irq_stat(apic_perf_irqs); - - /* - * When dealing with the unmasking of the LVTPC on P4 perf hw, it has - * been observed that the OVF bit flag has to be cleared first _before_ - * the LVTPC can be unmasked. - * - * The reason is the NMI line will continue to be asserted while the OVF - * bit is set. This causes a second NMI to generate if the LVTPC is - * unmasked before the OVF bit is cleared, leading to unknown NMI - * messages. - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - - return handled; -} - -/* - * swap thread specific fields according to a thread - * we are going to run on - */ -static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu) -{ - u32 escr, cccr; - - /* - * we either lucky and continue on same cpu or no HT support - */ - if (!p4_should_swap_ts(hwc->config, cpu)) - return; - - /* - * the event is migrated from an another logical - * cpu, so we need to swap thread specific flags - */ - - escr = p4_config_unpack_escr(hwc->config); - cccr = p4_config_unpack_cccr(hwc->config); - - if (p4_ht_thread(cpu)) { - cccr &= ~P4_CCCR_OVF_PMI_T0; - cccr |= P4_CCCR_OVF_PMI_T1; - if (escr & P4_ESCR_T0_OS) { - escr &= ~P4_ESCR_T0_OS; - escr |= P4_ESCR_T1_OS; - } - if (escr & P4_ESCR_T0_USR) { - escr &= ~P4_ESCR_T0_USR; - escr |= P4_ESCR_T1_USR; - } - hwc->config = p4_config_pack_escr(escr); - hwc->config |= p4_config_pack_cccr(cccr); - hwc->config |= P4_CONFIG_HT; - } else { - cccr &= ~P4_CCCR_OVF_PMI_T1; - cccr |= P4_CCCR_OVF_PMI_T0; - if (escr & P4_ESCR_T1_OS) { - escr &= ~P4_ESCR_T1_OS; - escr |= P4_ESCR_T0_OS; - } - if (escr & P4_ESCR_T1_USR) { - escr &= ~P4_ESCR_T1_USR; - escr |= P4_ESCR_T0_USR; - } - hwc->config = p4_config_pack_escr(escr); - hwc->config |= p4_config_pack_cccr(cccr); - hwc->config &= ~P4_CONFIG_HT; - } -} - -/* - * ESCR address hashing is tricky, ESCRs are not sequential - * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and - * the metric between any ESCRs is laid in range [0xa0,0xe1] - * - * so we make ~70% filled hashtable - */ - -#define P4_ESCR_MSR_BASE 0x000003a0 -#define P4_ESCR_MSR_MAX 0x000003e1 -#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1) -#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE) -#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr - -static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = { - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1), - 
P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0), - P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1), -}; - -static int p4_get_escr_idx(unsigned int addr) -{ - unsigned int idx = P4_ESCR_MSR_IDX(addr); - - if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE || - !p4_escr_table[idx] || - p4_escr_table[idx] != addr)) { - WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr); - return -1; - } - - return idx; -} - -static int p4_next_cntr(int thread, unsigned long *used_mask, - struct p4_event_bind *bind) -{ - int i, j; - - for (i = 0; i < P4_CNTR_LIMIT; i++) { - j = bind->cntr[thread][i]; - if (j != -1 && !test_bit(j, used_mask)) - return j; - } - - return -1; -} - -static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) -{ - unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)]; - int cpu = smp_processor_id(); - struct hw_perf_event *hwc; - struct p4_event_bind *bind; - unsigned int i, thread, num; - int cntr_idx, escr_idx; - u64 config_alias; - int pass; - - bitmap_zero(used_mask, X86_PMC_IDX_MAX); - bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); - - for (i = 0, num = n; i < n; i++, num--) { - - hwc = &cpuc->event_list[i]->hw; - thread = p4_ht_thread(cpu); - pass = 0; - -again: - /* - * It's possible to hit a circular lock - * between original and alternative events - * if both are scheduled already. 
- */ - if (pass > 2) - goto done; - - bind = p4_config_get_bind(hwc->config); - escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); - if (unlikely(escr_idx == -1)) - goto done; - - if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) { - cntr_idx = hwc->idx; - if (assign) - assign[i] = hwc->idx; - goto reserve; - } - - cntr_idx = p4_next_cntr(thread, used_mask, bind); - if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) { - /* - * Check whether an event alias is still available. - */ - config_alias = p4_get_alias_event(hwc->config); - if (!config_alias) - goto done; - hwc->config = config_alias; - pass++; - goto again; - } - /* - * Perf does test runs to see if a whole group can be assigned - * together succesfully. There can be multiple rounds of this. - * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config - * bits, such that the next round of group assignments will - * cause the above p4_should_swap_ts to pass instead of fail. - * This leads to counters exclusive to thread0 being used by - * thread1. - * - * Solve this with a cheap hack, reset the idx back to -1 to - * force a new lookup (p4_next_cntr) to get the right counter - * for the right thread. - * - * This probably doesn't comply with the general spirit of how - * perf wants to work, but P4 is special. :-( - */ - if (p4_should_swap_ts(hwc->config, cpu)) - hwc->idx = -1; - p4_pmu_swap_config_ts(hwc, cpu); - if (assign) - assign[i] = cntr_idx; -reserve: - set_bit(cntr_idx, used_mask); - set_bit(escr_idx, escr_mask); - } - -done: - return num ? -EINVAL : 0; -} - -PMU_FORMAT_ATTR(cccr, "config:0-31" ); -PMU_FORMAT_ATTR(escr, "config:32-62"); -PMU_FORMAT_ATTR(ht, "config:63" ); - -static struct attribute *intel_p4_formats_attr[] = { - &format_attr_cccr.attr, - &format_attr_escr.attr, - &format_attr_ht.attr, - NULL, -}; - -static __initconst const struct x86_pmu p4_pmu = { - .name = "Netburst P4/Xeon", - .handle_irq = p4_pmu_handle_irq, - .disable_all = p4_pmu_disable_all, - .enable_all = p4_pmu_enable_all, - .enable = p4_pmu_enable_event, - .disable = p4_pmu_disable_event, - .eventsel = MSR_P4_BPU_CCCR0, - .perfctr = MSR_P4_BPU_PERFCTR0, - .event_map = p4_pmu_event_map, - .max_events = ARRAY_SIZE(p4_general_events), - .get_event_constraints = x86_get_event_constraints, - /* - * IF HT disabled we may need to use all - * ARCH_P4_MAX_CCCR counters simulaneously - * though leave it restricted at moment assuming - * HT is on - */ - .num_counters = ARCH_P4_MAX_CCCR, - .apic = 1, - .cntval_bits = ARCH_P4_CNTRVAL_BITS, - .cntval_mask = ARCH_P4_CNTRVAL_MASK, - .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, - .hw_config = p4_hw_config, - .schedule_events = p4_pmu_schedule_events, - /* - * This handles erratum N15 in intel doc 249199-029, - * the counter may not be updated correctly on write - * so we need a second write operation to do the trick - * (the official workaround didn't work) - * - * the former idea is taken from OProfile code - */ - .perfctr_second_write = 1, - - .format_attrs = intel_p4_formats_attr, -}; - -__init int p4_pmu_init(void) -{ - unsigned int low, high; - int i, reg; - - /* If we get stripped -- indexing fails */ - BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC); - - rdmsr(MSR_IA32_MISC_ENABLE, low, high); - if (!(low & (1 << 7))) { - pr_cont("unsupported Netburst CPU model %d ", - boot_cpu_data.x86_model); - return -ENODEV; - } - - memcpy(hw_cache_event_ids, p4_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - pr_cont("Netburst events, "); - - x86_pmu = p4_pmu; - - /* - * Even 
though the counters are configured to interrupt a particular - * logical processor when an overflow happens, testing has shown that - * on kdump kernels (which uses a single cpu), thread1's counter - * continues to run and will report an NMI on thread0. Due to the - * overflow bug, this leads to a stream of unknown NMIs. - * - * Solve this by zero'ing out the registers to mimic a reset. - */ - for (i = 0; i < x86_pmu.num_counters; i++) { - reg = x86_pmu_config_addr(i); - wrmsrl_safe(reg, 0ULL); - } - - return 0; -} -- cgit v0.10.2 From 5e865ed44b39fa991a8fb1d7d9c0338f8fedac33 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:21 +0100 Subject: perf/x86: Move perf_event_p6.c ............... => x86/events/intel/p6.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-16-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 5a6f20d..b4e80d02 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/knc.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/lbr.o intel/p4.o intel/pt.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/lbr.o intel/p4.o intel/p6.o intel/pt.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/rapl.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c new file mode 100644 index 0000000..ee5c4e8 --- /dev/null +++ b/arch/x86/events/intel/p6.c @@ -0,0 +1,279 @@ +#include +#include + +#include "../../kernel/cpu/perf_event.h" + +/* + * Not sure about some of these + */ +static const u64 p6_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, /* CPU_CLK_UNHALTED */ + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, /* INST_RETIRED */ + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, /* L2_RQSTS:M:E:S:I */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, /* L2_RQSTS:I */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, /* BR_INST_RETIRED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, /* BR_MISS_PRED_RETIRED */ + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, /* BUS_DRDY_CLOCKS */ + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2, /* RESOURCE_STALLS */ + +}; + +static const u64 __initconst p6_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ + [ C(RESULT_MISS) ] = 0x0045, /* DCU_LINES_IN */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0f29, /* L2_LD:M:E:S:I */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ + [ C(RESULT_MISS) ] = 0x0f28, /* L2_IFETCH:M:E:S:I */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ 
C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0025, /* L2_M_LINES_INM */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ + [ C(RESULT_MISS) ] = 0x0085, /* ITLB_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED */ + [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISS_PRED_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static u64 p6_pmu_event_map(int hw_event) +{ + return p6_perfmon_event_map[hw_event]; +} + +/* + * Event setting that is specified not to count anything. + * We use this to effectively disable a counter. + * + * L2_RQSTS with 0 MESI unit mask. + */ +#define P6_NOP_EVENT 0x0000002EULL + +static struct event_constraint p6_event_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT_END +}; + +static void p6_pmu_disable_all(void) +{ + u64 val; + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static void p6_pmu_enable_all(int added) +{ + unsigned long val; + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static inline void +p6_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 val = P6_NOP_EVENT; + + (void)wrmsrl_safe(hwc->config_base, val); +} + +static void p6_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 val; + + val = hwc->config; + + /* + * p6 only has a global event enable, set on PerfEvtSel0 + * We "disable" events by programming P6_NOP_EVENT + * and we rely on p6_pmu_enable_all() being called + * to actually enable the events. 
+ */ + + (void)wrmsrl_safe(hwc->config_base, val); +} + +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(pc, "config:19" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *intel_p6_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +static __initconst const struct x86_pmu p6_pmu = { + .name = "p6", + .handle_irq = x86_pmu_handle_irq, + .disable_all = p6_pmu_disable_all, + .enable_all = p6_pmu_enable_all, + .enable = p6_pmu_enable_event, + .disable = p6_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_P6_EVNTSEL0, + .perfctr = MSR_P6_PERFCTR0, + .event_map = p6_pmu_event_map, + .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, + .max_period = (1ULL << 31) - 1, + .version = 0, + .num_counters = 2, + /* + * Events have 40 bits implemented. However they are designed such + * that bits [32-39] are sign extensions of bit 31. As such the + * effective width of a event for P6-like PMU is 32 bits only. + * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ + .cntval_bits = 32, + .cntval_mask = (1ULL << 32) - 1, + .get_event_constraints = x86_get_event_constraints, + .event_constraints = p6_event_constraints, + + .format_attrs = intel_p6_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, + +}; + +static __init void p6_pmu_rdpmc_quirk(void) +{ + if (boot_cpu_data.x86_mask < 9) { + /* + * PPro erratum 26; fixed in stepping 9 and above. + */ + pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n"); + x86_pmu.attr_rdpmc_broken = 1; + x86_pmu.attr_rdpmc = 0; + } +} + +__init int p6_pmu_init(void) +{ + x86_pmu = p6_pmu; + + switch (boot_cpu_data.x86_model) { + case 1: /* Pentium Pro */ + x86_add_quirk(p6_pmu_rdpmc_quirk); + break; + + case 3: /* Pentium II - Klamath */ + case 5: /* Pentium II - Deschutes */ + case 6: /* Pentium II - Mendocino */ + break; + + case 7: /* Pentium III - Katmai */ + case 8: /* Pentium III - Coppermine */ + case 10: /* Pentium III Xeon */ + case 11: /* Pentium III - Tualatin */ + break; + + case 9: /* Pentium M - Banias */ + case 13: /* Pentium M - Dothan */ + break; + + default: + pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model); + return -ENODEV; + } + + memcpy(hw_cache_event_ids, p6_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; +} diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 2bb41d0..5cc09fc 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -31,8 +31,6 @@ obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o ifdef CONFIG_PERF_EVENTS -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o - obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o endif diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c deleted file mode 100644 index 7c1a0c0..0000000 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ /dev/null @@ -1,279 +0,0 @@ -#include -#include - -#include "perf_event.h" - -/* - * Not sure about some of these - */ -static const u64 p6_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, /* CPU_CLK_UNHALTED */ - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, /* INST_RETIRED */ - 
[PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, /* L2_RQSTS:M:E:S:I */ - [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, /* L2_RQSTS:I */ - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, /* BR_INST_RETIRED */ - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, /* BR_MISS_PRED_RETIRED */ - [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, /* BUS_DRDY_CLOCKS */ - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2, /* RESOURCE_STALLS */ - -}; - -static const u64 __initconst p6_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ - [ C(RESULT_MISS) ] = 0x0045, /* DCU_LINES_IN */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0x0f29, /* L2_LD:M:E:S:I */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ - [ C(RESULT_MISS) ] = 0x0f28, /* L2_IFETCH:M:E:S:I */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0x0025, /* L2_M_LINES_INM */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ - [ C(RESULT_MISS) ] = 0x0085, /* ITLB_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED */ - [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISS_PRED_RETIRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static u64 p6_pmu_event_map(int hw_event) -{ - return p6_perfmon_event_map[hw_event]; -} - -/* - * Event setting that is specified not to count anything. - * We use this to effectively disable a counter. - * - * L2_RQSTS with 0 MESI unit mask. 
- */ -#define P6_NOP_EVENT 0x0000002EULL - -static struct event_constraint p6_event_constraints[] = -{ - INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ - INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ - INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ - INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ - INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ - EVENT_CONSTRAINT_END -}; - -static void p6_pmu_disable_all(void) -{ - u64 val; - - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} - -static void p6_pmu_enable_all(int added) -{ - unsigned long val; - - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} - -static inline void -p6_pmu_disable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 val = P6_NOP_EVENT; - - (void)wrmsrl_safe(hwc->config_base, val); -} - -static void p6_pmu_enable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 val; - - val = hwc->config; - - /* - * p6 only has a global event enable, set on PerfEvtSel0 - * We "disable" events by programming P6_NOP_EVENT - * and we rely on p6_pmu_enable_all() being called - * to actually enable the events. - */ - - (void)wrmsrl_safe(hwc->config_base, val); -} - -PMU_FORMAT_ATTR(event, "config:0-7" ); -PMU_FORMAT_ATTR(umask, "config:8-15" ); -PMU_FORMAT_ATTR(edge, "config:18" ); -PMU_FORMAT_ATTR(pc, "config:19" ); -PMU_FORMAT_ATTR(inv, "config:23" ); -PMU_FORMAT_ATTR(cmask, "config:24-31" ); - -static struct attribute *intel_p6_formats_attr[] = { - &format_attr_event.attr, - &format_attr_umask.attr, - &format_attr_edge.attr, - &format_attr_pc.attr, - &format_attr_inv.attr, - &format_attr_cmask.attr, - NULL, -}; - -static __initconst const struct x86_pmu p6_pmu = { - .name = "p6", - .handle_irq = x86_pmu_handle_irq, - .disable_all = p6_pmu_disable_all, - .enable_all = p6_pmu_enable_all, - .enable = p6_pmu_enable_event, - .disable = p6_pmu_disable_event, - .hw_config = x86_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_P6_EVNTSEL0, - .perfctr = MSR_P6_PERFCTR0, - .event_map = p6_pmu_event_map, - .max_events = ARRAY_SIZE(p6_perfmon_event_map), - .apic = 1, - .max_period = (1ULL << 31) - 1, - .version = 0, - .num_counters = 2, - /* - * Events have 40 bits implemented. However they are designed such - * that bits [32-39] are sign extensions of bit 31. As such the - * effective width of a event for P6-like PMU is 32 bits only. - * - * See IA-32 Intel Architecture Software developer manual Vol 3B - */ - .cntval_bits = 32, - .cntval_mask = (1ULL << 32) - 1, - .get_event_constraints = x86_get_event_constraints, - .event_constraints = p6_event_constraints, - - .format_attrs = intel_p6_formats_attr, - .events_sysfs_show = intel_event_sysfs_show, - -}; - -static __init void p6_pmu_rdpmc_quirk(void) -{ - if (boot_cpu_data.x86_mask < 9) { - /* - * PPro erratum 26; fixed in stepping 9 and above. 
- */ - pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n"); - x86_pmu.attr_rdpmc_broken = 1; - x86_pmu.attr_rdpmc = 0; - } -} - -__init int p6_pmu_init(void) -{ - x86_pmu = p6_pmu; - - switch (boot_cpu_data.x86_model) { - case 1: /* Pentium Pro */ - x86_add_quirk(p6_pmu_rdpmc_quirk); - break; - - case 3: /* Pentium II - Klamath */ - case 5: /* Pentium II - Deschutes */ - case 6: /* Pentium II - Mendocino */ - break; - - case 7: /* Pentium III - Katmai */ - case 8: /* Pentium III - Coppermine */ - case 10: /* Pentium III Xeon */ - case 11: /* Pentium III - Tualatin */ - break; - - case 9: /* Pentium M - Banias */ - case 13: /* Pentium M - Dothan */ - break; - - default: - pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model); - return -ENODEV; - } - - memcpy(hw_cache_event_ids, p6_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - return 0; -} -- cgit v0.10.2 From 65a27a3510c82f16e673548b4c819462fabb12ae Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:22 +0100 Subject: perf/x86: Move perf_event_msr.c .............. => x86/events/msr.c Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-17-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index b4e80d02..fdfea15 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,13 +1,13 @@ obj-y += core.o obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o amd/uncore.o -obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o +obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o msr.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o endif obj-$(CONFIG_CPU_SUP_INTEL) += intel/core.o intel/bts.o intel/cqm.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/cstate.o intel/ds.o intel/knc.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/lbr.o intel/p4.o intel/p6.o intel/pt.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel/rapl.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel/rapl.o msr.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore.o intel/uncore_nhmex.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel/uncore_snb.o intel/uncore_snbep.o diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c new file mode 100644 index 0000000..ec863b9 --- /dev/null +++ b/arch/x86/events/msr.c @@ -0,0 +1,241 @@ +#include + +enum perf_msr_id { + PERF_MSR_TSC = 0, + PERF_MSR_APERF = 1, + PERF_MSR_MPERF = 2, + PERF_MSR_PPERF = 3, + PERF_MSR_SMI = 4, + + PERF_MSR_EVENT_MAX, +}; + +static bool test_aperfmperf(int idx) +{ + return boot_cpu_has(X86_FEATURE_APERFMPERF); +} + +static bool test_intel(int idx) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || + boot_cpu_data.x86 != 6) + return false; + + switch (boot_cpu_data.x86_model) { + case 30: /* 45nm Nehalem */ + case 26: /* 45nm Nehalem-EP */ + case 46: /* 45nm Nehalem-EX */ + + case 37: /* 32nm Westmere */ + case 44: /* 32nm Westmere-EP */ + case 47: /* 32nm Westmere-EX */ + + case 42: /* 32nm SandyBridge */ + case 45: /* 32nm SandyBridge-E/EN/EP */ + + case 58: /* 22nm IvyBridge */ + case 62: /* 22nm IvyBridge-EP/EX */ + + case 60: /* 22nm Haswell Core */ + case 63: /* 22nm Haswell Server */ + case 69: /* 22nm Haswell ULT */ + case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */ + + case 61: /* 14nm Broadwell Core-M */ + case 86: /* 14nm Broadwell Xeon D */ + case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ + case 79: /* 
14nm Broadwell Server */ + + case 55: /* 22nm Atom "Silvermont" */ + case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ + case 76: /* 14nm Atom "Airmont" */ + if (idx == PERF_MSR_SMI) + return true; + break; + + case 78: /* 14nm Skylake Mobile */ + case 94: /* 14nm Skylake Desktop */ + if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) + return true; + break; + } + + return false; +} + +struct perf_msr { + u64 msr; + struct perf_pmu_events_attr *attr; + bool (*test)(int idx); +}; + +PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00"); +PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); +PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); +PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); +PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); + +static struct perf_msr msr[] = { + [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, + [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, + [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, + [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, + [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, +}; + +static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { + NULL, +}; + +static struct attribute_group events_attr_group = { + .name = "events", + .attrs = events_attrs, +}; + +PMU_FORMAT_ATTR(event, "config:0-63"); +static struct attribute *format_attrs[] = { + &format_attr_event.attr, + NULL, +}; +static struct attribute_group format_attr_group = { + .name = "format", + .attrs = format_attrs, +}; + +static const struct attribute_group *attr_groups[] = { + &events_attr_group, + &format_attr_group, + NULL, +}; + +static int msr_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + if (cfg >= PERF_MSR_EVENT_MAX) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + if (!msr[cfg].attr) + return -EINVAL; + + event->hw.idx = -1; + event->hw.event_base = msr[cfg].msr; + event->hw.config = cfg; + + return 0; +} + +static inline u64 msr_read_counter(struct perf_event *event) +{ + u64 now; + + if (event->hw.event_base) + rdmsrl(event->hw.event_base, now); + else + rdtscll(now); + + return now; +} +static void msr_event_update(struct perf_event *event) +{ + u64 prev, now; + s64 delta; + + /* Careful, an NMI might modify the previous event value. 
*/ +again: + prev = local64_read(&event->hw.prev_count); + now = msr_read_counter(event); + + if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev) + goto again; + + delta = now - prev; + if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) + delta = sign_extend64(delta, 31); + + local64_add(now - prev, &event->count); +} + +static void msr_event_start(struct perf_event *event, int flags) +{ + u64 now; + + now = msr_read_counter(event); + local64_set(&event->hw.prev_count, now); +} + +static void msr_event_stop(struct perf_event *event, int flags) +{ + msr_event_update(event); +} + +static void msr_event_del(struct perf_event *event, int flags) +{ + msr_event_stop(event, PERF_EF_UPDATE); +} + +static int msr_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + msr_event_start(event, flags); + + return 0; +} + +static struct pmu pmu_msr = { + .task_ctx_nr = perf_sw_context, + .attr_groups = attr_groups, + .event_init = msr_event_init, + .add = msr_event_add, + .del = msr_event_del, + .start = msr_event_start, + .stop = msr_event_stop, + .read = msr_event_update, + .capabilities = PERF_PMU_CAP_NO_INTERRUPT, +}; + +static int __init msr_init(void) +{ + int i, j = 0; + + if (!boot_cpu_has(X86_FEATURE_TSC)) { + pr_cont("no MSR PMU driver.\n"); + return 0; + } + + /* Probe the MSRs. */ + for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) { + u64 val; + + /* + * Virt sucks arse; you cannot tell if a R/O MSR is present :/ + */ + if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) + msr[i].attr = NULL; + } + + /* List remaining MSRs in the sysfs attrs. */ + for (i = 0; i < PERF_MSR_EVENT_MAX; i++) { + if (msr[i].attr) + events_attrs[j++] = &msr[i].attr->attr.attr; + } + events_attrs[j] = NULL; + + perf_pmu_register(&pmu_msr, "msr", -1); + + return 0; +} +device_initcall(msr_init); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 5cc09fc..7a60424 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -30,12 +30,6 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -ifdef CONFIG_PERF_EVENTS -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o -obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o -endif - - obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_MICROCODE) += microcode/ diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c deleted file mode 100644 index ec863b9..0000000 --- a/arch/x86/kernel/cpu/perf_event_msr.c +++ /dev/null @@ -1,241 +0,0 @@ -#include - -enum perf_msr_id { - PERF_MSR_TSC = 0, - PERF_MSR_APERF = 1, - PERF_MSR_MPERF = 2, - PERF_MSR_PPERF = 3, - PERF_MSR_SMI = 4, - - PERF_MSR_EVENT_MAX, -}; - -static bool test_aperfmperf(int idx) -{ - return boot_cpu_has(X86_FEATURE_APERFMPERF); -} - -static bool test_intel(int idx) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 != 6) - return false; - - switch (boot_cpu_data.x86_model) { - case 30: /* 45nm Nehalem */ - case 26: /* 45nm Nehalem-EP */ - case 46: /* 45nm Nehalem-EX */ - - case 37: /* 32nm Westmere */ - case 44: /* 32nm Westmere-EP */ - case 47: /* 32nm Westmere-EX */ - - case 42: /* 32nm SandyBridge */ - case 45: /* 32nm SandyBridge-E/EN/EP */ - - case 58: /* 22nm IvyBridge */ - case 62: /* 22nm IvyBridge-EP/EX */ - - case 60: /* 22nm Haswell Core */ - case 63: /* 22nm Haswell Server */ - case 69: /* 22nm Haswell ULT */ - case 70: /* 22nm Haswell + GT3e (Intel Iris Pro 
graphics) */ - - case 61: /* 14nm Broadwell Core-M */ - case 86: /* 14nm Broadwell Xeon D */ - case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */ - case 79: /* 14nm Broadwell Server */ - - case 55: /* 22nm Atom "Silvermont" */ - case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ - case 76: /* 14nm Atom "Airmont" */ - if (idx == PERF_MSR_SMI) - return true; - break; - - case 78: /* 14nm Skylake Mobile */ - case 94: /* 14nm Skylake Desktop */ - if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) - return true; - break; - } - - return false; -} - -struct perf_msr { - u64 msr; - struct perf_pmu_events_attr *attr; - bool (*test)(int idx); -}; - -PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00"); -PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01"); -PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02"); -PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03"); -PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04"); - -static struct perf_msr msr[] = { - [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, - [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, - [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, - [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, - [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, -}; - -static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { - NULL, -}; - -static struct attribute_group events_attr_group = { - .name = "events", - .attrs = events_attrs, -}; - -PMU_FORMAT_ATTR(event, "config:0-63"); -static struct attribute *format_attrs[] = { - &format_attr_event.attr, - NULL, -}; -static struct attribute_group format_attr_group = { - .name = "format", - .attrs = format_attrs, -}; - -static const struct attribute_group *attr_groups[] = { - &events_attr_group, - &format_attr_group, - NULL, -}; - -static int msr_event_init(struct perf_event *event) -{ - u64 cfg = event->attr.config; - - if (event->attr.type != event->pmu->type) - return -ENOENT; - - if (cfg >= PERF_MSR_EVENT_MAX) - return -EINVAL; - - /* unsupported modes and filters */ - if (event->attr.exclude_user || - event->attr.exclude_kernel || - event->attr.exclude_hv || - event->attr.exclude_idle || - event->attr.exclude_host || - event->attr.exclude_guest || - event->attr.sample_period) /* no sampling */ - return -EINVAL; - - if (!msr[cfg].attr) - return -EINVAL; - - event->hw.idx = -1; - event->hw.event_base = msr[cfg].msr; - event->hw.config = cfg; - - return 0; -} - -static inline u64 msr_read_counter(struct perf_event *event) -{ - u64 now; - - if (event->hw.event_base) - rdmsrl(event->hw.event_base, now); - else - rdtscll(now); - - return now; -} -static void msr_event_update(struct perf_event *event) -{ - u64 prev, now; - s64 delta; - - /* Careful, an NMI might modify the previous event value. 
*/ -again: - prev = local64_read(&event->hw.prev_count); - now = msr_read_counter(event); - - if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev) - goto again; - - delta = now - prev; - if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) - delta = sign_extend64(delta, 31); - - local64_add(now - prev, &event->count); -} - -static void msr_event_start(struct perf_event *event, int flags) -{ - u64 now; - - now = msr_read_counter(event); - local64_set(&event->hw.prev_count, now); -} - -static void msr_event_stop(struct perf_event *event, int flags) -{ - msr_event_update(event); -} - -static void msr_event_del(struct perf_event *event, int flags) -{ - msr_event_stop(event, PERF_EF_UPDATE); -} - -static int msr_event_add(struct perf_event *event, int flags) -{ - if (flags & PERF_EF_START) - msr_event_start(event, flags); - - return 0; -} - -static struct pmu pmu_msr = { - .task_ctx_nr = perf_sw_context, - .attr_groups = attr_groups, - .event_init = msr_event_init, - .add = msr_event_add, - .del = msr_event_del, - .start = msr_event_start, - .stop = msr_event_stop, - .read = msr_event_update, - .capabilities = PERF_PMU_CAP_NO_INTERRUPT, -}; - -static int __init msr_init(void) -{ - int i, j = 0; - - if (!boot_cpu_has(X86_FEATURE_TSC)) { - pr_cont("no MSR PMU driver.\n"); - return 0; - } - - /* Probe the MSRs. */ - for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) { - u64 val; - - /* - * Virt sucks arse; you cannot tell if a R/O MSR is present :/ - */ - if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) - msr[i].attr = NULL; - } - - /* List remaining MSRs in the sysfs attrs. */ - for (i = 0; i < PERF_MSR_EVENT_MAX; i++) { - if (msr[i].attr) - events_attrs[j++] = &msr[i].attr->attr.attr; - } - events_attrs[j] = NULL; - - perf_pmu_register(&pmu_msr, "msr", -1); - - return 0; -} -device_initcall(msr_init); -- cgit v0.10.2 From 27f6d22b037b2be6685e0e27cce929779d634119 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 10:55:23 +0100 Subject: perf/x86: Move perf_event.h to its new home Now that all functionality has been moved to arch/x86/events/, move the perf_event.h header and adjust include paths. 
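For illustration (an editorial sketch, not part of the original commit message): the include-path adjustment that every hunk below performs reduces to swapping one relative include per file once the shared header lives at arch/x86/events/perf_event.h. Both the old and new forms are taken verbatim from the diffs that follow.

/*
 * Editorial sketch only -- it condenses the per-file include change
 * applied throughout this patch; the exact hunks are below.
 */

/* Files in the intel/ and amd/ subdirectories previously reached back
 * into arch/x86/kernel/cpu/ and now include the header one level up:
 */
#include "../perf_event.h"	/* was: "../../kernel/cpu/perf_event.h" */

/* core.c sits next to the moved header and includes it directly: */
#include "perf_event.h"		/* was: "../kernel/cpu/perf_event.h" */
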
Signed-off-by: Borislav Petkov Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1455098123-11740-18-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 51b1658..049ada8d 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -5,7 +5,7 @@ #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index a8abd08..51087c2 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -14,7 +14,7 @@ #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" static u32 ibs_caps; diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index 629bc70..635e5eb 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -16,7 +16,7 @@ #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" #include "iommu.h" #define COUNTER_SHIFT 16 diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 90ca601..7402c818 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -37,7 +37,7 @@ #include #include -#include "../kernel/cpu/perf_event.h" +#include "perf_event.h" struct x86_pmu x86_pmu __read_mostly; diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 2bd4833..b99dc92 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -26,7 +26,7 @@ #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" struct bts_ctx { struct perf_output_handle handle; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 1edf301..a7ec685 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -18,7 +18,7 @@ #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" /* * Intel PerfMon, used on Core and later. 
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index d1b623a..1b064c4 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -7,7 +7,7 @@ #include #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" #define MSR_IA32_PQR_ASSOC 0x0c8f #define MSR_IA32_QM_CTR 0x0c8e diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 1bbf37e..7946c42 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -89,7 +89,7 @@ #include #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" #define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ static ssize_t __cstate_##_var##_show(struct kobject *kobj, \ diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 9677207..c8a243d 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -5,7 +5,7 @@ #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" /* The size of a BTS record in bytes: */ #define BTS_RECORD_SIZE 24 diff --git a/arch/x86/events/intel/knc.c b/arch/x86/events/intel/knc.c index 630bcba..206226e 100644 --- a/arch/x86/events/intel/knc.c +++ b/arch/x86/events/intel/knc.c @@ -5,7 +5,7 @@ #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" static const u64 knc_perfmon_event_map[] = { diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 78c88f9..69dd118 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -5,7 +5,7 @@ #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" enum { LBR_FORMAT_32 = 0x00, diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index 1c72fed..0a5ede1 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -13,7 +13,7 @@ #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" #define P4_CNTR_LIMIT 3 /* diff --git a/arch/x86/events/intel/p6.c b/arch/x86/events/intel/p6.c index ee5c4e8..1f5c47a 100644 --- a/arch/x86/events/intel/p6.c +++ b/arch/x86/events/intel/p6.c @@ -1,7 +1,7 @@ #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" /* * Not sure about some of these diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index e56cebe..6af7cf7 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -29,7 +29,7 @@ #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" #include "pt.h" static DEFINE_PER_CPU(struct pt, pt_ctx); diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 9541f50..580f504 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -48,7 +48,7 @@ #include #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" /* * RAPL energy status counters diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 1dea204..6a1340c 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -2,7 +2,7 @@ #include #include #include -#include "../../kernel/cpu/perf_event.h" +#include "../perf_event.h" #define UNCORE_PMU_NAME_LEN 32 #define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h new file mode 100644 index 0000000..7bb61e3 --- /dev/null +++ b/arch/x86/events/perf_event.h @@ -0,0 +1,955 @@ +/* + * Performance 
events x86 architecture header + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2009 Intel Corporation, + * Copyright (C) 2009 Google, Inc., Stephane Eranian + * + * For licencing details see kernel-base/COPYING + */ + +#include + +/* To enable MSR tracing please use the generic trace points. */ + +/* + * | NHM/WSM | SNB | + * register ------------------------------- + * | HT | no HT | HT | no HT | + *----------------------------------------- + * offcore | core | core | cpu | core | + * lbr_sel | core | core | cpu | core | + * ld_lat | cpu | core | cpu | core | + *----------------------------------------- + * + * Given that there is a small number of shared regs, + * we can pre-allocate their slot in the per-cpu + * per-core reg tables. + */ +enum extra_reg_type { + EXTRA_REG_NONE = -1, /* not used */ + + EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ + EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ + EXTRA_REG_LBR = 2, /* lbr_select */ + EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */ + EXTRA_REG_FE = 4, /* fe_* */ + + EXTRA_REG_MAX /* number of entries needed */ +}; + +struct event_constraint { + union { + unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 idxmsk64; + }; + u64 code; + u64 cmask; + int weight; + int overlap; + int flags; +}; +/* + * struct hw_perf_event.flags flags + */ +#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ +#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ +#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ +#define PERF_X86_EVENT_COMMITTED 0x0008 /* event passed commit_txn */ +#define PERF_X86_EVENT_PEBS_LD_HSW 0x0010 /* haswell style datala, load */ +#define PERF_X86_EVENT_PEBS_NA_HSW 0x0020 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */ +#define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */ +#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ +#define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */ +#define PERF_X86_EVENT_AUTO_RELOAD 0x0400 /* use PEBS auto-reload */ +#define PERF_X86_EVENT_FREERUNNING 0x0800 /* use freerunning PEBS */ + + +struct amd_nb { + int nb_id; /* NorthBridge id */ + int refcnt; /* reference count */ + struct perf_event *owners[X86_PMC_IDX_MAX]; + struct event_constraint event_constraints[X86_PMC_IDX_MAX]; +}; + +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS 8 + +/* + * Flags PEBS can handle without an PMI. + * + * TID can only be handled by flushing at context switch. + * + */ +#define PEBS_FREERUNNING_FLAGS \ + (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ + PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ + PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ + PERF_SAMPLE_TRANSACTION) + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. + */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_event_reset[MAX_PEBS_EVENTS]; +}; + +/* + * Per register state. 
+ */ +struct er_account { + raw_spinlock_t lock; /* per-core: protect structure */ + u64 config; /* extra MSR config */ + u64 reg; /* extra MSR number */ + atomic_t ref; /* reference count */ +}; + +/* + * Per core/cpu state + * + * Used to coordinate shared registers between HT threads or + * among events on a single PMU. + */ +struct intel_shared_regs { + struct er_account regs[EXTRA_REG_MAX]; + int refcnt; /* per-core: #HT threads */ + unsigned core_id; /* per-core: core id */ +}; + +enum intel_excl_state_type { + INTEL_EXCL_UNUSED = 0, /* counter is unused */ + INTEL_EXCL_SHARED = 1, /* counter can be used by both threads */ + INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */ +}; + +struct intel_excl_states { + enum intel_excl_state_type state[X86_PMC_IDX_MAX]; + bool sched_started; /* true if scheduling has started */ +}; + +struct intel_excl_cntrs { + raw_spinlock_t lock; + + struct intel_excl_states states[2]; + + union { + u16 has_exclusive[2]; + u32 exclusive_present; + }; + + int refcnt; /* per-core: #HT threads */ + unsigned core_id; /* per-core: core id */ +}; + +#define MAX_LBR_ENTRIES 32 + +enum { + X86_PERF_KFREE_SHARED = 0, + X86_PERF_KFREE_EXCL = 1, + X86_PERF_KFREE_MAX +}; + +struct cpu_hw_events { + /* + * Generic x86 PMC bits + */ + struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + int enabled; + + int n_events; /* the # of events in the below arrays */ + int n_added; /* the # last events in the below arrays; + they've never been enabled yet */ + int n_txn; /* the # last events in the below arrays; + added in the current transaction */ + int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ + u64 tags[X86_PMC_IDX_MAX]; + + struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ + struct event_constraint *event_constraint[X86_PMC_IDX_MAX]; + + int n_excl; /* the number of exclusive events */ + + unsigned int txn_flags; + int is_fake; + + /* + * Intel DebugStore bits + */ + struct debug_store *ds; + u64 pebs_enabled; + + /* + * Intel LBR bits + */ + int lbr_users; + void *lbr_context; + struct perf_branch_stack lbr_stack; + struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; + struct er_account *lbr_sel; + u64 br_sel; + + /* + * Intel host/guest exclude bits + */ + u64 intel_ctrl_guest_mask; + u64 intel_ctrl_host_mask; + struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX]; + + /* + * Intel checkpoint mask + */ + u64 intel_cp_status; + + /* + * manage shared (per-core, per-cpu) registers + * used on Intel NHM/WSM/SNB + */ + struct intel_shared_regs *shared_regs; + /* + * manage exclusive counter access between hyperthread + */ + struct event_constraint *constraint_list; /* in enable order */ + struct intel_excl_cntrs *excl_cntrs; + int excl_thread_id; /* 0 or 1 */ + + /* + * AMD specific bits + */ + struct amd_nb *amd_nb; + /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ + u64 perf_ctr_virt_mask; + + void *kfree_on_online[X86_PERF_KFREE_MAX]; +}; + +#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\ + { .idxmsk64 = (n) }, \ + .code = (c), \ + .cmask = (m), \ + .weight = (w), \ + .overlap = (o), \ + .flags = f, \ +} + +#define EVENT_CONSTRAINT(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0) + +#define INTEL_EXCLEVT_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\ + 0, PERF_X86_EVENT_EXCL) + +/* + * The overlap flag 
marks event constraints with overlapping counter + * masks. This is the case if the counter mask of such an event is not + * a subset of any other counter mask of a constraint with an equal or + * higher weight, e.g.: + * + * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); + * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0); + * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0); + * + * The event scheduler may not select the correct counter in the first + * cycle because it needs to know which subsequent events will be + * scheduled. It may fail to schedule the events then. So we set the + * overlap flag for such constraints to give the scheduler a hint which + * events to select for counter rescheduling. + * + * Care must be taken as the rescheduling algorithm is O(n!) which + * will increase scheduling cycles for an over-commited system + * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros + * and its counter masks must be kept at a minimum. + */ +#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0) + +/* + * Constraint on the Event code. + */ +#define INTEL_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT) + +/* + * Constraint on the Event code + UMask + fixed-mask + * + * filter mask to validate fixed counter events. + * the following filters disqualify for fixed counters: + * - inv + * - edge + * - cnt-mask + * - in_tx + * - in_tx_checkpointed + * The other filters are supported by fixed counters. + * The any-thread option is supported starting with v3. + */ +#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED) +#define FIXED_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS) + +/* + * Constraint on the Event code + UMask + */ +#define INTEL_UEVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) + +/* Constraint on specific umask bit only + event */ +#define INTEL_UBIT_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c)) + +/* Like UEVENT_CONSTRAINT, but match flags too */ +#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) + +#define INTEL_EXCLUEVT_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ + HWEIGHT(n), 0, PERF_X86_EVENT_EXCL) + +#define INTEL_PLD_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) + +#define INTEL_PST_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST) + +/* Event constraint, but match on all event flags too. 
*/ +#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) + +/* Check only flags, but allow all event/umask */ +#define INTEL_ALL_EVENT_CONSTRAINT(code, n) \ + EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS) + +/* Check flags and event code, and set the HSW store flag */ +#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) + +/* Check flags and event code, and set the HSW load flag */ +#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) + +#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, \ + PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL) + +/* Check flags and event code/umask, and set the HSW store flag */ +#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) + +#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, \ + PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL) + +/* Check flags and event code/umask, and set the HSW load flag */ +#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) + +#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, \ + PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL) + +/* Check flags and event code/umask, and set the HSW N/A flag */ +#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \ + __EVENT_CONSTRAINT(code, n, \ + INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW) + + +/* + * We define the end marker as having a weight of -1 + * to enable blacklisting of events using a counter bitmask + * of zero and thus a weight of zero. + * The end marker has a weight that cannot possibly be + * obtained from counting the bits in the bitmask. + */ +#define EVENT_CONSTRAINT_END { .weight = -1 } + +/* + * Check for end marker with weight == -1 + */ +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->weight != -1; (e)++) + +/* + * Extra registers for specific events. + * + * Some events need large masks and require external MSRs. + * Those extra MSRs end up being shared for all events on + * a PMU and sometimes between PMU of sibling HT threads. + * In either case, the kernel needs to handle conflicting + * accesses to those extra, shared, regs. The data structure + * to manage those registers is stored in cpu_hw_event. 
+ */ +struct extra_reg { + unsigned int event; + unsigned int msr; + u64 config_mask; + u64 valid_mask; + int idx; /* per_xxx->regs[] reg index */ + bool extra_msr_access; +}; + +#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \ + .event = (e), \ + .msr = (ms), \ + .config_mask = (m), \ + .valid_mask = (vm), \ + .idx = EXTRA_REG_##i, \ + .extra_msr_access = true, \ + } + +#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \ + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx) + +#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \ + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \ + ARCH_PERFMON_EVENTSEL_UMASK, vm, idx) + +#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \ + INTEL_UEVENT_EXTRA_REG(c, \ + MSR_PEBS_LD_LAT_THRESHOLD, \ + 0xffff, \ + LDLAT) + +#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0) + +union perf_capabilities { + struct { + u64 lbr_format:6; + u64 pebs_trap:1; + u64 pebs_arch_reg:1; + u64 pebs_format:4; + u64 smm_freeze:1; + /* + * PMU supports separate counter range for writing + * values > 32bit. + */ + u64 full_width_write:1; + }; + u64 capabilities; +}; + +struct x86_pmu_quirk { + struct x86_pmu_quirk *next; + void (*func)(void); +}; + +union x86_pmu_config { + struct { + u64 event:8, + umask:8, + usr:1, + os:1, + edge:1, + pc:1, + interrupt:1, + __reserved1:1, + en:1, + inv:1, + cmask:8, + event2:4, + __reserved2:4, + go:1, + ho:1; + } bits; + u64 value; +}; + +#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value + +enum { + x86_lbr_exclusive_lbr, + x86_lbr_exclusive_bts, + x86_lbr_exclusive_pt, + x86_lbr_exclusive_max, +}; + +/* + * struct x86_pmu - generic x86 pmu + */ +struct x86_pmu { + /* + * Generic x86 PMC bits + */ + const char *name; + int version; + int (*handle_irq)(struct pt_regs *); + void (*disable_all)(void); + void (*enable_all)(int added); + void (*enable)(struct perf_event *); + void (*disable)(struct perf_event *); + int (*hw_config)(struct perf_event *event); + int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); + unsigned eventsel; + unsigned perfctr; + int (*addr_offset)(int index, bool eventsel); + int (*rdpmc_index)(int index); + u64 (*event_map)(int); + int max_events; + int num_counters; + int num_counters_fixed; + int cntval_bits; + u64 cntval_mask; + union { + unsigned long events_maskl; + unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)]; + }; + int events_mask_len; + int apic; + u64 max_period; + struct event_constraint * + (*get_event_constraints)(struct cpu_hw_events *cpuc, + int idx, + struct perf_event *event); + + void (*put_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); + + void (*start_scheduling)(struct cpu_hw_events *cpuc); + + void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr); + + void (*stop_scheduling)(struct cpu_hw_events *cpuc); + + struct event_constraint *event_constraints; + struct x86_pmu_quirk *quirks; + int perfctr_second_write; + bool late_ack; + unsigned (*limit_period)(struct perf_event *event, unsigned l); + + /* + * sysfs attrs + */ + int attr_rdpmc_broken; + int attr_rdpmc; + struct attribute **format_attrs; + struct attribute **event_attrs; + + ssize_t (*events_sysfs_show)(char *page, u64 config); + struct attribute **cpu_events; + + /* + * CPU Hotplug hooks + */ + int (*cpu_prepare)(int cpu); + void (*cpu_starting)(int cpu); + void (*cpu_dying)(int cpu); + void (*cpu_dead)(int cpu); + + void (*check_microcode)(void); + void (*sched_task)(struct perf_event_context *ctx, 
+ bool sched_in); + + /* + * Intel Arch Perfmon v2+ + */ + u64 intel_ctrl; + union perf_capabilities intel_cap; + + /* + * Intel DebugStore bits + */ + unsigned int bts :1, + bts_active :1, + pebs :1, + pebs_active :1, + pebs_broken :1, + pebs_prec_dist :1; + int pebs_record_size; + void (*drain_pebs)(struct pt_regs *regs); + struct event_constraint *pebs_constraints; + void (*pebs_aliases)(struct perf_event *event); + int max_pebs_events; + unsigned long free_running_flags; + + /* + * Intel LBR + */ + unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ + int lbr_nr; /* hardware stack size */ + u64 lbr_sel_mask; /* LBR_SELECT valid bits */ + const int *lbr_sel_map; /* lbr_select mappings */ + bool lbr_double_abort; /* duplicated lbr aborts */ + + /* + * Intel PT/LBR/BTS are exclusive + */ + atomic_t lbr_exclusive[x86_lbr_exclusive_max]; + + /* + * Extra registers for events + */ + struct extra_reg *extra_regs; + unsigned int flags; + + /* + * Intel host/guest support (KVM) + */ + struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); +}; + +struct x86_perf_task_context { + u64 lbr_from[MAX_LBR_ENTRIES]; + u64 lbr_to[MAX_LBR_ENTRIES]; + u64 lbr_info[MAX_LBR_ENTRIES]; + int tos; + int lbr_callstack_users; + int lbr_stack_state; +}; + +#define x86_add_quirk(func_) \ +do { \ + static struct x86_pmu_quirk __quirk __initdata = { \ + .func = func_, \ + }; \ + __quirk.next = x86_pmu.quirks; \ + x86_pmu.quirks = &__quirk; \ +} while (0) + +/* + * x86_pmu flags + */ +#define PMU_FL_NO_HT_SHARING 0x1 /* no hyper-threading resource sharing */ +#define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */ +#define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */ +#define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */ + +#define EVENT_VAR(_id) event_attr_##_id +#define EVENT_PTR(_id) &event_attr_##_id.attr.attr + +#define EVENT_ATTR(_name, _id) \ +static struct perf_pmu_events_attr EVENT_VAR(_id) = { \ + .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ + .id = PERF_COUNT_HW_##_id, \ + .event_str = NULL, \ +}; + +#define EVENT_ATTR_STR(_name, v, str) \ +static struct perf_pmu_events_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ + .id = 0, \ + .event_str = str, \ +}; + +extern struct x86_pmu x86_pmu __read_mostly; + +static inline bool x86_pmu_has_lbr_callstack(void) +{ + return x86_pmu.lbr_sel_map && + x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0; +} + +DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); + +int x86_perf_event_set_period(struct perf_event *event); + +/* + * Generalized hw caching related hw_event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'hw_event makes no sense on + * this CPU', any other value means the raw hw_event + * ID. + */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +extern u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; +extern u64 __read_mostly hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +u64 x86_perf_event_update(struct perf_event *event); + +static inline unsigned int x86_pmu_config_addr(int index) +{ + return x86_pmu.eventsel + (x86_pmu.addr_offset ? + x86_pmu.addr_offset(index, true) : index); +} + +static inline unsigned int x86_pmu_event_addr(int index) +{ + return x86_pmu.perfctr + (x86_pmu.addr_offset ? 
+ x86_pmu.addr_offset(index, false) : index); +} + +static inline int x86_pmu_rdpmc_index(int index) +{ + return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index; +} + +int x86_add_exclusive(unsigned int what); + +void x86_del_exclusive(unsigned int what); + +int x86_reserve_hardware(void); + +void x86_release_hardware(void); + +void hw_perf_lbr_event_destroy(struct perf_event *event); + +int x86_setup_perfctr(struct perf_event *event); + +int x86_pmu_hw_config(struct perf_event *event); + +void x86_pmu_disable_all(void); + +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, + u64 enable_mask) +{ + u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask); + + if (hwc->extra_reg.reg) + wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); + wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask); +} + +void x86_pmu_enable_all(int added); + +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int gpmax, int *assign); +int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); + +void x86_pmu_stop(struct perf_event *event, int flags); + +static inline void x86_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +void x86_pmu_enable_event(struct perf_event *event); + +int x86_pmu_handle_irq(struct pt_regs *regs); + +extern struct event_constraint emptyconstraint; + +extern struct event_constraint unconstrained; + +static inline bool kernel_ip(unsigned long ip) +{ +#ifdef CONFIG_X86_32 + return ip > PAGE_OFFSET; +#else + return (long)ip < 0; +#endif +} + +/* + * Not all PMUs provide the right context information to place the reported IP + * into full context. Specifically segment registers are typically not + * supplied. + * + * Assuming the address is a linear address (it is for IBS), we fake the CS and + * vm86 mode using the known zero-based code segment and 'fix up' the registers + * to reflect this. + * + * Intel PEBS/LBR appear to typically provide the effective address, nothing + * much we can do about that but pray and treat it like a linear address. + */ +static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) +{ + regs->cs = kernel_ip(ip) ? 
__KERNEL_CS : __USER_CS; + if (regs->flags & X86_VM_MASK) + regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK); + regs->ip = ip; +} + +ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); +ssize_t intel_event_sysfs_show(char *page, u64 config); + +struct attribute **merge_attr(struct attribute **a, struct attribute **b); + +#ifdef CONFIG_CPU_SUP_AMD + +int amd_pmu_init(void); + +#else /* CONFIG_CPU_SUP_AMD */ + +static inline int amd_pmu_init(void) +{ + return 0; +} + +#endif /* CONFIG_CPU_SUP_AMD */ + +#ifdef CONFIG_CPU_SUP_INTEL + +static inline bool intel_pmu_has_bts(struct perf_event *event) +{ + if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && + !event->attr.freq && event->hw.sample_period == 1) + return true; + + return false; +} + +int intel_pmu_save_and_restart(struct perf_event *event); + +struct event_constraint * +x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event); + +struct intel_shared_regs *allocate_shared_regs(int cpu); + +int intel_pmu_init(void); + +void init_debug_store_on_cpu(int cpu); + +void fini_debug_store_on_cpu(int cpu); + +void release_ds_buffers(void); + +void reserve_ds_buffers(void); + +extern struct event_constraint bts_constraint; + +void intel_pmu_enable_bts(u64 config); + +void intel_pmu_disable_bts(void); + +int intel_pmu_drain_bts_buffer(void); + +extern struct event_constraint intel_core2_pebs_event_constraints[]; + +extern struct event_constraint intel_atom_pebs_event_constraints[]; + +extern struct event_constraint intel_slm_pebs_event_constraints[]; + +extern struct event_constraint intel_nehalem_pebs_event_constraints[]; + +extern struct event_constraint intel_westmere_pebs_event_constraints[]; + +extern struct event_constraint intel_snb_pebs_event_constraints[]; + +extern struct event_constraint intel_ivb_pebs_event_constraints[]; + +extern struct event_constraint intel_hsw_pebs_event_constraints[]; + +extern struct event_constraint intel_skl_pebs_event_constraints[]; + +struct event_constraint *intel_pebs_constraints(struct perf_event *event); + +void intel_pmu_pebs_enable(struct perf_event *event); + +void intel_pmu_pebs_disable(struct perf_event *event); + +void intel_pmu_pebs_enable_all(void); + +void intel_pmu_pebs_disable_all(void); + +void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); + +void intel_ds_init(void); + +void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); + +void intel_pmu_lbr_reset(void); + +void intel_pmu_lbr_enable(struct perf_event *event); + +void intel_pmu_lbr_disable(struct perf_event *event); + +void intel_pmu_lbr_enable_all(bool pmi); + +void intel_pmu_lbr_disable_all(void); + +void intel_pmu_lbr_read(void); + +void intel_pmu_lbr_init_core(void); + +void intel_pmu_lbr_init_nhm(void); + +void intel_pmu_lbr_init_atom(void); + +void intel_pmu_lbr_init_snb(void); + +void intel_pmu_lbr_init_hsw(void); + +void intel_pmu_lbr_init_skl(void); + +void intel_pmu_lbr_init_knl(void); + +int intel_pmu_setup_lbr_filter(struct perf_event *event); + +void intel_pt_interrupt(void); + +int intel_bts_interrupt(void); + +void intel_bts_enable_local(void); + +void intel_bts_disable_local(void); + +int p4_pmu_init(void); + +int p6_pmu_init(void); + +int knc_pmu_init(void); + +ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page); + +static inline int is_ht_workaround_enabled(void) +{ + return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED); +} + +#else /* CONFIG_CPU_SUP_INTEL */ + +static inline void 
reserve_ds_buffers(void) +{ +} + +static inline void release_ds_buffers(void) +{ +} + +static inline int intel_pmu_init(void) +{ + return 0; +} + +static inline struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + return NULL; +} + +static inline int is_ht_workaround_enabled(void) +{ + return 0; +} +#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h deleted file mode 100644 index 7bb61e3..0000000 --- a/arch/x86/kernel/cpu/perf_event.h +++ /dev/null @@ -1,955 +0,0 @@ -/* - * Performance events x86 architecture header - * - * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2009 Jaswinder Singh Rajput - * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra - * Copyright (C) 2009 Intel Corporation, - * Copyright (C) 2009 Google, Inc., Stephane Eranian - * - * For licencing details see kernel-base/COPYING - */ - -#include - -/* To enable MSR tracing please use the generic trace points. */ - -/* - * | NHM/WSM | SNB | - * register ------------------------------- - * | HT | no HT | HT | no HT | - *----------------------------------------- - * offcore | core | core | cpu | core | - * lbr_sel | core | core | cpu | core | - * ld_lat | cpu | core | cpu | core | - *----------------------------------------- - * - * Given that there is a small number of shared regs, - * we can pre-allocate their slot in the per-cpu - * per-core reg tables. - */ -enum extra_reg_type { - EXTRA_REG_NONE = -1, /* not used */ - - EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ - EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ - EXTRA_REG_LBR = 2, /* lbr_select */ - EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */ - EXTRA_REG_FE = 4, /* fe_* */ - - EXTRA_REG_MAX /* number of entries needed */ -}; - -struct event_constraint { - union { - unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - u64 idxmsk64; - }; - u64 code; - u64 cmask; - int weight; - int overlap; - int flags; -}; -/* - * struct hw_perf_event.flags flags - */ -#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ -#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ -#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ -#define PERF_X86_EVENT_COMMITTED 0x0008 /* event passed commit_txn */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x0010 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x0020 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */ -#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ -#define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */ -#define PERF_X86_EVENT_AUTO_RELOAD 0x0400 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_FREERUNNING 0x0800 /* use freerunning PEBS */ - - -struct amd_nb { - int nb_id; /* NorthBridge id */ - int refcnt; /* reference count */ - struct perf_event *owners[X86_PMC_IDX_MAX]; - struct event_constraint event_constraints[X86_PMC_IDX_MAX]; -}; - -/* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS 8 - -/* - * Flags PEBS can handle without an PMI. - * - * TID can only be handled by flushing at context switch. 
- * - */ -#define PEBS_FREERUNNING_FLAGS \ - (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ - PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ - PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ - PERF_SAMPLE_TRANSACTION) - -/* - * A debug store configuration. - * - * We only support architectures that use 64bit fields. - */ -struct debug_store { - u64 bts_buffer_base; - u64 bts_index; - u64 bts_absolute_maximum; - u64 bts_interrupt_threshold; - u64 pebs_buffer_base; - u64 pebs_index; - u64 pebs_absolute_maximum; - u64 pebs_interrupt_threshold; - u64 pebs_event_reset[MAX_PEBS_EVENTS]; -}; - -/* - * Per register state. - */ -struct er_account { - raw_spinlock_t lock; /* per-core: protect structure */ - u64 config; /* extra MSR config */ - u64 reg; /* extra MSR number */ - atomic_t ref; /* reference count */ -}; - -/* - * Per core/cpu state - * - * Used to coordinate shared registers between HT threads or - * among events on a single PMU. - */ -struct intel_shared_regs { - struct er_account regs[EXTRA_REG_MAX]; - int refcnt; /* per-core: #HT threads */ - unsigned core_id; /* per-core: core id */ -}; - -enum intel_excl_state_type { - INTEL_EXCL_UNUSED = 0, /* counter is unused */ - INTEL_EXCL_SHARED = 1, /* counter can be used by both threads */ - INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */ -}; - -struct intel_excl_states { - enum intel_excl_state_type state[X86_PMC_IDX_MAX]; - bool sched_started; /* true if scheduling has started */ -}; - -struct intel_excl_cntrs { - raw_spinlock_t lock; - - struct intel_excl_states states[2]; - - union { - u16 has_exclusive[2]; - u32 exclusive_present; - }; - - int refcnt; /* per-core: #HT threads */ - unsigned core_id; /* per-core: core id */ -}; - -#define MAX_LBR_ENTRIES 32 - -enum { - X86_PERF_KFREE_SHARED = 0, - X86_PERF_KFREE_EXCL = 1, - X86_PERF_KFREE_MAX -}; - -struct cpu_hw_events { - /* - * Generic x86 PMC bits - */ - struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ - unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - int enabled; - - int n_events; /* the # of events in the below arrays */ - int n_added; /* the # last events in the below arrays; - they've never been enabled yet */ - int n_txn; /* the # last events in the below arrays; - added in the current transaction */ - int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ - u64 tags[X86_PMC_IDX_MAX]; - - struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ - struct event_constraint *event_constraint[X86_PMC_IDX_MAX]; - - int n_excl; /* the number of exclusive events */ - - unsigned int txn_flags; - int is_fake; - - /* - * Intel DebugStore bits - */ - struct debug_store *ds; - u64 pebs_enabled; - - /* - * Intel LBR bits - */ - int lbr_users; - void *lbr_context; - struct perf_branch_stack lbr_stack; - struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; - struct er_account *lbr_sel; - u64 br_sel; - - /* - * Intel host/guest exclude bits - */ - u64 intel_ctrl_guest_mask; - u64 intel_ctrl_host_mask; - struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX]; - - /* - * Intel checkpoint mask - */ - u64 intel_cp_status; - - /* - * manage shared (per-core, per-cpu) registers - * used on Intel NHM/WSM/SNB - */ - struct intel_shared_regs *shared_regs; - /* - * manage exclusive counter access between hyperthread - */ - struct event_constraint *constraint_list; /* in enable order */ - struct intel_excl_cntrs *excl_cntrs; - int 
excl_thread_id; /* 0 or 1 */ - - /* - * AMD specific bits - */ - struct amd_nb *amd_nb; - /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ - u64 perf_ctr_virt_mask; - - void *kfree_on_online[X86_PERF_KFREE_MAX]; -}; - -#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\ - { .idxmsk64 = (n) }, \ - .code = (c), \ - .cmask = (m), \ - .weight = (w), \ - .overlap = (o), \ - .flags = f, \ -} - -#define EVENT_CONSTRAINT(c, n, m) \ - __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0) - -#define INTEL_EXCLEVT_CONSTRAINT(c, n) \ - __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\ - 0, PERF_X86_EVENT_EXCL) - -/* - * The overlap flag marks event constraints with overlapping counter - * masks. This is the case if the counter mask of such an event is not - * a subset of any other counter mask of a constraint with an equal or - * higher weight, e.g.: - * - * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); - * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0); - * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0); - * - * The event scheduler may not select the correct counter in the first - * cycle because it needs to know which subsequent events will be - * scheduled. It may fail to schedule the events then. So we set the - * overlap flag for such constraints to give the scheduler a hint which - * events to select for counter rescheduling. - * - * Care must be taken as the rescheduling algorithm is O(n!) which - * will increase scheduling cycles for an over-commited system - * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros - * and its counter masks must be kept at a minimum. - */ -#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \ - __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0) - -/* - * Constraint on the Event code. - */ -#define INTEL_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT) - -/* - * Constraint on the Event code + UMask + fixed-mask - * - * filter mask to validate fixed counter events. - * the following filters disqualify for fixed counters: - * - inv - * - edge - * - cnt-mask - * - in_tx - * - in_tx_checkpointed - * The other filters are supported by fixed counters. - * The any-thread option is supported starting with v3. - */ -#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED) -#define FIXED_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS) - -/* - * Constraint on the Event code + UMask - */ -#define INTEL_UEVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) - -/* Constraint on specific umask bit only + event */ -#define INTEL_UBIT_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c)) - -/* Like UEVENT_CONSTRAINT, but match flags too */ -#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) - -#define INTEL_EXCLUEVT_CONSTRAINT(c, n) \ - __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ - HWEIGHT(n), 0, PERF_X86_EVENT_EXCL) - -#define INTEL_PLD_CONSTRAINT(c, n) \ - __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) - -#define INTEL_PST_CONSTRAINT(c, n) \ - __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST) - -/* Event constraint, but match on all event flags too. 
*/ -#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) - -/* Check only flags, but allow all event/umask */ -#define INTEL_ALL_EVENT_CONSTRAINT(code, n) \ - EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS) - -/* Check flags and event code, and set the HSW store flag */ -#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) - -/* Check flags and event code, and set the HSW load flag */ -#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) - -#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, \ - PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL) - -/* Check flags and event code/umask, and set the HSW store flag */ -#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) - -#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, \ - PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL) - -/* Check flags and event code/umask, and set the HSW load flag */ -#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) - -#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, \ - PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL) - -/* Check flags and event code/umask, and set the HSW N/A flag */ -#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \ - __EVENT_CONSTRAINT(code, n, \ - INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ - HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW) - - -/* - * We define the end marker as having a weight of -1 - * to enable blacklisting of events using a counter bitmask - * of zero and thus a weight of zero. - * The end marker has a weight that cannot possibly be - * obtained from counting the bits in the bitmask. - */ -#define EVENT_CONSTRAINT_END { .weight = -1 } - -/* - * Check for end marker with weight == -1 - */ -#define for_each_event_constraint(e, c) \ - for ((e) = (c); (e)->weight != -1; (e)++) - -/* - * Extra registers for specific events. - * - * Some events need large masks and require external MSRs. - * Those extra MSRs end up being shared for all events on - * a PMU and sometimes between PMU of sibling HT threads. - * In either case, the kernel needs to handle conflicting - * accesses to those extra, shared, regs. The data structure - * to manage those registers is stored in cpu_hw_event. 
- */ -struct extra_reg { - unsigned int event; - unsigned int msr; - u64 config_mask; - u64 valid_mask; - int idx; /* per_xxx->regs[] reg index */ - bool extra_msr_access; -}; - -#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \ - .event = (e), \ - .msr = (ms), \ - .config_mask = (m), \ - .valid_mask = (vm), \ - .idx = EXTRA_REG_##i, \ - .extra_msr_access = true, \ - } - -#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \ - EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx) - -#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \ - EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \ - ARCH_PERFMON_EVENTSEL_UMASK, vm, idx) - -#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \ - INTEL_UEVENT_EXTRA_REG(c, \ - MSR_PEBS_LD_LAT_THRESHOLD, \ - 0xffff, \ - LDLAT) - -#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0) - -union perf_capabilities { - struct { - u64 lbr_format:6; - u64 pebs_trap:1; - u64 pebs_arch_reg:1; - u64 pebs_format:4; - u64 smm_freeze:1; - /* - * PMU supports separate counter range for writing - * values > 32bit. - */ - u64 full_width_write:1; - }; - u64 capabilities; -}; - -struct x86_pmu_quirk { - struct x86_pmu_quirk *next; - void (*func)(void); -}; - -union x86_pmu_config { - struct { - u64 event:8, - umask:8, - usr:1, - os:1, - edge:1, - pc:1, - interrupt:1, - __reserved1:1, - en:1, - inv:1, - cmask:8, - event2:4, - __reserved2:4, - go:1, - ho:1; - } bits; - u64 value; -}; - -#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value - -enum { - x86_lbr_exclusive_lbr, - x86_lbr_exclusive_bts, - x86_lbr_exclusive_pt, - x86_lbr_exclusive_max, -}; - -/* - * struct x86_pmu - generic x86 pmu - */ -struct x86_pmu { - /* - * Generic x86 PMC bits - */ - const char *name; - int version; - int (*handle_irq)(struct pt_regs *); - void (*disable_all)(void); - void (*enable_all)(int added); - void (*enable)(struct perf_event *); - void (*disable)(struct perf_event *); - int (*hw_config)(struct perf_event *event); - int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); - unsigned eventsel; - unsigned perfctr; - int (*addr_offset)(int index, bool eventsel); - int (*rdpmc_index)(int index); - u64 (*event_map)(int); - int max_events; - int num_counters; - int num_counters_fixed; - int cntval_bits; - u64 cntval_mask; - union { - unsigned long events_maskl; - unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)]; - }; - int events_mask_len; - int apic; - u64 max_period; - struct event_constraint * - (*get_event_constraints)(struct cpu_hw_events *cpuc, - int idx, - struct perf_event *event); - - void (*put_event_constraints)(struct cpu_hw_events *cpuc, - struct perf_event *event); - - void (*start_scheduling)(struct cpu_hw_events *cpuc); - - void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr); - - void (*stop_scheduling)(struct cpu_hw_events *cpuc); - - struct event_constraint *event_constraints; - struct x86_pmu_quirk *quirks; - int perfctr_second_write; - bool late_ack; - unsigned (*limit_period)(struct perf_event *event, unsigned l); - - /* - * sysfs attrs - */ - int attr_rdpmc_broken; - int attr_rdpmc; - struct attribute **format_attrs; - struct attribute **event_attrs; - - ssize_t (*events_sysfs_show)(char *page, u64 config); - struct attribute **cpu_events; - - /* - * CPU Hotplug hooks - */ - int (*cpu_prepare)(int cpu); - void (*cpu_starting)(int cpu); - void (*cpu_dying)(int cpu); - void (*cpu_dead)(int cpu); - - void (*check_microcode)(void); - void (*sched_task)(struct perf_event_context *ctx, 
- bool sched_in); - - /* - * Intel Arch Perfmon v2+ - */ - u64 intel_ctrl; - union perf_capabilities intel_cap; - - /* - * Intel DebugStore bits - */ - unsigned int bts :1, - bts_active :1, - pebs :1, - pebs_active :1, - pebs_broken :1, - pebs_prec_dist :1; - int pebs_record_size; - void (*drain_pebs)(struct pt_regs *regs); - struct event_constraint *pebs_constraints; - void (*pebs_aliases)(struct perf_event *event); - int max_pebs_events; - unsigned long free_running_flags; - - /* - * Intel LBR - */ - unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ - int lbr_nr; /* hardware stack size */ - u64 lbr_sel_mask; /* LBR_SELECT valid bits */ - const int *lbr_sel_map; /* lbr_select mappings */ - bool lbr_double_abort; /* duplicated lbr aborts */ - - /* - * Intel PT/LBR/BTS are exclusive - */ - atomic_t lbr_exclusive[x86_lbr_exclusive_max]; - - /* - * Extra registers for events - */ - struct extra_reg *extra_regs; - unsigned int flags; - - /* - * Intel host/guest support (KVM) - */ - struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); -}; - -struct x86_perf_task_context { - u64 lbr_from[MAX_LBR_ENTRIES]; - u64 lbr_to[MAX_LBR_ENTRIES]; - u64 lbr_info[MAX_LBR_ENTRIES]; - int tos; - int lbr_callstack_users; - int lbr_stack_state; -}; - -#define x86_add_quirk(func_) \ -do { \ - static struct x86_pmu_quirk __quirk __initdata = { \ - .func = func_, \ - }; \ - __quirk.next = x86_pmu.quirks; \ - x86_pmu.quirks = &__quirk; \ -} while (0) - -/* - * x86_pmu flags - */ -#define PMU_FL_NO_HT_SHARING 0x1 /* no hyper-threading resource sharing */ -#define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */ -#define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */ -#define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */ - -#define EVENT_VAR(_id) event_attr_##_id -#define EVENT_PTR(_id) &event_attr_##_id.attr.attr - -#define EVENT_ATTR(_name, _id) \ -static struct perf_pmu_events_attr EVENT_VAR(_id) = { \ - .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ - .id = PERF_COUNT_HW_##_id, \ - .event_str = NULL, \ -}; - -#define EVENT_ATTR_STR(_name, v, str) \ -static struct perf_pmu_events_attr event_attr_##v = { \ - .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ - .id = 0, \ - .event_str = str, \ -}; - -extern struct x86_pmu x86_pmu __read_mostly; - -static inline bool x86_pmu_has_lbr_callstack(void) -{ - return x86_pmu.lbr_sel_map && - x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0; -} - -DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); - -int x86_perf_event_set_period(struct perf_event *event); - -/* - * Generalized hw caching related hw_event table, filled - * in on a per model basis. A value of 0 means - * 'not supported', -1 means 'hw_event makes no sense on - * this CPU', any other value means the raw hw_event - * ID. - */ - -#define C(x) PERF_COUNT_HW_CACHE_##x - -extern u64 __read_mostly hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; -extern u64 __read_mostly hw_cache_extra_regs - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; - -u64 x86_perf_event_update(struct perf_event *event); - -static inline unsigned int x86_pmu_config_addr(int index) -{ - return x86_pmu.eventsel + (x86_pmu.addr_offset ? - x86_pmu.addr_offset(index, true) : index); -} - -static inline unsigned int x86_pmu_event_addr(int index) -{ - return x86_pmu.perfctr + (x86_pmu.addr_offset ? 
- x86_pmu.addr_offset(index, false) : index); -} - -static inline int x86_pmu_rdpmc_index(int index) -{ - return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index; -} - -int x86_add_exclusive(unsigned int what); - -void x86_del_exclusive(unsigned int what); - -int x86_reserve_hardware(void); - -void x86_release_hardware(void); - -void hw_perf_lbr_event_destroy(struct perf_event *event); - -int x86_setup_perfctr(struct perf_event *event); - -int x86_pmu_hw_config(struct perf_event *event); - -void x86_pmu_disable_all(void); - -static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, - u64 enable_mask) -{ - u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask); - - if (hwc->extra_reg.reg) - wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); - wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask); -} - -void x86_pmu_enable_all(int added); - -int perf_assign_events(struct event_constraint **constraints, int n, - int wmin, int wmax, int gpmax, int *assign); -int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); - -void x86_pmu_stop(struct perf_event *event, int flags); - -static inline void x86_pmu_disable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - wrmsrl(hwc->config_base, hwc->config); -} - -void x86_pmu_enable_event(struct perf_event *event); - -int x86_pmu_handle_irq(struct pt_regs *regs); - -extern struct event_constraint emptyconstraint; - -extern struct event_constraint unconstrained; - -static inline bool kernel_ip(unsigned long ip) -{ -#ifdef CONFIG_X86_32 - return ip > PAGE_OFFSET; -#else - return (long)ip < 0; -#endif -} - -/* - * Not all PMUs provide the right context information to place the reported IP - * into full context. Specifically segment registers are typically not - * supplied. - * - * Assuming the address is a linear address (it is for IBS), we fake the CS and - * vm86 mode using the known zero-based code segment and 'fix up' the registers - * to reflect this. - * - * Intel PEBS/LBR appear to typically provide the effective address, nothing - * much we can do about that but pray and treat it like a linear address. - */ -static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) -{ - regs->cs = kernel_ip(ip) ? 
__KERNEL_CS : __USER_CS; - if (regs->flags & X86_VM_MASK) - regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK); - regs->ip = ip; -} - -ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); -ssize_t intel_event_sysfs_show(char *page, u64 config); - -struct attribute **merge_attr(struct attribute **a, struct attribute **b); - -#ifdef CONFIG_CPU_SUP_AMD - -int amd_pmu_init(void); - -#else /* CONFIG_CPU_SUP_AMD */ - -static inline int amd_pmu_init(void) -{ - return 0; -} - -#endif /* CONFIG_CPU_SUP_AMD */ - -#ifdef CONFIG_CPU_SUP_INTEL - -static inline bool intel_pmu_has_bts(struct perf_event *event) -{ - if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && - !event->attr.freq && event->hw.sample_period == 1) - return true; - - return false; -} - -int intel_pmu_save_and_restart(struct perf_event *event); - -struct event_constraint * -x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, - struct perf_event *event); - -struct intel_shared_regs *allocate_shared_regs(int cpu); - -int intel_pmu_init(void); - -void init_debug_store_on_cpu(int cpu); - -void fini_debug_store_on_cpu(int cpu); - -void release_ds_buffers(void); - -void reserve_ds_buffers(void); - -extern struct event_constraint bts_constraint; - -void intel_pmu_enable_bts(u64 config); - -void intel_pmu_disable_bts(void); - -int intel_pmu_drain_bts_buffer(void); - -extern struct event_constraint intel_core2_pebs_event_constraints[]; - -extern struct event_constraint intel_atom_pebs_event_constraints[]; - -extern struct event_constraint intel_slm_pebs_event_constraints[]; - -extern struct event_constraint intel_nehalem_pebs_event_constraints[]; - -extern struct event_constraint intel_westmere_pebs_event_constraints[]; - -extern struct event_constraint intel_snb_pebs_event_constraints[]; - -extern struct event_constraint intel_ivb_pebs_event_constraints[]; - -extern struct event_constraint intel_hsw_pebs_event_constraints[]; - -extern struct event_constraint intel_skl_pebs_event_constraints[]; - -struct event_constraint *intel_pebs_constraints(struct perf_event *event); - -void intel_pmu_pebs_enable(struct perf_event *event); - -void intel_pmu_pebs_disable(struct perf_event *event); - -void intel_pmu_pebs_enable_all(void); - -void intel_pmu_pebs_disable_all(void); - -void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); - -void intel_ds_init(void); - -void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); - -void intel_pmu_lbr_reset(void); - -void intel_pmu_lbr_enable(struct perf_event *event); - -void intel_pmu_lbr_disable(struct perf_event *event); - -void intel_pmu_lbr_enable_all(bool pmi); - -void intel_pmu_lbr_disable_all(void); - -void intel_pmu_lbr_read(void); - -void intel_pmu_lbr_init_core(void); - -void intel_pmu_lbr_init_nhm(void); - -void intel_pmu_lbr_init_atom(void); - -void intel_pmu_lbr_init_snb(void); - -void intel_pmu_lbr_init_hsw(void); - -void intel_pmu_lbr_init_skl(void); - -void intel_pmu_lbr_init_knl(void); - -int intel_pmu_setup_lbr_filter(struct perf_event *event); - -void intel_pt_interrupt(void); - -int intel_bts_interrupt(void); - -void intel_bts_enable_local(void); - -void intel_bts_disable_local(void); - -int p4_pmu_init(void); - -int p6_pmu_init(void); - -int knc_pmu_init(void); - -ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, - char *page); - -static inline int is_ht_workaround_enabled(void) -{ - return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED); -} - -#else /* CONFIG_CPU_SUP_INTEL */ - -static inline void 
reserve_ds_buffers(void) -{ -} - -static inline void release_ds_buffers(void) -{ -} - -static inline int intel_pmu_init(void) -{ - return 0; -} - -static inline struct intel_shared_regs *allocate_shared_regs(int cpu) -{ - return NULL; -} - -static inline int is_ht_workaround_enabled(void) -{ - return 0; -} -#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 724a087..9466354 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -11,7 +11,7 @@ #include "pmu.h" /* x86_pmu.handle_irq definition */ -#include "../kernel/cpu/perf_event.h" +#include "../events/perf_event.h" #define XENPMU_IRQ_PROCESSING 1 struct xenpmu { -- cgit v0.10.2 From 3b364d7b587db0f0eeafde0f271e0698187de776 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Feb 2016 20:11:40 +0000 Subject: perf/core: Remove unused arguments from a bunch of functions No functional change, just less confusing to read. Signed-off-by: Thomas Gleixner Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Link: http://lkml.kernel.org/r/20160209201007.921540566@linutronix.de Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index 0d58522..94c47e3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6710,7 +6710,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash) kfree_rcu(hlist, rcu_head); } -static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) +static void swevent_hlist_put_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -6722,15 +6722,15 @@ static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) mutex_unlock(&swhash->hlist_mutex); } -static void swevent_hlist_put(struct perf_event *event) +static void swevent_hlist_put(void) { int cpu; for_each_possible_cpu(cpu) - swevent_hlist_put_cpu(event, cpu); + swevent_hlist_put_cpu(cpu); } -static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) +static int swevent_hlist_get_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); int err = 0; @@ -6753,14 +6753,13 @@ exit: return err; } -static int swevent_hlist_get(struct perf_event *event) +static int swevent_hlist_get(void) { - int err; - int cpu, failed_cpu; + int err, cpu, failed_cpu; get_online_cpus(); for_each_possible_cpu(cpu) { - err = swevent_hlist_get_cpu(event, cpu); + err = swevent_hlist_get_cpu(cpu); if (err) { failed_cpu = cpu; goto fail; @@ -6773,7 +6772,7 @@ fail: for_each_possible_cpu(cpu) { if (cpu == failed_cpu) break; - swevent_hlist_put_cpu(event, cpu); + swevent_hlist_put_cpu(cpu); } put_online_cpus(); @@ -6789,7 +6788,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); static_key_slow_dec(&perf_swevent_enabled[event_id]); - swevent_hlist_put(event); + swevent_hlist_put(); } static int perf_swevent_init(struct perf_event *event) @@ -6820,7 +6819,7 @@ static int perf_swevent_init(struct perf_event *event) if (!event->parent) { int err; - err = swevent_hlist_get(event); + err = swevent_hlist_get(); if (err) return err; -- cgit v0.10.2 From 8ad7b378d0d016309014cae0f640434bca7b5e11 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 9 Feb 2016 11:15:13 -0800 Subject: futex: Rename barrier references in ordering guarantees Ingo suggested we rename how we reference barriers A and B regarding futex ordering guarantees. 
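The pairing those two annotations describe can be sketched outside the kernel as a store-buffering argument; the helpers below are invented for illustration and use C11 fences in place of the kernel's smp_mb(), so this is a simplified model rather than the futex implementation itself (the real ordering diagram is in the hunk further down).

    /* Illustrative sketch only -- not the kernel's futex code. */
    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_int waiters;
    static atomic_int futex_val;

    /* Waiter side, modelled on futex_wait(): increment, fence (A), re-check. */
    static bool should_sleep(int expected)
    {
            atomic_fetch_add_explicit(&waiters, 1, memory_order_relaxed);
            atomic_thread_fence(memory_order_seq_cst);      /* smp_mb(); (A) */
            return atomic_load_explicit(&futex_val, memory_order_relaxed) == expected;
    }

    /* Waker side, modelled on futex_wake(): store, fence (B), check waiters. */
    static bool must_wake(int newval)
    {
            atomic_store_explicit(&futex_val, newval, memory_order_relaxed);
            atomic_thread_fence(memory_order_seq_cst);      /* smp_mb(); (B) */
            return atomic_load_explicit(&waiters, memory_order_relaxed) != 0;
    }

With both fences present, at least one side is guaranteed to observe the other's store: either the waker sees the incremented waiter count, or the waiter sees the new futex value and skips the sleep. That is exactly the property the (A)/(B) comments in futex.c document.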
This patch replaces, for both barriers, MB (A) with smp_mb(); (A), such that: - We explicitly state that the barriers are SMP, and - We standardize how we reference these across futex.c helping readers follow what barrier does what and where. Suggested-by: Ingo Molnar Signed-off-by: Davidlohr Bueso Reviewed-by: Thomas Gleixner Cc: Chris Mason Cc: Darren Hart Cc: Hugh Dickins Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: dave@stgolabs.net Link: http://lkml.kernel.org/r/1455045314-8305-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar diff --git a/kernel/futex.c b/kernel/futex.c index 5d6ce64..08ac700 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -124,16 +124,16 @@ * futex_wait(futex, val); * * waiters++; (a) - * mb(); (A) <-- paired with -. - * | - * lock(hash_bucket(futex)); | - * | - * uval = *futex; | - * | *futex = newval; - * | sys_futex(WAKE, futex); - * | futex_wake(futex); - * | - * `-------> mb(); (B) + * smp_mb(); (A) <-- paired with -. + * | + * lock(hash_bucket(futex)); | + * | + * uval = *futex; | + * | *futex = newval; + * | sys_futex(WAKE, futex); + * | futex_wake(futex); + * | + * `--------> smp_mb(); (B) * if (uval == val) * queue(); * unlock(hash_bucket(futex)); @@ -334,7 +334,7 @@ static inline void futex_get_mm(union futex_key *key) /* * Ensure futex_get_mm() implies a full barrier such that * get_futex_key() implies a full barrier. This is relied upon - * as full barrier (B), see the ordering comment above. + * as smp_mb(); (B), see the ordering comment above. */ smp_mb__after_atomic(); } @@ -407,10 +407,10 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - ihold(key->shared.inode); /* implies MB (B) */ + ihold(key->shared.inode); /* implies smp_mb(); (B) */ break; case FUT_OFF_MMSHARED: - futex_get_mm(key); /* implies MB (B) */ + futex_get_mm(key); /* implies smp_mb(); (B) */ break; default: /* @@ -418,7 +418,7 @@ static void get_futex_key_refs(union futex_key *key) * mm, therefore the only purpose of calling get_futex_key_refs * is because we need the barrier for the lockless waiter check. */ - smp_mb(); /* explicit MB (B) */ + smp_mb(); /* explicit smp_mb(); (B) */ } } @@ -497,7 +497,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) if (!fshared) { key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies MB (B) */ + get_futex_key_refs(key); /* implies smp_mb(); (B) */ return 0; } @@ -572,7 +572,7 @@ again: key->shared.pgoff = basepage_index(page); } - get_futex_key_refs(key); /* implies MB (B) */ + get_futex_key_refs(key); /* implies smp_mb(); (B) */ out: unlock_page(page); @@ -1864,7 +1864,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) q->lock_ptr = &hb->lock; - spin_lock(&hb->lock); /* implies MB (A) */ + spin_lock(&hb->lock); /* implies smp_mb(); (A) */ return hb; } -- cgit v0.10.2 From 65d8fc777f6dcfee12785c057a6b57f679641c90 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 9 Feb 2016 11:15:14 -0800 Subject: futex: Remove requirement for lock_page() in get_futex_key() When dealing with key handling for shared futexes, we can drastically reduce the usage/need of the page lock. 1) For anonymous pages, the associated futex object is the mm_struct which does not require the page lock. 2) For inode based, keys, we can check under RCU read lock if the page mapping is still valid and take reference to the inode. 
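In condensed form, the inode path described in 2) amounts to something like the sketch below; the helper name is hypothetical and error handling is reduced to a retry hint, but the checks mirror the kernel/futex.c hunk that follows further down.

    /* Condensed sketch of the inode path in 2); hypothetical helper name. */
    #include <linux/fs.h>
    #include <linux/mm.h>
    #include <linux/rcupdate.h>

    static int pin_mapping_inode(struct page *page, struct address_space *mapping,
                                 struct inode **pinned)
    {
            struct inode *inode;

            rcu_read_lock();

            /* page->mapping was sampled without the page lock: re-verify it. */
            if (READ_ONCE(page->mapping) != mapping) {
                    rcu_read_unlock();
                    return -EAGAIN;         /* caller drops the page and retries */
            }

            inode = READ_ONCE(mapping->host);
            /* Take a reference unless the inode is already being freed. */
            if (!inode || !atomic_inc_not_zero(&inode->i_count)) {
                    rcu_read_unlock();
                    return -EAGAIN;
            }

            rcu_read_unlock();
            *pinned = inode;                /* becomes the futex key object */
            return 0;
    }
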
This just leaves one rare race that requires the page lock in the slow path when examining the swapcache. Additionally realtime users currently have a problem with the page lock being contended for unbounded periods of time during futex operations. Task A get_futex_key() lock_page() ---> preempted Now any other task trying to lock that page will have to wait until task A gets scheduled back in, which is an unbound time. With this patch, we pretty much have a lockless futex_get_key(). Experiments show that this patch can boost/speedup the hashing of shared futexes with the perf futex benchmarks (which is good for measuring such change) by up to 45% when there are high (> 100) thread counts on a 60 core Westmere. Lower counts are pretty much in the noise range or less than 10%, but mid range can be seen at over 30% overall throughput (hash ops/sec). This makes anon-mem shared futexes much closer to its private counterpart. Signed-off-by: Mel Gorman [ Ported on top of thp refcount rework, changelog, comments, fixes. ] Signed-off-by: Davidlohr Bueso Reviewed-by: Thomas Gleixner Cc: Chris Mason Cc: Darren Hart Cc: Hugh Dickins Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: dave@stgolabs.net Link: http://lkml.kernel.org/r/1455045314-8305-3-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar diff --git a/kernel/futex.c b/kernel/futex.c index 08ac700..bae542e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -520,7 +520,20 @@ again: else err = 0; - lock_page(page); + /* + * The treatment of mapping from this point on is critical. The page + * lock protects many things but in this context the page lock + * stabilizes mapping, prevents inode freeing in the shared + * file-backed region case and guards against movement to swap cache. + * + * Strictly speaking the page lock is not needed in all cases being + * considered here and page lock forces unnecessarily serialization + * From this point on, mapping will be re-verified if necessary and + * page lock will be acquired only if it is unavoidable + */ + page = compound_head(page); + mapping = READ_ONCE(page->mapping); + /* * If page->mapping is NULL, then it cannot be a PageAnon * page; but it might be the ZERO_PAGE or in the gate area or @@ -536,19 +549,31 @@ again: * shmem_writepage move it from filecache to swapcache beneath us: * an unlikely race, but we do need to retry for page->mapping. */ - mapping = compound_head(page)->mapping; - if (!mapping) { - int shmem_swizzled = PageSwapCache(page); + if (unlikely(!mapping)) { + int shmem_swizzled; + + /* + * Page lock is required to identify which special case above + * applies. If this is really a shmem page then the page lock + * will prevent unexpected transitions. + */ + lock_page(page); + shmem_swizzled = PageSwapCache(page) || page->mapping; unlock_page(page); put_page(page); + if (shmem_swizzled) goto again; + return -EFAULT; } /* * Private mappings are handled in a simple way. * + * If the futex key is stored on an anonymous page, then the associated + * object is the mm which is implicitly pinned by the calling process. + * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. 
@@ -566,16 +591,74 @@ again: key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; + + get_futex_key_refs(key); /* implies smp_mb(); (B) */ + } else { + struct inode *inode; + + /* + * The associated futex object in this case is the inode and + * the page->mapping must be traversed. Ordinarily this should + * be stabilised under page lock but it's not strictly + * necessary in this case as we just want to pin the inode, not + * update the radix tree or anything like that. + * + * The RCU read lock is taken as the inode is finally freed + * under RCU. If the mapping still matches expectations then the + * mapping->host can be safely accessed as being a valid inode. + */ + rcu_read_lock(); + + if (READ_ONCE(page->mapping) != mapping) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + inode = READ_ONCE(mapping->host); + if (!inode) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + /* + * Take a reference unless it is about to be freed. Previously + * this reference was taken by ihold under the page lock + * pinning the inode in place so i_lock was unnecessary. The + * only way for this check to fail is if the inode was + * truncated in parallel so warn for now if this happens. + * + * We are not calling into get_futex_key_refs() in file-backed + * cases, therefore a successful atomic_inc return below will + * guarantee that get_futex_key() will still imply smp_mb(); (B). + */ + if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + /* Should be impossible but lets be paranoid for now */ + if (WARN_ON_ONCE(inode->i_mapping != mapping)) { + err = -EFAULT; + rcu_read_unlock(); + iput(inode); + + goto out; + } + key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = mapping->host; + key->shared.inode = inode; key->shared.pgoff = basepage_index(page); + rcu_read_unlock(); } - get_futex_key_refs(key); /* implies smp_mb(); (B) */ - out: - unlock_page(page); put_page(page); return err; } -- cgit v0.10.2 From 07ba4b061a79896315a7be4b123de12df6a9d2bd Mon Sep 17 00:00:00 2001 From: Alban Bedel Date: Sat, 23 Jan 2016 13:57:46 +0100 Subject: irqchip/ath79-misc: Move the MISC driver from arch/mips/ath79/ The driver stays the same but the initialization changes a bit. For OF boards we now get the memory map from the OF node and use a linear mapping instead of the legacy mapping. For legacy boards we still use a legacy mapping and just pass down all the parameters from the board init code. 
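For a legacy board that means a call along the following lines from its interrupt setup code. The function and file are hypothetical examples, but the arguments and the is_ar71xx selection mirror the arch_init_irq() change in the diff below, and the header names are as in the tree at the time of this series.

    /* Hypothetical legacy-board setup; mirrors the arch_init_irq() change below. */
    #include <linux/init.h>
    #include <linux/types.h>
    #include <asm/mach-ath79/ath79.h>
    #include <asm/mach-ath79/ar71xx_regs.h>
    #include <asm/mach-ath79/irq.h>

    static void __init example_board_misc_irq_setup(void)
    {
            /* AR71xx/AR913x ack by re-masking; later SoCs have a real ack register. */
            bool misc_is_ar71xx = soc_is_ar71xx() || soc_is_ar913x();

            /* Hand the MISC register block, the chained CPU IRQ line and the
             * Linux IRQ base straight to the relocated driver. */
            ath79_misc_irq_init(ath79_reset_base + AR71XX_RESET_REG_MISC_INT_STATUS,
                                ATH79_CPU_IRQ(6), ATH79_MISC_IRQ_BASE,
                                misc_is_ar71xx);
    }
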
Signed-off-by: Alban Bedel Acked-by: Marc Zyngier Link: https://lkml.kernel.org/r/1453553867-27003-1-git-send-email-albeu@free.fr Signed-off-by: Jason Cooper diff --git a/arch/mips/ath79/irq.c b/arch/mips/ath79/irq.c index 511c065..05b4514 100644 --- a/arch/mips/ath79/irq.c +++ b/arch/mips/ath79/irq.c @@ -26,90 +26,6 @@ #include "common.h" #include "machtypes.h" -static void __init ath79_misc_intc_domain_init( - struct device_node *node, int irq); - -static void ath79_misc_irq_handler(struct irq_desc *desc) -{ - struct irq_domain *domain = irq_desc_get_handler_data(desc); - void __iomem *base = domain->host_data; - u32 pending; - - pending = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS) & - __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); - - if (!pending) { - spurious_interrupt(); - return; - } - - while (pending) { - int bit = __ffs(pending); - - generic_handle_irq(irq_linear_revmap(domain, bit)); - pending &= ~BIT(bit); - } -} - -static void ar71xx_misc_irq_unmask(struct irq_data *d) -{ - void __iomem *base = irq_data_get_irq_chip_data(d); - unsigned int irq = d->hwirq; - u32 t; - - t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); - __raw_writel(t | (1 << irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE); - - /* flush write */ - __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); -} - -static void ar71xx_misc_irq_mask(struct irq_data *d) -{ - void __iomem *base = irq_data_get_irq_chip_data(d); - unsigned int irq = d->hwirq; - u32 t; - - t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); - __raw_writel(t & ~(1 << irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE); - - /* flush write */ - __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); -} - -static void ar724x_misc_irq_ack(struct irq_data *d) -{ - void __iomem *base = irq_data_get_irq_chip_data(d); - unsigned int irq = d->hwirq; - u32 t; - - t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS); - __raw_writel(t & ~(1 << irq), base + AR71XX_RESET_REG_MISC_INT_STATUS); - - /* flush write */ - __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS); -} - -static struct irq_chip ath79_misc_irq_chip = { - .name = "MISC", - .irq_unmask = ar71xx_misc_irq_unmask, - .irq_mask = ar71xx_misc_irq_mask, -}; - -static void __init ath79_misc_irq_init(void) -{ - if (soc_is_ar71xx() || soc_is_ar913x()) - ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask; - else if (soc_is_ar724x() || - soc_is_ar933x() || - soc_is_ar934x() || - soc_is_qca955x()) - ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack; - else - BUG(); - - ath79_misc_intc_domain_init(NULL, ATH79_CPU_IRQ(6)); -} static void ar934x_ip2_irq_dispatch(struct irq_desc *desc) { @@ -248,69 +164,6 @@ asmlinkage void plat_irq_dispatch(void) } } -static int misc_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) -{ - irq_set_chip_and_handler(irq, &ath79_misc_irq_chip, handle_level_irq); - irq_set_chip_data(irq, d->host_data); - return 0; -} - -static const struct irq_domain_ops misc_irq_domain_ops = { - .xlate = irq_domain_xlate_onecell, - .map = misc_map, -}; - -static void __init ath79_misc_intc_domain_init( - struct device_node *node, int irq) -{ - void __iomem *base = ath79_reset_base; - struct irq_domain *domain; - - domain = irq_domain_add_legacy(node, ATH79_MISC_IRQ_COUNT, - ATH79_MISC_IRQ_BASE, 0, &misc_irq_domain_ops, base); - if (!domain) - panic("Failed to add MISC irqdomain"); - - /* Disable and clear all interrupts */ - __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_ENABLE); - __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_STATUS); - - 
irq_set_chained_handler_and_data(irq, ath79_misc_irq_handler, domain); -} - -static int __init ath79_misc_intc_of_init( - struct device_node *node, struct device_node *parent) -{ - int irq; - - irq = irq_of_parse_and_map(node, 0); - if (!irq) - panic("Failed to get MISC IRQ"); - - ath79_misc_intc_domain_init(node, irq); - return 0; -} - -static int __init ar7100_misc_intc_of_init( - struct device_node *node, struct device_node *parent) -{ - ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask; - return ath79_misc_intc_of_init(node, parent); -} - -IRQCHIP_DECLARE(ar7100_misc_intc, "qca,ar7100-misc-intc", - ar7100_misc_intc_of_init); - -static int __init ar7240_misc_intc_of_init( - struct device_node *node, struct device_node *parent) -{ - ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack; - return ath79_misc_intc_of_init(node, parent); -} - -IRQCHIP_DECLARE(ar7240_misc_intc, "qca,ar7240-misc-intc", - ar7240_misc_intc_of_init); - static int __init ar79_cpu_intc_of_init( struct device_node *node, struct device_node *parent) { @@ -348,6 +201,8 @@ IRQCHIP_DECLARE(ar79_cpu_intc, "qca,ar7100-cpu-intc", void __init arch_init_irq(void) { + bool misc_is_ar71xx; + if (mips_machtype == ATH79_MACH_GENERIC_OF) { irqchip_init(); return; @@ -362,7 +217,19 @@ void __init arch_init_irq(void) } mips_cpu_irq_init(); - ath79_misc_irq_init(); + + if (soc_is_ar71xx() || soc_is_ar913x()) + misc_is_ar71xx = true; + else if (soc_is_ar724x() || + soc_is_ar933x() || + soc_is_ar934x() || + soc_is_qca955x()) + misc_is_ar71xx = false; + else + BUG(); + ath79_misc_irq_init( + ath79_reset_base + AR71XX_RESET_REG_MISC_INT_STATUS, + ATH79_CPU_IRQ(6), ATH79_MISC_IRQ_BASE, misc_is_ar71xx); if (soc_is_ar934x()) ar934x_ip2_irq_init(); diff --git a/arch/mips/include/asm/mach-ath79/ath79.h b/arch/mips/include/asm/mach-ath79/ath79.h index 2b34872..22a2f56 100644 --- a/arch/mips/include/asm/mach-ath79/ath79.h +++ b/arch/mips/include/asm/mach-ath79/ath79.h @@ -144,4 +144,7 @@ static inline u32 ath79_reset_rr(unsigned reg) void ath79_device_reset_set(u32 mask); void ath79_device_reset_clear(u32 mask); +void ath79_misc_irq_init(void __iomem *regs, int irq, + int irq_base, bool is_ar71xx); + #endif /* __ASM_MACH_ATH79_H */ diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 18caacb..b873d4a 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_IRQCHIP) += irqchip.o +obj-$(CONFIG_ATH79) += irq-ath79-misc.o obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2835.o obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2836.o obj-$(CONFIG_ARCH_EXYNOS) += exynos-combiner.o diff --git a/drivers/irqchip/irq-ath79-misc.c b/drivers/irqchip/irq-ath79-misc.c new file mode 100644 index 0000000..aa72907 --- /dev/null +++ b/drivers/irqchip/irq-ath79-misc.c @@ -0,0 +1,189 @@ +/* + * Atheros AR71xx/AR724x/AR913x MISC interrupt controller + * + * Copyright (C) 2015 Alban Bedel + * Copyright (C) 2010-2011 Jaiganesh Narayanan + * Copyright (C) 2008-2011 Gabor Juhos + * Copyright (C) 2008 Imre Kaloz + * + * Parts of this file are based on Atheros' 2.6.15/2.6.31 BSP + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include + +#define AR71XX_RESET_REG_MISC_INT_STATUS 0 +#define AR71XX_RESET_REG_MISC_INT_ENABLE 4 + +#define ATH79_MISC_IRQ_COUNT 32 + +static void ath79_misc_irq_handler(struct irq_desc *desc) +{ + struct irq_domain *domain = irq_desc_get_handler_data(desc); + struct irq_chip *chip = irq_desc_get_chip(desc); + void __iomem *base = domain->host_data; + u32 pending; + + chained_irq_enter(chip, desc); + + pending = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS) & + __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); + + if (!pending) { + spurious_interrupt(); + chained_irq_exit(chip, desc); + return; + } + + while (pending) { + int bit = __ffs(pending); + + generic_handle_irq(irq_linear_revmap(domain, bit)); + pending &= ~BIT(bit); + } + + chained_irq_exit(chip, desc); +} + +static void ar71xx_misc_irq_unmask(struct irq_data *d) +{ + void __iomem *base = irq_data_get_irq_chip_data(d); + unsigned int irq = d->hwirq; + u32 t; + + t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); + __raw_writel(t | BIT(irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE); + + /* flush write */ + __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); +} + +static void ar71xx_misc_irq_mask(struct irq_data *d) +{ + void __iomem *base = irq_data_get_irq_chip_data(d); + unsigned int irq = d->hwirq; + u32 t; + + t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); + __raw_writel(t & ~BIT(irq), base + AR71XX_RESET_REG_MISC_INT_ENABLE); + + /* flush write */ + __raw_readl(base + AR71XX_RESET_REG_MISC_INT_ENABLE); +} + +static void ar724x_misc_irq_ack(struct irq_data *d) +{ + void __iomem *base = irq_data_get_irq_chip_data(d); + unsigned int irq = d->hwirq; + u32 t; + + t = __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS); + __raw_writel(t & ~BIT(irq), base + AR71XX_RESET_REG_MISC_INT_STATUS); + + /* flush write */ + __raw_readl(base + AR71XX_RESET_REG_MISC_INT_STATUS); +} + +static struct irq_chip ath79_misc_irq_chip = { + .name = "MISC", + .irq_unmask = ar71xx_misc_irq_unmask, + .irq_mask = ar71xx_misc_irq_mask, +}; + +static int misc_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) +{ + irq_set_chip_and_handler(irq, &ath79_misc_irq_chip, handle_level_irq); + irq_set_chip_data(irq, d->host_data); + return 0; +} + +static const struct irq_domain_ops misc_irq_domain_ops = { + .xlate = irq_domain_xlate_onecell, + .map = misc_map, +}; + +static void __init ath79_misc_intc_domain_init( + struct irq_domain *domain, int irq) +{ + void __iomem *base = domain->host_data; + + /* Disable and clear all interrupts */ + __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_ENABLE); + __raw_writel(0, base + AR71XX_RESET_REG_MISC_INT_STATUS); + + irq_set_chained_handler_and_data(irq, ath79_misc_irq_handler, domain); +} + +static int __init ath79_misc_intc_of_init( + struct device_node *node, struct device_node *parent) +{ + struct irq_domain *domain; + void __iomem *base; + int irq; + + irq = irq_of_parse_and_map(node, 0); + if (!irq) { + pr_err("Failed to get MISC IRQ\n"); + return -EINVAL; + } + + base = of_iomap(node, 0); + if (!base) { + pr_err("Failed to get MISC IRQ registers\n"); + return -ENOMEM; + } + + domain = irq_domain_add_linear(node, ATH79_MISC_IRQ_COUNT, + &misc_irq_domain_ops, base); + if (!domain) { + pr_err("Failed to add MISC irqdomain\n"); + return -EINVAL; + } + + ath79_misc_intc_domain_init(domain, irq); + return 0; +} + +static int __init ar7100_misc_intc_of_init( + struct device_node *node, struct device_node *parent) +{ + 
ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask; + return ath79_misc_intc_of_init(node, parent); +} + +IRQCHIP_DECLARE(ar7100_misc_intc, "qca,ar7100-misc-intc", + ar7100_misc_intc_of_init); + +static int __init ar7240_misc_intc_of_init( + struct device_node *node, struct device_node *parent) +{ + ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack; + return ath79_misc_intc_of_init(node, parent); +} + +IRQCHIP_DECLARE(ar7240_misc_intc, "qca,ar7240-misc-intc", + ar7240_misc_intc_of_init); + +void __init ath79_misc_irq_init(void __iomem *regs, int irq, + int irq_base, bool is_ar71xx) +{ + struct irq_domain *domain; + + if (is_ar71xx) + ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask; + else + ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack; + + domain = irq_domain_add_legacy(NULL, ATH79_MISC_IRQ_COUNT, + irq_base, 0, &misc_irq_domain_ops, regs); + if (!domain) + panic("Failed to create MISC irqdomain"); + + ath79_misc_intc_domain_init(domain, irq); +} -- cgit v0.10.2 From 81ffb18ce4a0c400b051c3df67e452410d6be1ec Mon Sep 17 00:00:00 2001 From: Alban Bedel Date: Sat, 23 Jan 2016 13:57:47 +0100 Subject: irqchip/ath79-cpu: Move the CPU IRQ driver from arch/mips/ath79/ Signed-off-by: Alban Bedel Acked-by: Marc Zyngier Link: https://lkml.kernel.org/r/1453553867-27003-2-git-send-email-albeu@free.fr Signed-off-by: Jason Cooper diff --git a/arch/mips/ath79/irq.c b/arch/mips/ath79/irq.c index 05b4514..2dfff1f 100644 --- a/arch/mips/ath79/irq.c +++ b/arch/mips/ath79/irq.c @@ -128,79 +128,10 @@ static void qca955x_irq_init(void) irq_set_chained_handler(ATH79_CPU_IRQ(3), qca955x_ip3_irq_dispatch); } -/* - * The IP2/IP3 lines are tied to a PCI/WMAC/USB device. Drivers for - * these devices typically allocate coherent DMA memory, however the - * DMA controller may still have some unsynchronized data in the FIFO. - * Issue a flush in the handlers to ensure that the driver sees - * the update. - * - * This array map the interrupt lines to the DDR write buffer channels. 
- */ - -static unsigned irq_wb_chan[8] = { - -1, -1, -1, -1, -1, -1, -1, -1, -}; - -asmlinkage void plat_irq_dispatch(void) -{ - unsigned long pending; - int irq; - - pending = read_c0_status() & read_c0_cause() & ST0_IM; - - if (!pending) { - spurious_interrupt(); - return; - } - - pending >>= CAUSEB_IP; - while (pending) { - irq = fls(pending) - 1; - if (irq < ARRAY_SIZE(irq_wb_chan) && irq_wb_chan[irq] != -1) - ath79_ddr_wb_flush(irq_wb_chan[irq]); - do_IRQ(MIPS_CPU_IRQ_BASE + irq); - pending &= ~BIT(irq); - } -} - -static int __init ar79_cpu_intc_of_init( - struct device_node *node, struct device_node *parent) -{ - int err, i, count; - - /* Fill the irq_wb_chan table */ - count = of_count_phandle_with_args( - node, "qca,ddr-wb-channels", "#qca,ddr-wb-channel-cells"); - - for (i = 0; i < count; i++) { - struct of_phandle_args args; - u32 irq = i; - - of_property_read_u32_index( - node, "qca,ddr-wb-channel-interrupts", i, &irq); - if (irq >= ARRAY_SIZE(irq_wb_chan)) - continue; - - err = of_parse_phandle_with_args( - node, "qca,ddr-wb-channels", - "#qca,ddr-wb-channel-cells", - i, &args); - if (err) - return err; - - irq_wb_chan[irq] = args.args[0]; - pr_info("IRQ: Set flush channel of IRQ%d to %d\n", - irq, args.args[0]); - } - - return mips_cpu_irq_of_init(node, parent); -} -IRQCHIP_DECLARE(ar79_cpu_intc, "qca,ar7100-cpu-intc", - ar79_cpu_intc_of_init); - void __init arch_init_irq(void) { + unsigned irq_wb_chan2 = -1; + unsigned irq_wb_chan3 = -1; bool misc_is_ar71xx; if (mips_machtype == ATH79_MACH_GENERIC_OF) { @@ -210,13 +141,13 @@ void __init arch_init_irq(void) if (soc_is_ar71xx() || soc_is_ar724x() || soc_is_ar913x() || soc_is_ar933x()) { - irq_wb_chan[2] = 3; - irq_wb_chan[3] = 2; + irq_wb_chan2 = 3; + irq_wb_chan3 = 2; } else if (soc_is_ar934x()) { - irq_wb_chan[3] = 2; + irq_wb_chan3 = 2; } - mips_cpu_irq_init(); + ath79_cpu_irq_init(irq_wb_chan2, irq_wb_chan3); if (soc_is_ar71xx() || soc_is_ar913x()) misc_is_ar71xx = true; diff --git a/arch/mips/include/asm/mach-ath79/ath79.h b/arch/mips/include/asm/mach-ath79/ath79.h index 22a2f56..441faa9 100644 --- a/arch/mips/include/asm/mach-ath79/ath79.h +++ b/arch/mips/include/asm/mach-ath79/ath79.h @@ -144,6 +144,7 @@ static inline u32 ath79_reset_rr(unsigned reg) void ath79_device_reset_set(u32 mask); void ath79_device_reset_clear(u32 mask); +void ath79_cpu_irq_init(unsigned irq_wb_chan2, unsigned irq_wb_chan3); void ath79_misc_irq_init(void __iomem *regs, int irq, int irq_base, bool is_ar71xx); diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index b873d4a..aeb9200 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_IRQCHIP) += irqchip.o +obj-$(CONFIG_ATH79) += irq-ath79-cpu.o obj-$(CONFIG_ATH79) += irq-ath79-misc.o obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2835.o obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2836.o diff --git a/drivers/irqchip/irq-ath79-cpu.c b/drivers/irqchip/irq-ath79-cpu.c new file mode 100644 index 0000000..befe93c --- /dev/null +++ b/drivers/irqchip/irq-ath79-cpu.c @@ -0,0 +1,97 @@ +/* + * Atheros AR71xx/AR724x/AR913x specific interrupt handling + * + * Copyright (C) 2015 Alban Bedel + * Copyright (C) 2010-2011 Jaiganesh Narayanan + * Copyright (C) 2008-2011 Gabor Juhos + * Copyright (C) 2008 Imre Kaloz + * + * Parts of this file are based on Atheros' 2.6.15/2.6.31 BSP + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. 
+ */ + +#include +#include +#include + +#include +#include + +/* + * The IP2/IP3 lines are tied to a PCI/WMAC/USB device. Drivers for + * these devices typically allocate coherent DMA memory, however the + * DMA controller may still have some unsynchronized data in the FIFO. + * Issue a flush in the handlers to ensure that the driver sees + * the update. + * + * This array map the interrupt lines to the DDR write buffer channels. + */ + +static unsigned irq_wb_chan[8] = { + -1, -1, -1, -1, -1, -1, -1, -1, +}; + +asmlinkage void plat_irq_dispatch(void) +{ + unsigned long pending; + int irq; + + pending = read_c0_status() & read_c0_cause() & ST0_IM; + + if (!pending) { + spurious_interrupt(); + return; + } + + pending >>= CAUSEB_IP; + while (pending) { + irq = fls(pending) - 1; + if (irq < ARRAY_SIZE(irq_wb_chan) && irq_wb_chan[irq] != -1) + ath79_ddr_wb_flush(irq_wb_chan[irq]); + do_IRQ(MIPS_CPU_IRQ_BASE + irq); + pending &= ~BIT(irq); + } +} + +static int __init ar79_cpu_intc_of_init( + struct device_node *node, struct device_node *parent) +{ + int err, i, count; + + /* Fill the irq_wb_chan table */ + count = of_count_phandle_with_args( + node, "qca,ddr-wb-channels", "#qca,ddr-wb-channel-cells"); + + for (i = 0; i < count; i++) { + struct of_phandle_args args; + u32 irq = i; + + of_property_read_u32_index( + node, "qca,ddr-wb-channel-interrupts", i, &irq); + if (irq >= ARRAY_SIZE(irq_wb_chan)) + continue; + + err = of_parse_phandle_with_args( + node, "qca,ddr-wb-channels", + "#qca,ddr-wb-channel-cells", + i, &args); + if (err) + return err; + + irq_wb_chan[irq] = args.args[0]; + } + + return mips_cpu_irq_of_init(node, parent); +} +IRQCHIP_DECLARE(ar79_cpu_intc, "qca,ar7100-cpu-intc", + ar79_cpu_intc_of_init); + +void __init ath79_cpu_irq_init(unsigned irq_wb_chan2, unsigned irq_wb_chan3) +{ + irq_wb_chan[2] = irq_wb_chan2; + irq_wb_chan[3] = irq_wb_chan3; + mips_cpu_irq_init(); +} -- cgit v0.10.2 From 390f0ffe92aea878b763c7fd8afd1dff62e0d20b Mon Sep 17 00:00:00 2001 From: Anton Bondarenko Date: Wed, 17 Feb 2016 14:28:47 +0100 Subject: spi: imx: allow only WML aligned transfers to use DMA RX DMA tail data handling doesn't work correctly in many cases with current implementation. It happens because SPI core was setup to generates both RX and RX TAIL events. And RX TAIL event does not work correctly. This can be easily verified by sending SPI transaction with size modulus WML(32 in our case) not equal 0. Also removing change introduced in f6ee9b582d2db652497b73c1f117591dfb6d3a90 since this change only fix usecases with transfer size from 33 to 128 bytes and doesn't fix 129 bytes and bigger. 
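For reference, the DMA eligibility rule enforced by this patch reduces to the following predicate (a standalone sketch of the check added to spi_imx_can_dma() in the hunk below; the assumption here is that transfer->len and the watermark level wml are compared in the same units, as the driver does with spi_imx->wml):

	/* Sketch: a transfer may use DMA only if it is a whole number of
	 * watermark-level bursts.  With WML = 32, 138 % 32 != 0, so the
	 * 138-byte transfer shown in the log below now falls back to PIO
	 * instead of relying on the broken RX TAIL handling. */
	static bool spi_imx_len_is_wml_aligned(unsigned int len, unsigned int wml)
	{
		return len >= wml && (len % wml) == 0;
	}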
This is output from transaction with len 138 bytes in loopback mode at 10Mhz: TX0000: a3 97 a2 55 53 be f1 fc f9 79 6b 52 14 13 e9 e2 TX0010: 2d 51 8e 1f 56 08 57 27 a7 05 d4 d0 52 82 77 75 TX0020: 1b 99 4a ed 58 3d 6a 52 36 d5 24 4a 68 8e ad 95 TX0030: 5f 3c 35 b5 c4 8c dd 6c 11 32 3d e2 b4 b4 59 cf TX0040: ce 23 3d 27 df a7 f9 96 fc 1e e0 66 2c 0e 7b 8c TX0050: ca 30 42 8f bc 9f 7b ce d1 b8 b1 87 ec 8a d6 bb TX0060: 2e 15 63 0e 3c dc a4 3a 7a 06 20 a7 93 1b 34 dd TX0070: 4c f5 ec 88 96 68 d6 68 a0 09 6f 8e 93 47 c9 41 TX0080: db ac cf 97 89 f3 51 05 79 71 RX0000: a3 97 a2 55 53 be f1 fc f9 79 6b 52 14 13 e9 e2 RX0010: 2d 51 8e 1f 56 08 57 27 a7 05 d4 d0 52 82 77 75 RX0020: 1b 99 4a ed 58 3d 6a 52 36 d5 24 4a 68 8e ad 95 RX0030: 5f 3c 35 00 00 b5 00 00 00 c4 00 00 8c 00 00 dd RX0040: 6c 11 32 3d e2 b4 b4 59 cf ce 23 3d 27 df a7 f9 RX0050: 96 fc 1e e0 66 2c 0e 7b 8c ca 30 42 8f 1f 1f bc RX0060: 9f 7b ce d1 b8 b1 87 ec 8a d6 bb 2e 15 63 0e ed RX0070: ed 3c 58 58 58 dc 3d 3d a4 6a 6a 3a 52 52 7a 36 RX0080: 06 20 a7 93 1b 34 dd 4c f5 ec Zeros at offset 33 and 34 caused by reading empty RX FIFO which not possible if DMA RX read was triggered by RX event. This mean DMA was triggered by RX TAIL event. Signed-off-by: Anton Bondarenko Signed-off-by: Sascha Hauer Signed-off-by: Mark Brown diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index d98c33c..08492d6 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -204,8 +204,8 @@ static bool spi_imx_can_dma(struct spi_master *master, struct spi_device *spi, { struct spi_imx_data *spi_imx = spi_master_get_devdata(master); - if (spi_imx->dma_is_inited && - transfer->len > spi_imx->wml * sizeof(u32)) + if (spi_imx->dma_is_inited && transfer->len >= spi_imx->wml && + (transfer->len % spi_imx->wml) == 0) return true; return false; } @@ -919,8 +919,6 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx, struct dma_async_tx_descriptor *desc_tx = NULL, *desc_rx = NULL; int ret; unsigned long timeout; - u32 dma; - int left; struct spi_master *master = spi_imx->bitbang.master; struct sg_table *tx = &transfer->tx_sg, *rx = &transfer->rx_sg; @@ -954,13 +952,6 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx, /* Trigger the cspi module. */ spi_imx->dma_finished = 0; - dma = readl(spi_imx->base + MX51_ECSPI_DMA); - dma = dma & (~MX51_ECSPI_DMA_RXT_WML_MASK); - /* Change RX_DMA_LENGTH trigger dma fetch tail data */ - left = transfer->len % spi_imx->wml; - if (left) - writel(dma | (left << MX51_ECSPI_DMA_RXT_WML_OFFSET), - spi_imx->base + MX51_ECSPI_DMA); /* * Set these order to avoid potential RX overflow. The overflow may * happen if we enable SPI HW before starting RX DMA due to rescheduling @@ -992,10 +983,6 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx, spi_imx->devtype_data->reset(spi_imx); dmaengine_terminate_all(master->dma_rx); } - dma &= ~MX51_ECSPI_DMA_RXT_WML_MASK; - writel(dma | - spi_imx->wml << MX51_ECSPI_DMA_RXT_WML_OFFSET, - spi_imx->base + MX51_ECSPI_DMA); } spi_imx->dma_finished = 1; -- cgit v0.10.2 From 8f8e2aec9944dd12671182a1a26b8e1a35872a1d Mon Sep 17 00:00:00 2001 From: Alan Date: Wed, 17 Feb 2016 14:10:15 +0000 Subject: x86/platform/intel/mid: Remove dead code Neither ratio nor fsb are ever zero, so remove the 0 case. Signed-off-by: Alan Cox Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c index 23381d2..1eb47b6 100644 --- a/arch/x86/platform/intel-mid/mfld.c +++ b/arch/x86/platform/intel-mid/mfld.c @@ -52,10 +52,7 @@ static unsigned long __init mfld_calibrate_tsc(void) /* mark tsc clocksource as reliable */ set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); - if (fast_calibrate) - return fast_calibrate; - - return 0; + return fast_calibrate; } static void __init penwell_arch_setup(void) diff --git a/arch/x86/platform/intel-mid/mrfl.c b/arch/x86/platform/intel-mid/mrfl.c index aaca917..bd1adc6 100644 --- a/arch/x86/platform/intel-mid/mrfl.c +++ b/arch/x86/platform/intel-mid/mrfl.c @@ -81,10 +81,7 @@ static unsigned long __init tangier_calibrate_tsc(void) /* mark tsc clocksource as reliable */ set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); - if (fast_calibrate) - return fast_calibrate; - - return 0; + return fast_calibrate; } static void __init tangier_arch_setup(void) -- cgit v0.10.2 From d08b82c9e11e0997d2fb7dbf07a1f0c4112e587e Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Wed, 20 Jan 2016 18:07:16 +0000 Subject: devicetree: Add binding for Sigma Designs SMP86xx interrupt controller This adds a binding for the secondary interrupt controller in Sigma Designs SMP86xx and SMP87xx chips. Signed-off-by: Mans Rullgard [ jac: use 'interrupt-controller@XXX' notation in binding doc ] Acked-by: Rob Herring Link: https://lkml.kernel.org/r/1453313237-18570-1-git-send-email-mans@mansr.com Signed-off-by: Jason Cooper diff --git a/Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt b/Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt new file mode 100644 index 0000000..1f441fa --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/sigma,smp8642-intc.txt @@ -0,0 +1,49 @@ +Sigma Designs SMP86xx/SMP87xx secondary interrupt controller + +Required properties: +- compatible: should be "sigma,smp8642-intc" +- reg: physical address of MMIO region +- ranges: address space mapping of child nodes +- interrupt-parent: phandle of parent interrupt controller +- interrupt-controller: boolean +- #address-cells: should be <1> +- #size-cells: should be <1> + +One child node per control block with properties: +- reg: address of registers for this control block +- interrupt-controller: boolean +- #interrupt-cells: should be <2>, interrupt index and flags per interrupts.txt +- interrupts: interrupt spec of primary interrupt controller + +Example: + +interrupt-controller@6e000 { + compatible = "sigma,smp8642-intc"; + reg = <0x6e000 0x400>; + ranges = <0x0 0x6e000 0x400>; + interrupt-parent = <&gic>; + interrupt-controller; + #address-cells = <1>; + #size-cells = <1>; + + irq0: interrupt-controller@0 { + reg = <0x000 0x100>; + interrupt-controller; + #interrupt-cells = <2>; + interrupts = ; + }; + + irq1: interrupt-controller@100 { + reg = <0x100 0x100>; + interrupt-controller; + #interrupt-cells = <2>; + interrupts = ; + }; + + irq2: interrupt-controller@300 { + reg = <0x300 0x100>; + interrupt-controller; + #interrupt-cells = <2>; + interrupts = ; + }; +}; -- cgit v0.10.2 From 4bba66899ac654cb7c940a3af35d496a0b1952f0 Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Wed, 20 Jan 2016 18:07:17 +0000 Subject: irqchip/tango: Add support for Sigma Designs SMP86xx/SMP87xx interrupt controller This adds support for 
the secondary interrupt controller used in Sigma Designs SMP86xx and SMP87xx chips. Signed-off-by: Mans Rullgard Acked-by: Marc Zyngier Link: https://lkml.kernel.org/r/1453313237-18570-2-git-send-email-mans@mansr.com Signed-off-by: Jason Cooper diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 715923d..6bdfbce 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -151,6 +151,11 @@ config ST_IRQCHIP help Enables SysCfg Controlled IRQs on STi based platforms. +config TANGO_IRQ + bool + select IRQ_DOMAIN + select GENERIC_IRQ_CHIP + config TB10X_IRQC bool select IRQ_DOMAIN diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 18caacb..ec83deb 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_VERSATILE_FPGA_IRQ) += irq-versatile-fpga.o obj-$(CONFIG_ARCH_NSPIRE) += irq-zevio.o obj-$(CONFIG_ARCH_VT8500) += irq-vt8500.o obj-$(CONFIG_ST_IRQCHIP) += irq-st.o +obj-$(CONFIG_TANGO_IRQ) += irq-tango.o obj-$(CONFIG_TB10X_IRQC) += irq-tb10x.o obj-$(CONFIG_TS4800_IRQ) += irq-ts4800.o obj-$(CONFIG_XTENSA) += irq-xtensa-pic.o diff --git a/drivers/irqchip/irq-tango.c b/drivers/irqchip/irq-tango.c new file mode 100644 index 0000000..bdbb5c0 --- /dev/null +++ b/drivers/irqchip/irq-tango.c @@ -0,0 +1,232 @@ +/* + * Copyright (C) 2014 Mans Rullgard + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IRQ0_CTL_BASE 0x0000 +#define IRQ1_CTL_BASE 0x0100 +#define EDGE_CTL_BASE 0x0200 +#define IRQ2_CTL_BASE 0x0300 + +#define IRQ_CTL_HI 0x18 +#define EDGE_CTL_HI 0x20 + +#define IRQ_STATUS 0x00 +#define IRQ_RAWSTAT 0x04 +#define IRQ_EN_SET 0x08 +#define IRQ_EN_CLR 0x0c +#define IRQ_SOFT_SET 0x10 +#define IRQ_SOFT_CLR 0x14 + +#define EDGE_STATUS 0x00 +#define EDGE_RAWSTAT 0x04 +#define EDGE_CFG_RISE 0x08 +#define EDGE_CFG_FALL 0x0c +#define EDGE_CFG_RISE_SET 0x10 +#define EDGE_CFG_RISE_CLR 0x14 +#define EDGE_CFG_FALL_SET 0x18 +#define EDGE_CFG_FALL_CLR 0x1c + +struct tangox_irq_chip { + void __iomem *base; + unsigned long ctl; +}; + +static inline u32 intc_readl(struct tangox_irq_chip *chip, int reg) +{ + return readl_relaxed(chip->base + reg); +} + +static inline void intc_writel(struct tangox_irq_chip *chip, int reg, u32 val) +{ + writel_relaxed(val, chip->base + reg); +} + +static void tangox_dispatch_irqs(struct irq_domain *dom, unsigned int status, + int base) +{ + unsigned int hwirq; + unsigned int virq; + + while (status) { + hwirq = __ffs(status); + virq = irq_find_mapping(dom, base + hwirq); + if (virq) + generic_handle_irq(virq); + status &= ~BIT(hwirq); + } +} + +static void tangox_irq_handler(struct irq_desc *desc) +{ + struct irq_domain *dom = irq_desc_get_handler_data(desc); + struct irq_chip *host_chip = irq_desc_get_chip(desc); + struct tangox_irq_chip *chip = dom->host_data; + unsigned int status_lo, status_hi; + + chained_irq_enter(host_chip, desc); + + status_lo = intc_readl(chip, chip->ctl + IRQ_STATUS); + status_hi = intc_readl(chip, chip->ctl + IRQ_CTL_HI + IRQ_STATUS); + + tangox_dispatch_irqs(dom, status_lo, 0); + tangox_dispatch_irqs(dom, status_hi, 32); + + chained_irq_exit(host_chip, desc); +} + +static int tangox_irq_set_type(struct irq_data *d, unsigned int flow_type) +{ + struct 
irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct tangox_irq_chip *chip = gc->domain->host_data; + struct irq_chip_regs *regs = &gc->chip_types[0].regs; + + switch (flow_type & IRQ_TYPE_SENSE_MASK) { + case IRQ_TYPE_EDGE_RISING: + intc_writel(chip, regs->type + EDGE_CFG_RISE_SET, d->mask); + intc_writel(chip, regs->type + EDGE_CFG_FALL_CLR, d->mask); + break; + + case IRQ_TYPE_EDGE_FALLING: + intc_writel(chip, regs->type + EDGE_CFG_RISE_CLR, d->mask); + intc_writel(chip, regs->type + EDGE_CFG_FALL_SET, d->mask); + break; + + case IRQ_TYPE_LEVEL_HIGH: + intc_writel(chip, regs->type + EDGE_CFG_RISE_CLR, d->mask); + intc_writel(chip, regs->type + EDGE_CFG_FALL_CLR, d->mask); + break; + + case IRQ_TYPE_LEVEL_LOW: + intc_writel(chip, regs->type + EDGE_CFG_RISE_SET, d->mask); + intc_writel(chip, regs->type + EDGE_CFG_FALL_SET, d->mask); + break; + + default: + pr_err("Invalid trigger mode %x for IRQ %d\n", + flow_type, d->irq); + return -EINVAL; + } + + return irq_setup_alt_chip(d, flow_type); +} + +static void __init tangox_irq_init_chip(struct irq_chip_generic *gc, + unsigned long ctl_offs, + unsigned long edge_offs) +{ + struct tangox_irq_chip *chip = gc->domain->host_data; + struct irq_chip_type *ct = gc->chip_types; + unsigned long ctl_base = chip->ctl + ctl_offs; + unsigned long edge_base = EDGE_CTL_BASE + edge_offs; + int i; + + gc->reg_base = chip->base; + gc->unused = 0; + + for (i = 0; i < 2; i++) { + ct[i].chip.irq_ack = irq_gc_ack_set_bit; + ct[i].chip.irq_mask = irq_gc_mask_disable_reg; + ct[i].chip.irq_mask_ack = irq_gc_mask_disable_reg_and_ack; + ct[i].chip.irq_unmask = irq_gc_unmask_enable_reg; + ct[i].chip.irq_set_type = tangox_irq_set_type; + ct[i].chip.name = gc->domain->name; + + ct[i].regs.enable = ctl_base + IRQ_EN_SET; + ct[i].regs.disable = ctl_base + IRQ_EN_CLR; + ct[i].regs.ack = edge_base + EDGE_RAWSTAT; + ct[i].regs.type = edge_base; + } + + ct[0].type = IRQ_TYPE_LEVEL_MASK; + ct[0].handler = handle_level_irq; + + ct[1].type = IRQ_TYPE_EDGE_BOTH; + ct[1].handler = handle_edge_irq; + + intc_writel(chip, ct->regs.disable, 0xffffffff); + intc_writel(chip, ct->regs.ack, 0xffffffff); +} + +static void __init tangox_irq_domain_init(struct irq_domain *dom) +{ + struct irq_chip_generic *gc; + int i; + + for (i = 0; i < 2; i++) { + gc = irq_get_domain_generic_chip(dom, i * 32); + tangox_irq_init_chip(gc, i * IRQ_CTL_HI, i * EDGE_CTL_HI); + } +} + +static int __init tangox_irq_init(void __iomem *base, struct resource *baseres, + struct device_node *node) +{ + struct tangox_irq_chip *chip; + struct irq_domain *dom; + struct resource res; + int irq; + int err; + + irq = irq_of_parse_and_map(node, 0); + if (!irq) + panic("%s: failed to get IRQ", node->name); + + err = of_address_to_resource(node, 0, &res); + if (err) + panic("%s: failed to get address", node->name); + + chip = kzalloc(sizeof(*chip), GFP_KERNEL); + chip->ctl = res.start - baseres->start; + chip->base = base; + + dom = irq_domain_add_linear(node, 64, &irq_generic_chip_ops, chip); + if (!dom) + panic("%s: failed to create irqdomain", node->name); + + err = irq_alloc_domain_generic_chips(dom, 32, 2, node->name, + handle_level_irq, 0, 0, 0); + if (err) + panic("%s: failed to allocate irqchip", node->name); + + tangox_irq_domain_init(dom); + + irq_set_chained_handler(irq, tangox_irq_handler); + irq_set_handler_data(irq, dom); + + return 0; +} + +static int __init tangox_of_irq_init(struct device_node *node, + struct device_node *parent) +{ + struct device_node *c; + struct resource res; + void __iomem 
*base; + + base = of_iomap(node, 0); + if (!base) + panic("%s: of_iomap failed", node->name); + + of_address_to_resource(node, 0, &res); + + for_each_child_of_node(node, c) + tangox_irq_init(base, &res, c); + + return 0; +} +IRQCHIP_DECLARE(tangox_intc, "sigma,smp8642-intc", tangox_of_irq_init); -- cgit v0.10.2 From d2b383dcf49db953a7438810803fdd7ac1e1f385 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 9 Feb 2016 11:19:20 +0100 Subject: irqchip/ts4800: Add hardware dependency The Technologic Systems TS-4800 is an i.MX515 board, so its drivers are useless unless building a SOC_IMX51 kernel, except for build testing purposes. Signed-off-by: Jean Delvare Cc: Damien Riegel Cc: Thomas Gleixner Cc: Jason Cooper Cc: Marc Zyngier Link: https://lkml.kernel.org/r/20160209111920.1ec318bd@endymion Signed-off-by: Jason Cooper diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 8115a32..72e5a84 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -165,6 +165,7 @@ config TS4800_IRQ tristate "TS-4800 IRQ controller" select IRQ_DOMAIN depends on HAS_IOMEM + depends on SOC_IMX51 || COMPILE_TEST help Support for the TS-4800 FPGA IRQ controller -- cgit v0.10.2 From b6bc902ddca18749253093f5f8dc15391d8c2356 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 4 Feb 2016 13:14:28 -0800 Subject: irqchip/bcm2836: Drop extra memory barrier in SMP boot. The writel() immediately after this has a barrier, anyway. Signed-off-by: Eric Anholt Link: https://lkml.kernel.org/r/1454620468-31303-1-git-send-email-eric@anholt.net Signed-off-by: Jason Cooper diff --git a/drivers/irqchip/irq-bcm2836.c b/drivers/irqchip/irq-bcm2836.c index 963065a..b6e950d 100644 --- a/drivers/irqchip/irq-bcm2836.c +++ b/drivers/irqchip/irq-bcm2836.c @@ -229,7 +229,6 @@ int __init bcm2836_smp_boot_secondary(unsigned int cpu, unsigned long secondary_startup_phys = (unsigned long)virt_to_phys((void *)secondary_startup); - dsb(); writel(secondary_startup_phys, intc.base + LOCAL_MAILBOX3_SET0 + 16 * cpu); -- cgit v0.10.2 From e4e1c0ea731e4a3df470110bfdf8a18d11d59351 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Sun, 14 Feb 2016 21:50:04 +0800 Subject: irqchip/ts4800: Make ts4800_ic_ops static const ts4800_ic_ops is only referenced in this driver, so make it static. In additional, it's never get modified thus also make it const. Signed-off-by: Axel Lin Reviewed-by: Damien Riegel Link: https://lkml.kernel.org/r/1455457804.13175.1.camel@ingics.com Signed-off-by: Jason Cooper diff --git a/drivers/irqchip/irq-ts4800.c b/drivers/irqchip/irq-ts4800.c index 4192bdc..2325fb3 100644 --- a/drivers/irqchip/irq-ts4800.c +++ b/drivers/irqchip/irq-ts4800.c @@ -59,7 +59,7 @@ static int ts4800_irqdomain_map(struct irq_domain *d, unsigned int irq, return 0; } -struct irq_domain_ops ts4800_ic_ops = { +static const struct irq_domain_ops ts4800_ic_ops = { .map = ts4800_irqdomain_map, .xlate = irq_domain_xlate_onecell, }; -- cgit v0.10.2 From 548acf19234dbda5a52d5a8e7e205af46e9da840 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Feb 2016 10:20:12 -0800 Subject: x86/mm: Expand the exception table logic to allow new handling options Huge amounts of help from Andy Lutomirski and Borislav Petkov to produce this. Andy provided the inspiration to add classes to the exception table with a clever bit-squeezing trick, Boris pointed out how much cleaner it would all be if we just had a new field. 
Linus Torvalds blessed the expansion with: ' I'd rather not be clever in order to save just a tiny amount of space in the exception table, which isn't really criticial for anybody. ' The third field is another relative function pointer, this one to a handler that executes the actions. We start out with three handlers: 1: Legacy - just jumps the to fixup IP 2: Fault - provide the trap number in %ax to the fixup code 3: Cleaned up legacy for the uaccess error hack Signed-off-by: Tony Luck Reviewed-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f6af78fcbd348cf4939875cfda9c19689b5e50b8.1455732970.git.tony.luck@intel.com Signed-off-by: Ingo Molnar diff --git a/Documentation/x86/exception-tables.txt b/Documentation/x86/exception-tables.txt index 32901aa..e396bcd 100644 --- a/Documentation/x86/exception-tables.txt +++ b/Documentation/x86/exception-tables.txt @@ -290,3 +290,38 @@ Due to the way that the exception table is built and needs to be ordered, only use exceptions for code in the .text section. Any other section will cause the exception table to not be sorted correctly, and the exceptions will fail. + +Things changed when 64-bit support was added to x86 Linux. Rather than +double the size of the exception table by expanding the two entries +from 32-bits to 64 bits, a clever trick was used to store addresses +as relative offsets from the table itself. The assembly code changed +from: + .long 1b,3b +to: + .long (from) - . + .long (to) - . + +and the C-code that uses these values converts back to absolute addresses +like this: + + ex_insn_addr(const struct exception_table_entry *x) + { + return (unsigned long)&x->insn + x->insn; + } + +In v4.6 the exception table entry was expanded with a new field "handler". +This is also 32-bits wide and contains a third relative function +pointer which points to one of: + +1) int ex_handler_default(const struct exception_table_entry *fixup) + This is legacy case that just jumps to the fixup code +2) int ex_handler_fault(const struct exception_table_entry *fixup) + This case provides the fault number of the trap that occurred at + entry->insn. It is used to distinguish page faults from machine + check. +3) int ex_handler_ext(const struct exception_table_entry *fixup) + This case is used for uaccess_err ... we need to set a flag + in the task structure. Before the handler functions existed this + case was handled by adding a large offset to the fixup to tag + it as special. +More functions can easily be added. diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 189679a..f5063b6 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -44,19 +44,22 @@ /* Exception table entry */ #ifdef __ASSEMBLY__ -# define _ASM_EXTABLE(from,to) \ +# define _ASM_EXTABLE_HANDLE(from, to, handler) \ .pushsection "__ex_table","a" ; \ - .balign 8 ; \ + .balign 4 ; \ .long (from) - . ; \ .long (to) - . ; \ + .long (handler) - . ; \ .popsection -# define _ASM_EXTABLE_EX(from,to) \ - .pushsection "__ex_table","a" ; \ - .balign 8 ; \ - .long (from) - . ; \ - .long (to) - . 
+ 0x7ffffff0 ; \ - .popsection +# define _ASM_EXTABLE(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) + +# define _ASM_EXTABLE_FAULT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) + +# define _ASM_EXTABLE_EX(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) # define _ASM_NOKPROBE(entry) \ .pushsection "_kprobe_blacklist","aw" ; \ @@ -89,19 +92,24 @@ .endm #else -# define _ASM_EXTABLE(from,to) \ +# define _EXPAND_EXTABLE_HANDLE(x) #x +# define _ASM_EXTABLE_HANDLE(from, to, handler) \ " .pushsection \"__ex_table\",\"a\"\n" \ - " .balign 8\n" \ + " .balign 4\n" \ " .long (" #from ") - .\n" \ " .long (" #to ") - .\n" \ + " .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n" \ " .popsection\n" -# define _ASM_EXTABLE_EX(from,to) \ - " .pushsection \"__ex_table\",\"a\"\n" \ - " .balign 8\n" \ - " .long (" #from ") - .\n" \ - " .long (" #to ") - . + 0x7ffffff0\n" \ - " .popsection\n" +# define _ASM_EXTABLE(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_default) + +# define _ASM_EXTABLE_FAULT(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) + +# define _ASM_EXTABLE_EX(from, to) \ + _ASM_EXTABLE_HANDLE(from, to, ex_handler_ext) + /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a4a30e4..c0f27d7 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -90,12 +90,11 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un likely(!__range_not_ok(addr, size, user_addr_max())) /* - * The exception table consists of pairs of addresses relative to the - * exception table enty itself: the first is the address of an - * instruction that is allowed to fault, and the second is the address - * at which the program should continue. No registers are modified, - * so it is entirely up to the continuation code to figure out what to - * do. + * The exception table consists of triples of addresses relative to the + * exception table entry itself. The first address is of an instruction + * that is allowed to fault, the second is the target at which the program + * should continue. The third is a handler function to deal with the fault + * caused by the instruction in the first field. * * All the routines below use bits of fixup code that are out of line * with the main instruction path. This means when everything is well, @@ -104,13 +103,14 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un */ struct exception_table_entry { - int insn, fixup; + int insn, fixup, handler; }; /* This is not the generic standard exception_table_entry format */ #define ARCH_HAS_SORT_EXTABLE #define ARCH_HAS_SEARCH_EXTABLE -extern int fixup_exception(struct pt_regs *regs); +extern int fixup_exception(struct pt_regs *regs, int trapnr); +extern bool ex_has_fault_handler(unsigned long ip); extern int early_fixup_exception(unsigned long *ip); /* diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 1deffe6..0f05dee 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -988,7 +988,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) * In case the user-specified fault handler returned * zero, try to fix up. 
*/ - if (fixup_exception(regs)) + if (fixup_exception(regs, trapnr)) return 1; /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ade185a..211c11c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -199,7 +199,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, } if (!user_mode(regs)) { - if (!fixup_exception(regs)) { + if (!fixup_exception(regs, trapnr)) { tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; die(str, regs, error_code); @@ -453,7 +453,7 @@ do_general_protection(struct pt_regs *regs, long error_code) tsk = current; if (!user_mode(regs)) { - if (fixup_exception(regs)) + if (fixup_exception(regs, X86_TRAP_GP)) return; tsk->thread.error_code = error_code; @@ -699,7 +699,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) conditional_sti(regs); if (!user_mode(regs)) { - if (!fixup_exception(regs)) { + if (!fixup_exception(regs, trapnr)) { task->thread.error_code = error_code; task->thread.trap_nr = trapnr; die(str, regs, error_code); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 903ec1e..9dd7e4b 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -3,6 +3,9 @@ #include #include +typedef bool (*ex_handler_t)(const struct exception_table_entry *, + struct pt_regs *, int); + static inline unsigned long ex_insn_addr(const struct exception_table_entry *x) { @@ -13,11 +16,56 @@ ex_fixup_addr(const struct exception_table_entry *x) { return (unsigned long)&x->fixup + x->fixup; } +static inline ex_handler_t +ex_fixup_handler(const struct exception_table_entry *x) +{ + return (ex_handler_t)((unsigned long)&x->handler + x->handler); +} -int fixup_exception(struct pt_regs *regs) +bool ex_handler_default(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) { - const struct exception_table_entry *fixup; - unsigned long new_ip; + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_default); + +bool ex_handler_fault(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + regs->ip = ex_fixup_addr(fixup); + regs->ax = trapnr; + return true; +} +EXPORT_SYMBOL_GPL(ex_handler_fault); + +bool ex_handler_ext(const struct exception_table_entry *fixup, + struct pt_regs *regs, int trapnr) +{ + /* Special hack for uaccess_err */ + current_thread_info()->uaccess_err = 1; + regs->ip = ex_fixup_addr(fixup); + return true; +} +EXPORT_SYMBOL(ex_handler_ext); + +bool ex_has_fault_handler(unsigned long ip) +{ + const struct exception_table_entry *e; + ex_handler_t handler; + + e = search_exception_tables(ip); + if (!e) + return false; + handler = ex_fixup_handler(e); + + return handler == ex_handler_fault; +} + +int fixup_exception(struct pt_regs *regs, int trapnr) +{ + const struct exception_table_entry *e; + ex_handler_t handler; #ifdef CONFIG_PNPBIOS if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { @@ -33,42 +81,34 @@ int fixup_exception(struct pt_regs *regs) } #endif - fixup = search_exception_tables(regs->ip); - if (fixup) { - new_ip = ex_fixup_addr(fixup); - - if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) { - /* Special hack for uaccess_err */ - current_thread_info()->uaccess_err = 1; - new_ip -= 0x7ffffff0; - } - regs->ip = new_ip; - return 1; - } + e = search_exception_tables(regs->ip); + if (!e) + return 0; - return 0; + handler = ex_fixup_handler(e); + return handler(e, regs, trapnr); } /* Restricted version used during very early boot */ int __init early_fixup_exception(unsigned long *ip) { - 
const struct exception_table_entry *fixup; + const struct exception_table_entry *e; unsigned long new_ip; + ex_handler_t handler; - fixup = search_exception_tables(*ip); - if (fixup) { - new_ip = ex_fixup_addr(fixup); + e = search_exception_tables(*ip); + if (!e) + return 0; - if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) { - /* uaccess handling not supported during early boot */ - return 0; - } + new_ip = ex_fixup_addr(e); + handler = ex_fixup_handler(e); - *ip = new_ip; - return 1; - } + /* special handling not supported during early boot */ + if (handler != ex_handler_default) + return 0; - return 0; + *ip = new_ip; + return 1; } /* @@ -133,6 +173,8 @@ void sort_extable(struct exception_table_entry *start, i += 4; p->fixup += i; i += 4; + p->handler += i; + i += 4; } sort(start, finish - start, sizeof(struct exception_table_entry), @@ -145,6 +187,8 @@ void sort_extable(struct exception_table_entry *start, i += 4; p->fixup -= i; i += 4; + p->handler -= i; + i += 4; } } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index eef44d9..495946c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -656,7 +656,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, int sig; /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) { + if (fixup_exception(regs, X86_TRAP_PF)) { /* * Any interrupt that takes a fault gets the fixup. This makes * the below recursive fault logic only apply to a faults from diff --git a/scripts/sortextable.c b/scripts/sortextable.c index c2423d9..7b29fb1 100644 --- a/scripts/sortextable.c +++ b/scripts/sortextable.c @@ -209,6 +209,35 @@ static int compare_relative_table(const void *a, const void *b) return 0; } +static void x86_sort_relative_table(char *extab_image, int image_size) +{ + int i; + + i = 0; + while (i < image_size) { + uint32_t *loc = (uint32_t *)(extab_image + i); + + w(r(loc) + i, loc); + w(r(loc + 1) + i + 4, loc + 1); + w(r(loc + 2) + i + 8, loc + 2); + + i += sizeof(uint32_t) * 3; + } + + qsort(extab_image, image_size / 12, 12, compare_relative_table); + + i = 0; + while (i < image_size) { + uint32_t *loc = (uint32_t *)(extab_image + i); + + w(r(loc) - i, loc); + w(r(loc + 1) - (i + 4), loc + 1); + w(r(loc + 2) - (i + 8), loc + 2); + + i += sizeof(uint32_t) * 3; + } +} + static void sort_relative_table(char *extab_image, int image_size) { int i; @@ -281,6 +310,9 @@ do_file(char const *const fname) break; case EM_386: case EM_X86_64: + custom_sort = x86_sort_relative_table; + break; + case EM_S390: custom_sort = sort_relative_table; break; -- cgit v0.10.2 From b2f9d678e28ca71ce650eac82f26dd287b47e89a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Feb 2016 10:20:13 -0800 Subject: x86/mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries Extend the severity checking code to add a new context IN_KERN_RECOV which is used to indicate that the machine check was triggered by code in the kernel tagged with _ASM_EXTABLE_FAULT() so that the ex_handler_fault() handler will provide the fixup code with the trap number. Major re-work to the tail code in do_machine_check() to make all this readable/maintainable. One functional change is that tolerant=3 no longer stops recovery actions. Revert to only skipping sending SIGBUS to the current process. 
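To illustrate the mechanism described above, a kernel access that should survive a machine check is tagged with _ASM_EXTABLE_FAULT() from the previous patch; the helper below is a hedged sketch only (the function and its calling convention are hypothetical, while _ASM_EXTABLE_FAULT() and the "trap number in %ax" behaviour of ex_handler_fault() come from these patches):

	/* Sketch: the load at label 1 may take a recoverable #MC.  On a
	 * fault, ex_handler_fault() rewrites the IP to label 2 and places
	 * the trap number (X86_TRAP_MC) in %rax; on the normal path %rax
	 * is cleared, so the return value is 0 on success. */
	static inline int mc_safe_read_u64(u64 *dst, const u64 *src)
	{
		int trap;

		asm volatile("1:	movq (%[src]), %%r8\n"
			     "	movq %%r8, (%[dst])\n"
			     "	xorl %%eax, %%eax\n"
			     "2:\n"
			     _ASM_EXTABLE_FAULT(1b, 2b)
			     : "=a" (trap)
			     : [src] "r" (src), [dst] "r" (dst)
			     : "r8", "memory");

		return trap;
	}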
Signed-off-by: Tony Luck Reviewed-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/89d243d05a7943bb187d1074bb30d9c4f482d5f5.1455732970.git.tony.luck@intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 9c682c2..5119766 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "mce-internal.h" @@ -29,7 +30,7 @@ * panic situations) */ -enum context { IN_KERNEL = 1, IN_USER = 2 }; +enum context { IN_KERNEL = 1, IN_USER = 2, IN_KERNEL_RECOV = 3 }; enum ser { SER_REQUIRED = 1, NO_SER = 2 }; enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; @@ -48,6 +49,7 @@ static struct severity { #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } #define KERNEL .context = IN_KERNEL #define USER .context = IN_USER +#define KERNEL_RECOV .context = IN_KERNEL_RECOV #define SER .ser = SER_REQUIRED #define NOSER .ser = NO_SER #define EXCP .excp = EXCP_CONTEXT @@ -87,6 +89,10 @@ static struct severity { EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) ), MCESEV( + PANIC, "In kernel and no restart IP", + EXCP, KERNEL_RECOV, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( DEFERRED, "Deferred error", NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) ), @@ -123,6 +129,11 @@ static struct severity { MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV) ), MCESEV( + AR, "Action required: data load in error recoverable area of kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + KERNEL_RECOV + ), + MCESEV( AR, "Action required: data load error in a user process", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), USER @@ -170,6 +181,9 @@ static struct severity { ) /* always matches. keep at end */ }; +#define mc_recoverable(mcg) (((mcg) & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) == \ + (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) + /* * If mcgstatus indicated that ip/cs on the stack were * no good, then "m->cs" will be zero and we will have @@ -183,7 +197,11 @@ static struct severity { */ static int error_context(struct mce *m) { - return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; + if ((m->cs & 3) == 3) + return IN_USER; + if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) + return IN_KERNEL_RECOV; + return IN_KERNEL; } /* diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b718080..524f2a8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -961,6 +961,20 @@ static void mce_clear_state(unsigned long *toclear) } } +static int do_memory_failure(struct mce *m) +{ + int flags = MF_ACTION_REQUIRED; + int ret; + + pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); + if (!(m->mcgstatus & MCG_STATUS_RIPV)) + flags |= MF_MUST_KILL; + ret = memory_failure(m->addr >> PAGE_SHIFT, MCE_VECTOR, flags); + if (ret) + pr_err("Memory error not recovered"); + return ret; +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. 
@@ -998,8 +1012,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) DECLARE_BITMAP(toclear, MAX_NR_BANKS); DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; - u64 recover_paddr = ~0ull; - int flags = MF_ACTION_REQUIRED; int lmce = 0; /* If this CPU is offline, just bail out. */ @@ -1136,22 +1148,13 @@ void do_machine_check(struct pt_regs *regs, long error_code) } /* - * At insane "tolerant" levels we take no action. Otherwise - * we only die if we have no other choice. For less serious - * issues we try to recover, or limit damage to the current - * process. + * If tolerant is at an insane level we drop requests to kill + * processes and continue even when there is no way out. */ - if (cfg->tolerant < 3) { - if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); - if (worst == MCE_AR_SEVERITY) { - recover_paddr = m.addr; - if (!(m.mcgstatus & MCG_STATUS_RIPV)) - flags |= MF_MUST_KILL; - } else if (kill_it) { - force_sig(SIGBUS, current); - } - } + if (cfg->tolerant == 3) + kill_it = 0; + else if (no_way_out) + mce_panic("Fatal machine check on current CPU", &m, msg); if (worst > 0) mce_report_event(regs); @@ -1159,25 +1162,24 @@ void do_machine_check(struct pt_regs *regs, long error_code) out: sync_core(); - if (recover_paddr == ~0ull) - goto done; + if (worst != MCE_AR_SEVERITY && !kill_it) + goto out_ist; - pr_err("Uncorrected hardware memory error in user-access at %llx", - recover_paddr); - /* - * We must call memory_failure() here even if the current process is - * doomed. We still need to mark the page as poisoned and alert any - * other users of the page. - */ - ist_begin_non_atomic(regs); - local_irq_enable(); - if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) { - pr_err("Memory error not recovered"); - force_sig(SIGBUS, current); + /* Fault was in user mode and we need to take some action */ + if ((m.cs & 3) == 3) { + ist_begin_non_atomic(regs); + local_irq_enable(); + + if (kill_it || do_memory_failure(&m)) + force_sig(SIGBUS, current); + local_irq_disable(); + ist_end_non_atomic(); + } else { + if (!fixup_exception(regs, X86_TRAP_MC)) + mce_panic("Failed kernel mode recovery", &m, NULL); } - local_irq_disable(); - ist_end_non_atomic(); -done: + +out_ist: ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check); -- cgit v0.10.2 From 0f68c088c0adb3c3bbeb487c4ebcde91fd5d34be Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 17 Feb 2016 10:20:13 -0800 Subject: x86/cpufeature: Create a new synthetic cpu capability for machine check recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Intel Software Developer Manual describes bit 24 in the MCG_CAP MSR: MCG_SER_P (software error recovery support present) flag, bit 24 — Indicates (when set) that the processor supports software error recovery But only some models with this capability bit set will actually generate recoverable machine checks. Check the model name and set a synthetic capability bit. Provide a command line option to set this bit anyway in case the kernel doesn't recognise the model name. 
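The intended consumer pattern for the new bit is a plain capability check rather than repeated model-number parsing; a hedged sketch (the caller shown here is hypothetical, only X86_FEATURE_MCE_RECOVERY itself is introduced by this patch):

	/* Sketch: code that wants the recoverable, exception-table based
	 * path can key off the synthetic flag, which is set either by the
	 * Xeon E7 model check below or by the mce=recovery command line
	 * override. */
	static bool mce_recovery_supported(void)
	{
		return boot_cpu_has(X86_FEATURE_MCE_RECOVERY);
	}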
Signed-off-by: Tony Luck Reviewed-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2e5bfb23c89800a036fb8a45fa97a74bb16bc362.1455732970.git.tony.luck@intel.com Signed-off-by: Ingo Molnar diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 68ed311..0965a71 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -60,6 +60,8 @@ Machine check threshold to 1. Enabling this may make memory predictive failure analysis less effective if the bios sets thresholds for memory errors since we will not see details for all errors. + mce=recovery + Force-enable recoverable machine check code paths nomce (for compatibility with i386): same as mce=off diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 0ceb6ad..6663fae 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -106,6 +106,7 @@ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ #define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ +#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 2ea4527..18d2ba9 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -113,6 +113,7 @@ struct mca_config { bool ignore_ce; bool disabled; bool ser; + bool recovery; bool bios_cmci_threshold; u8 banks; s8 bootlog; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a006f4c..b5b187c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1576,6 +1576,17 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model == 45) quirk_no_way_out = quirk_sandybridge_ifu; + /* + * MCG_CAP.MCG_SER_P is necessary but not sufficient to know + * whether this processor will actually generate recoverable + * machine checks. Check to see if this is an E7 model Xeon. + * We can't do a model number check because E5 and E7 use the + * same model number. E5 doesn't support recovery, E7 does. + */ + if (mca_cfg.recovery || (mca_cfg.ser && + !strncmp(c->x86_model_id, + "Intel(R) Xeon(R) CPU E7-", 24))) + set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY); } if (cfg->monarch_timeout < 0) cfg->monarch_timeout = 0; @@ -2028,6 +2039,8 @@ static int __init mcheck_enable(char *str) cfg->bootlog = (str[0] == 'b'); else if (!strcmp(str, "bios_cmci_threshold")) cfg->bios_cmci_threshold = true; + else if (!strcmp(str, "recovery")) + cfg->recovery = true; else if (isdigit(str[0])) { if (get_option(&str, &cfg->tolerant) == 2) get_option(&str, &(cfg->monarch_timeout)); -- cgit v0.10.2 From a55e5663761366fb883f6f25375dd68bc958b9db Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 17 Feb 2016 10:57:19 -0300 Subject: perf evlist: Reference count the cpu and thread maps at set_maps() We were dropping the reference we possibly held but not obtaining one for the new maps, which we will drop at perf_evlist__delete(), fix it. 
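The fix follows the usual get/put discipline for the reference-counted maps; as a standalone sketch of the pattern (not the literal evlist code, which is in the hunk below):

	/* Sketch: when replacing a refcounted pointer, grab a reference on
	 * the incoming object while dropping the one currently held, so
	 * the later perf_evlist__delete() put stays balanced. */
	static void replace_cpu_map(struct cpu_map **slot, struct cpu_map *new)
	{
		if (*slot != new) {
			cpu_map__put(*slot);
			*slot = cpu_map__get(new);
		}
	}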
This was caught by Steven Noonan in some of the machines which would produce this output when caught by glibc debug mechanisms: $ sudo perf test 21 21: Test object code reading :*** Error in `perf': corrupted double-linked list: 0x00000000023ffcd0 *** ======= Backtrace: ========= /usr/lib/libc.so.6(+0x72055)[0x7f25be0f3055] /usr/lib/libc.so.6(+0x779b6)[0x7f25be0f89b6] /usr/lib/libc.so.6(+0x7a0ed)[0x7f25be0fb0ed] /usr/lib/libc.so.6(__libc_calloc+0xba)[0x7f25be0fceda] perf(parse_events_lex_init_extra+0x38)[0x4cfff8] perf(parse_events+0x55)[0x4a0615] perf(perf_evlist__config+0xcf)[0x4eeb2f] perf[0x479f82] perf(test__code_reading+0x1e)[0x47ad4e] perf(cmd_test+0x5dd)[0x46452d] perf[0x47f4e3] perf(main+0x603)[0x42c723] /usr/lib/libc.so.6(__libc_start_main+0xf0)[0x7f25be0a1610] perf(_start+0x29)[0x42c859] Further investigation using valgrind led to the reference count imbalance fixed in this patch. Reported-and-Tested-by: Steven Noonan Report-Link: http://lkml.kernel.org/r/CAKbGBLjC2Dx5vshxyGmQkcD+VwiAQLbHoXA9i7kvRB2-2opHZQ@mail.gmail.com Cc: Adrian Hunter Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Fixes: f30a79b012e5 ("perf tools: Add reference counting for cpu_map object") Link: http://lkml.kernel.org/n/tip-j0u1bdhr47sa511sgg76kb8h@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index d81f13d..a7eb0ea 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1181,12 +1181,12 @@ void perf_evlist__set_maps(struct perf_evlist *evlist, struct cpu_map *cpus, */ if (cpus != evlist->cpus) { cpu_map__put(evlist->cpus); - evlist->cpus = cpus; + evlist->cpus = cpu_map__get(cpus); } if (threads != evlist->threads) { thread_map__put(evlist->threads); - evlist->threads = threads; + evlist->threads = thread_map__get(threads); } perf_evlist__propagate_maps(evlist); -- cgit v0.10.2 From 85723885feb823b4fc352b727ece0b6d00306c4d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 15 Feb 2016 09:34:31 +0100 Subject: perf record: Add --all-user/--all-kernel options Allow user to easily switch all events to user or kernel space with simple --all-user or --all-kernel options. This will be handy within perf mem/c2c wrappers to switch easily monitoring modes. Committer note: Testing it: # perf record --all-kernel --all-user -a sleep 2 Error: option `all-user' cannot be used with all-kernel Usage: perf record [] [] or: perf record [] -- [] --all-user Configure all used events to run in user space. --all-kernel Configure all used events to run in kernel space. # perf record --all-user --all-kernel -a sleep 2 Error: option `all-kernel' cannot be used with all-user Usage: perf record [] [] or: perf record [] -- [] --all-kernel Configure all used events to run in kernel space. --all-user Configure all used events to run in user space. 
# perf record --all-user -a sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 1.416 MB perf.data (162 samples) ] # perf report | grep '\[k\]' # perf record --all-kernel -a sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 1.423 MB perf.data (296 samples) ] # perf report | grep '\[\.\]' # Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1455525293-8671-2-git-send-email-jolsa@kernel.org [ Made those options to be mutually exclusive ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index fbceb63..19aa175 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -341,6 +341,12 @@ Specify vmlinux path which has debuginfo. --buildid-all:: Record build-id of all DSOs regardless whether it's actually hit or not. +--all-kernel:: +Configure all used events to run in kernel space. + +--all-user:: +Configure all used events to run in user space. + SEE ALSO -------- linkperf:perf-stat[1], linkperf:perf-list[1] diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 0ee0d5c..cf3a28d 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1140,6 +1140,12 @@ struct option __record_options[] = { "per thread proc mmap processing timeout in ms"), OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events, "Record context switch events"), + OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel, + "Configure all used events to run in kernel space.", + PARSE_OPT_EXCLUSIVE), + OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user, + "Configure all used events to run in user space.", + PARSE_OPT_EXCLUSIVE), OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path", "clang binary to use for compiling BPF scriptlets"), OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options", diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 90129ac..5381a01 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -58,6 +58,8 @@ struct record_opts { bool full_auxtrace; bool auxtrace_snapshot_mode; bool record_switch_events; + bool all_kernel; + bool all_user; unsigned int freq; unsigned int mmap_pages; unsigned int auxtrace_mmap_pages; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 4678086..6ae20d0 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -898,6 +898,16 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts) if (evsel->precise_max) perf_event_attr__set_max_precise_ip(attr); + if (opts->all_user) { + attr->exclude_kernel = 1; + attr->exclude_user = 0; + } + + if (opts->all_kernel) { + attr->exclude_kernel = 0; + attr->exclude_user = 1; + } + /* * Apply event specific term settings, * it overloads any global configuration. -- cgit v0.10.2 From 018361767a21fb2d5ebd3ac182c04baf8a8b4e08 Mon Sep 17 00:00:00 2001 From: Gabor Juhos Date: Wed, 17 Feb 2016 12:58:20 +0100 Subject: pata-rb532-cf: get rid of the irq_to_gpio() call The RB532 platform specific irq_to_gpio() implementation has been removed with commit 832f5dacfa0b ("MIPS: Remove all the uses of custom gpio.h"). 
Now the platform uses the generic stub which causes the following error: pata-rb532-cf pata-rb532-cf: no GPIO found for irq149 pata-rb532-cf: probe of pata-rb532-cf failed with error -2 Drop the irq_to_gpio() call and get the GPIO number from platform data instead. After this change, the driver works again: scsi host0: pata-rb532-cf ata1: PATA max PIO4 irq 149 ata1.00: CFA: CF 1GB, 20080820, max MWDMA4 ata1.00: 1989792 sectors, multi 0: LBA ata1.00: configured for PIO4 scsi 0:0:0:0: Direct-Access ATA CF 1GB 0820 PQ: 0\ ANSI: 5 sd 0:0:0:0: [sda] 1989792 512-byte logical blocks: (1.01 GB/971 MiB) sd 0:0:0:0: [sda] Write Protect is off sd 0:0:0:0: [sda] Write cache: disabled, read cache: enabled, doesn't\ support DPO or FUA sda: sda1 sda2 sd 0:0:0:0: [sda] Attached SCSI disk Fixes: 832f5dacfa0b ("MIPS: Remove all the uses of custom gpio.h") Cc: Alban Bedel Cc: Ralf Baechle Cc: Arnd Bergmann Cc: #v4.3+ Signed-off-by: Gabor Juhos Signed-off-by: Tejun Heo diff --git a/drivers/ata/pata_rb532_cf.c b/drivers/ata/pata_rb532_cf.c index 12fe0f3..c8b6a78 100644 --- a/drivers/ata/pata_rb532_cf.c +++ b/drivers/ata/pata_rb532_cf.c @@ -32,6 +32,8 @@ #include #include +#include + #define DRV_NAME "pata-rb532-cf" #define DRV_VERSION "0.1.0" #define DRV_DESC "PATA driver for RouterBOARD 532 Compact Flash" @@ -107,6 +109,7 @@ static int rb532_pata_driver_probe(struct platform_device *pdev) int gpio; struct resource *res; struct ata_host *ah; + struct cf_device *pdata; struct rb532_cf_info *info; int ret; @@ -122,7 +125,13 @@ static int rb532_pata_driver_probe(struct platform_device *pdev) return -ENOENT; } - gpio = irq_to_gpio(irq); + pdata = dev_get_platdata(&pdev->dev); + if (!pdata) { + dev_err(&pdev->dev, "no platform data specified\n"); + return -EINVAL; + } + + gpio = pdata->gpio_pin; if (gpio < 0) { dev_err(&pdev->dev, "no GPIO found for irq%d\n", irq); return -ENOENT; -- cgit v0.10.2 From f5bdd66c705484b4bc77eb914be15c1b7881fae7 Mon Sep 17 00:00:00 2001 From: Alexandra Yates Date: Wed, 17 Feb 2016 19:36:20 -0800 Subject: Adding Intel Lewisburg device IDs for SATA This patch complements the list of device IDs previously added for lewisburg sata. 
Signed-off-by: Alexandra Yates Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 546a369..b6263b3 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -367,15 +367,21 @@ static const struct pci_device_id ahci_pci_tbl[] = { { PCI_VDEVICE(INTEL, 0xa107), board_ahci }, /* Sunrise Point-H RAID */ { PCI_VDEVICE(INTEL, 0xa10f), board_ahci }, /* Sunrise Point-H RAID */ { PCI_VDEVICE(INTEL, 0x2822), board_ahci }, /* Lewisburg RAID*/ + { PCI_VDEVICE(INTEL, 0x2823), board_ahci }, /* Lewisburg AHCI*/ { PCI_VDEVICE(INTEL, 0x2826), board_ahci }, /* Lewisburg RAID*/ + { PCI_VDEVICE(INTEL, 0x2827), board_ahci }, /* Lewisburg RAID*/ { PCI_VDEVICE(INTEL, 0xa182), board_ahci }, /* Lewisburg AHCI*/ { PCI_VDEVICE(INTEL, 0xa184), board_ahci }, /* Lewisburg RAID*/ { PCI_VDEVICE(INTEL, 0xa186), board_ahci }, /* Lewisburg RAID*/ { PCI_VDEVICE(INTEL, 0xa18e), board_ahci }, /* Lewisburg RAID*/ + { PCI_VDEVICE(INTEL, 0xa1d2), board_ahci }, /* Lewisburg RAID*/ + { PCI_VDEVICE(INTEL, 0xa1d6), board_ahci }, /* Lewisburg RAID*/ { PCI_VDEVICE(INTEL, 0xa202), board_ahci }, /* Lewisburg AHCI*/ { PCI_VDEVICE(INTEL, 0xa204), board_ahci }, /* Lewisburg RAID*/ { PCI_VDEVICE(INTEL, 0xa206), board_ahci }, /* Lewisburg RAID*/ { PCI_VDEVICE(INTEL, 0xa20e), board_ahci }, /* Lewisburg RAID*/ + { PCI_VDEVICE(INTEL, 0xa252), board_ahci }, /* Lewisburg RAID*/ + { PCI_VDEVICE(INTEL, 0xa256), board_ahci }, /* Lewisburg RAID*/ /* JMicron 360/1/3/5/6, match class to avoid IDE function */ { PCI_VENDOR_ID_JMICRON, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, -- cgit v0.10.2 From ba4a3550e93415f4ff300810b8d0659949fea193 Mon Sep 17 00:00:00 2001 From: Gao Pan Date: Mon, 18 Jan 2016 15:44:01 +0800 Subject: spi: imx: fix spi resource leak with dma transfer In spi_imx_dma_transfer(), when desc_rx = dmaengine_prep_slave_sg() fails, the context goes to label no_dma and then return. However, the memory allocated for desc_tx has not been freed yet, which leads to resource leak. Signed-off-by: Gao Pan Reviewed-by: Fugang Duan Signed-off-by: Mark Brown diff --git a/drivers/spi/spi-imx.c b/drivers/spi/spi-imx.c index 08492d6..c688efa 100644 --- a/drivers/spi/spi-imx.c +++ b/drivers/spi/spi-imx.c @@ -927,7 +927,7 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx, tx->sgl, tx->nents, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!desc_tx) - goto no_dma; + goto tx_nodma; desc_tx->callback = spi_imx_dma_tx_callback; desc_tx->callback_param = (void *)spi_imx; @@ -939,7 +939,7 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx, rx->sgl, rx->nents, DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK); if (!desc_rx) - goto no_dma; + goto rx_nodma; desc_rx->callback = spi_imx_dma_rx_callback; desc_rx->callback_param = (void *)spi_imx; @@ -995,7 +995,9 @@ static int spi_imx_dma_transfer(struct spi_imx_data *spi_imx, return ret; -no_dma: +rx_nodma: + dmaengine_terminate_all(master->dma_tx); +tx_nodma: pr_warn_once("%s %s: DMA not available, falling back to PIO\n", dev_driver_string(&master->dev), dev_name(&master->dev)); -- cgit v0.10.2 From 316fa9e09ad76e095b9d7e9350c628b918370a22 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 18 Feb 2016 15:47:13 +0000 Subject: ASoC: samsung: Use IRQ safe spin lock calls Lockdep warns of a potential lock inversion, i2s->lock is held numerous times whilst we are under the substream lock (snd_pcm_stream_lock). 
If we use the IRQ unsafe spin lock calls, you can also end up locking snd_pcm_stream_lock whilst under i2s->lock (if an IRQ happens whilst we are holding i2s->lock). This could result in deadlock. [ 18.147001] CPU0 CPU1 [ 18.151509] ---- ---- [ 18.156022] lock(&(&pri_dai->spinlock)->rlock); [ 18.160701] local_irq_disable(); [ 18.166622] lock(&(&substream->self_group.lock)->rlock); [ 18.174595] lock(&(&pri_dai->spinlock)->rlock); [ 18.181806] [ 18.184408] lock(&(&substream->self_group.lock)->rlock); [ 18.190045] [ 18.190045] *** DEADLOCK *** This patch changes to using the irq safe spinlock calls, to avoid this issue. Fixes: ce8bcdbb61d9 ("ASoC: samsung: i2s: Protect more registers with a spinlock") Signed-off-by: Charles Keepax Tested-by: Anand Moon Signed-off-by: Mark Brown Cc: stable@vger.kernel.org diff --git a/sound/soc/samsung/i2s.c b/sound/soc/samsung/i2s.c index 84d9e77..70a2559 100644 --- a/sound/soc/samsung/i2s.c +++ b/sound/soc/samsung/i2s.c @@ -481,10 +481,11 @@ static int i2s_set_sysclk(struct snd_soc_dai *dai, unsigned int cdcon_mask = 1 << i2s_regs->cdclkcon_off; unsigned int rsrc_mask = 1 << i2s_regs->rclksrc_off; u32 mod, mask, val = 0; + unsigned long flags; - spin_lock(i2s->lock); + spin_lock_irqsave(i2s->lock, flags); mod = readl(i2s->addr + I2SMOD); - spin_unlock(i2s->lock); + spin_unlock_irqrestore(i2s->lock, flags); switch (clk_id) { case SAMSUNG_I2S_OPCLK: @@ -575,11 +576,11 @@ static int i2s_set_sysclk(struct snd_soc_dai *dai, return -EINVAL; } - spin_lock(i2s->lock); + spin_lock_irqsave(i2s->lock, flags); mod = readl(i2s->addr + I2SMOD); mod = (mod & ~mask) | val; writel(mod, i2s->addr + I2SMOD); - spin_unlock(i2s->lock); + spin_unlock_irqrestore(i2s->lock, flags); return 0; } @@ -590,6 +591,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai, struct i2s_dai *i2s = to_info(dai); int lrp_shift, sdf_shift, sdf_mask, lrp_rlow, mod_slave; u32 mod, tmp = 0; + unsigned long flags; lrp_shift = i2s->variant_regs->lrp_off; sdf_shift = i2s->variant_regs->sdf_off; @@ -649,7 +651,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai, return -EINVAL; } - spin_lock(i2s->lock); + spin_lock_irqsave(i2s->lock, flags); mod = readl(i2s->addr + I2SMOD); /* * Don't change the I2S mode if any controller is active on this @@ -657,7 +659,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai, */ if (any_active(i2s) && ((mod & (sdf_mask | lrp_rlow | mod_slave)) != tmp)) { - spin_unlock(i2s->lock); + spin_unlock_irqrestore(i2s->lock, flags); dev_err(&i2s->pdev->dev, "%s:%d Other DAI busy\n", __func__, __LINE__); return -EAGAIN; @@ -666,7 +668,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai, mod &= ~(sdf_mask | lrp_rlow | mod_slave); mod |= tmp; writel(mod, i2s->addr + I2SMOD); - spin_unlock(i2s->lock); + spin_unlock_irqrestore(i2s->lock, flags); return 0; } @@ -676,6 +678,7 @@ static int i2s_hw_params(struct snd_pcm_substream *substream, { struct i2s_dai *i2s = to_info(dai); u32 mod, mask = 0, val = 0; + unsigned long flags; if (!is_secondary(i2s)) mask |= (MOD_DC2_EN | MOD_DC1_EN); @@ -744,11 +747,11 @@ static int i2s_hw_params(struct snd_pcm_substream *substream, return -EINVAL; } - spin_lock(i2s->lock); + spin_lock_irqsave(i2s->lock, flags); mod = readl(i2s->addr + I2SMOD); mod = (mod & ~mask) | val; writel(mod, i2s->addr + I2SMOD); - spin_unlock(i2s->lock); + spin_unlock_irqrestore(i2s->lock, flags); samsung_asoc_init_dma_data(dai, &i2s->dma_playback, &i2s->dma_capture); -- cgit v0.10.2 From 65b4bcb82967fd5a60694c3477e58a04a9170aea Mon Sep 17 00:00:00 2001 From: Alan Date: Fri, 19 Feb 
2016 11:42:32 +0530 Subject: ASoC: Intel: Skylake: fix pointer scaling skl_tplg_tlv_control_set does pointer maths on data but forgets that data is not uint8_t so the maths is already scaled in the pointer type. Signed-off-by: Alan Cox Signed-off-by: Vinod Koul Signed-off-by: Mark Brown diff --git a/sound/soc/intel/skylake/skl-topology.c b/sound/soc/intel/skylake/skl-topology.c index 4624556..b77c253 100644 --- a/sound/soc/intel/skylake/skl-topology.c +++ b/sound/soc/intel/skylake/skl-topology.c @@ -950,7 +950,7 @@ static int skl_tplg_tlv_control_set(struct snd_kcontrol *kcontrol, return -EFAULT; } else { if (copy_from_user(ac->params, - data + 2 * sizeof(u32), size)) + data + 2, size)) return -EFAULT; } -- cgit v0.10.2 From c27f29bbbf02168c9b1e8ba0fe7a8cb917e5a50f Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Fri, 19 Feb 2016 14:34:43 +0100 Subject: irqchip/mvebu-odmi: Add new driver for platform MSI on Marvell 7K/8K This commits adds a new irqchip driver that handles the ODMI controller found on Marvell 7K/8K processors. The ODMI controller provide MSI interrupt functionality to on-board peripherals, much like the GIC-v2m. Signed-off-by: Thomas Petazzoni Reviewed-by: Marc Zyngier Link: https://lkml.kernel.org/r/1455888883-5127-1-git-send-email-thomas.petazzoni@free-electrons.com Signed-off-by: Jason Cooper diff --git a/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt b/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt new file mode 100644 index 0000000..252d5c9 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt @@ -0,0 +1,41 @@ + +* Marvell ODMI for MSI support + +Some Marvell SoCs have an On-Die Message Interrupt (ODMI) controller +which can be used by on-board peripheral for MSI interrupts. + +Required properties: + +- compatible : The value here should contain "marvell,odmi-controller". + +- interrupt,controller : Identifies the node as an interrupt controller. + +- msi-controller : Identifies the node as an MSI controller. + +- marvell,odmi-frames : Number of ODMI frames available. Each frame + provides a number of events. + +- reg : List of register definitions, one for each + ODMI frame. + +- marvell,spi-base : List of GIC base SPI interrupts, one for each + ODMI frame. Those SPI interrupts are 0-based, + i.e marvell,spi-base = <128> will use SPI #96. + See Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt + for details about the GIC Device Tree binding. + +- interrupt-parent : Reference to the parent interrupt controller. 
+ +Example: + + odmi: odmi@300000 { + compatible = "marvell,odmi-controller"; + interrupt-controller; + msi-controller; + marvell,odmi-frames = <4>; + reg = <0x300000 0x4000>, + <0x304000 0x4000>, + <0x308000 0x4000>, + <0x30C000 0x4000>; + marvell,spi-base = <128>, <136>, <144>, <152>; + }; diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 8115a32..fa5e98c 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -223,3 +223,7 @@ config IRQ_MXS def_bool y if MACH_ASM9260 || ARCH_MXS select IRQ_DOMAIN select STMP_DEVICE + +config MVEBU_ODMI + bool + select GENERIC_MSI_IRQ_DOMAIN diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 30dba04..b37f708 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -59,3 +59,4 @@ obj-$(CONFIG_ARCH_SA1100) += irq-sa11x0.o obj-$(CONFIG_INGENIC_IRQ) += irq-ingenic.o obj-$(CONFIG_IMX_GPCV2) += irq-imx-gpcv2.o obj-$(CONFIG_PIC32_EVIC) += irq-pic32-evic.o +obj-$(CONFIG_MVEBU_ODMI) += irq-mvebu-odmi.o diff --git a/drivers/irqchip/irq-mvebu-odmi.c b/drivers/irqchip/irq-mvebu-odmi.c new file mode 100644 index 0000000..0ae98b2 --- /dev/null +++ b/drivers/irqchip/irq-mvebu-odmi.c @@ -0,0 +1,248 @@ +/* + * Copyright (C) 2016 Marvell + * + * Thomas Petazzoni + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#define pr_fmt(fmt) "GIC-ODMI: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#define GICP_ODMIN_SET 0x40 +#define GICP_ODMI_INT_NUM_SHIFT 12 +#define GICP_ODMIN_GM_EP_R0 0x110 +#define GICP_ODMIN_GM_EP_R1 0x114 +#define GICP_ODMIN_GM_EA_R0 0x108 +#define GICP_ODMIN_GM_EA_R1 0x118 + +/* + * We don't support the group events, so we simply have 8 interrupts + * per frame. 
+ */ +#define NODMIS_SHIFT 3 +#define NODMIS_PER_FRAME (1 << NODMIS_SHIFT) +#define NODMIS_MASK (NODMIS_PER_FRAME - 1) + +struct odmi_data { + struct resource res; + void __iomem *base; + unsigned int spi_base; +}; + +static struct odmi_data *odmis; +static unsigned long *odmis_bm; +static unsigned int odmis_count; + +/* Protects odmis_bm */ +static DEFINE_SPINLOCK(odmis_bm_lock); + +static int odmi_set_affinity(struct irq_data *d, + const struct cpumask *mask, bool force) +{ + int ret; + + ret = irq_chip_set_affinity_parent(d, mask, force); + if (ret == IRQ_SET_MASK_OK) + ret = IRQ_SET_MASK_OK_DONE; + + return ret; +} + +static void odmi_compose_msi_msg(struct irq_data *d, struct msi_msg *msg) +{ + struct odmi_data *odmi; + phys_addr_t addr; + unsigned int odmin; + + if (WARN_ON(d->hwirq >= odmis_count * NODMIS_PER_FRAME)) + return; + + odmi = &odmis[d->hwirq >> NODMIS_SHIFT]; + odmin = d->hwirq & NODMIS_MASK; + + addr = odmi->res.start + GICP_ODMIN_SET; + + msg->address_hi = upper_32_bits(addr); + msg->address_lo = lower_32_bits(addr); + msg->data = odmin << GICP_ODMI_INT_NUM_SHIFT; +} + +static struct irq_chip odmi_irq_chip = { + .name = "ODMI", + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_affinity = odmi_set_affinity, + .irq_compose_msi_msg = odmi_compose_msi_msg, +}; + +static int odmi_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *args) +{ + struct odmi_data *odmi = NULL; + struct irq_fwspec fwspec; + struct irq_data *d; + unsigned int hwirq, odmin; + int ret; + + spin_lock(&odmis_bm_lock); + hwirq = find_first_zero_bit(odmis_bm, NODMIS_PER_FRAME * odmis_count); + if (hwirq >= NODMIS_PER_FRAME * odmis_count) { + spin_unlock(&odmis_bm_lock); + return -ENOSPC; + } + + __set_bit(hwirq, odmis_bm); + spin_unlock(&odmis_bm_lock); + + odmi = &odmis[hwirq >> NODMIS_SHIFT]; + odmin = hwirq & NODMIS_MASK; + + fwspec.fwnode = domain->parent->fwnode; + fwspec.param_count = 3; + fwspec.param[0] = GIC_SPI; + fwspec.param[1] = odmi->spi_base - 32 + odmin; + fwspec.param[2] = IRQ_TYPE_EDGE_RISING; + + ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec); + if (ret) { + pr_err("Cannot allocate parent IRQ\n"); + spin_lock(&odmis_bm_lock); + __clear_bit(odmin, odmis_bm); + spin_unlock(&odmis_bm_lock); + return ret; + } + + /* Configure the interrupt line to be edge */ + d = irq_domain_get_irq_data(domain->parent, virq); + d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING); + + irq_domain_set_hwirq_and_chip(domain, virq, hwirq, + &odmi_irq_chip, NULL); + + return 0; +} + +static void odmi_irq_domain_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + + if (d->hwirq >= odmis_count * NODMIS_PER_FRAME) { + pr_err("Failed to teardown msi. 
Invalid hwirq %lu\n", d->hwirq); + return; + } + + irq_domain_free_irqs_parent(domain, virq, nr_irqs); + + /* Actually free the MSI */ + spin_lock(&odmis_bm_lock); + __clear_bit(d->hwirq, odmis_bm); + spin_unlock(&odmis_bm_lock); +} + +static const struct irq_domain_ops odmi_domain_ops = { + .alloc = odmi_irq_domain_alloc, + .free = odmi_irq_domain_free, +}; + +static struct irq_chip odmi_msi_irq_chip = { + .name = "ODMI", +}; + +static struct msi_domain_ops odmi_msi_ops = { +}; + +static struct msi_domain_info odmi_msi_domain_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS), + .ops = &odmi_msi_ops, + .chip = &odmi_msi_irq_chip, +}; + +static int __init mvebu_odmi_init(struct device_node *node, + struct device_node *parent) +{ + struct irq_domain *inner_domain, *plat_domain; + int ret, i; + + if (of_property_read_u32(node, "marvell,odmi-frames", &odmis_count)) + return -EINVAL; + + odmis = kcalloc(odmis_count, sizeof(struct odmi_data), GFP_KERNEL); + if (!odmis) + return -ENOMEM; + + odmis_bm = kcalloc(BITS_TO_LONGS(odmis_count * NODMIS_PER_FRAME), + sizeof(long), GFP_KERNEL); + if (!odmis_bm) { + ret = -ENOMEM; + goto err_alloc; + } + + for (i = 0; i < odmis_count; i++) { + struct odmi_data *odmi = &odmis[i]; + + ret = of_address_to_resource(node, i, &odmi->res); + if (ret) + goto err_unmap; + + odmi->base = of_io_request_and_map(node, i, "odmi"); + if (IS_ERR(odmi->base)) { + ret = PTR_ERR(odmi->base); + goto err_unmap; + } + + if (of_property_read_u32_index(node, "marvell,spi-base", + i, &odmi->spi_base)) { + ret = -EINVAL; + goto err_unmap; + } + } + + inner_domain = irq_domain_create_linear(of_node_to_fwnode(node), + odmis_count * NODMIS_PER_FRAME, + &odmi_domain_ops, NULL); + if (!inner_domain) { + ret = -ENOMEM; + goto err_unmap; + } + + inner_domain->parent = irq_find_host(parent); + + plat_domain = platform_msi_create_irq_domain(of_node_to_fwnode(node), + &odmi_msi_domain_info, + inner_domain); + if (!plat_domain) { + ret = -ENOMEM; + goto err_remove_inner; + } + + return 0; + +err_remove_inner: + irq_domain_remove(inner_domain); +err_unmap: + for (i = 0; i < odmis_count; i++) { + struct odmi_data *odmi = &odmis[i]; + + if (odmi->base && !IS_ERR(odmi->base)) + iounmap(odmis[i].base); + } + kfree(odmis_bm); +err_alloc: + kfree(odmis); + return ret; +} + +IRQCHIP_DECLARE(mvebu_odmi, "marvell,odmi-controller", mvebu_odmi_init); -- cgit v0.10.2 From 0407daceedfed003eaacb850d06cbbe359348367 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 19 Feb 2016 15:00:29 +0000 Subject: irqchip/gic: Return IRQ_SET_MASK_OK_DONE in the set_affinity method Moving an SPI around doesn't require any extra work from the rest of the stack, and specially not for MSI-generated SPIs. It is then worth returning IRQ_SET_MASK_OK_DONE instead of IRQ_SET_MASK_OK, and simplify the other irqchips that rely on this behaviour (GICv2m and Marvell's ODMI controller). 
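For context, what makes IRQ_SET_MASK_OK_DONE sufficient on its own is the check the generic MSI layer performs before rewriting an MSI message. Sketched from memory rather than quoted verbatim from kernel/irq/msi.c, msi_domain_set_affinity() does roughly:

	ret = parent->chip->irq_set_affinity(parent, mask, force);
	if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
		/* Recompose and rewrite the MSI message only when the
		 * parent chip did not report the work as already done. */
		irq_chip_compose_msi_msg(irq_data, &msg);
		irq_chip_write_msi_msg(irq_data, &msg);
	}
	return ret;

Once the GIC itself returns IRQ_SET_MASK_OK_DONE for SPIs, child chips such as GICv2m and the ODMI controller can point .irq_set_affinity straight at irq_chip_set_affinity_parent and still skip the pointless message rewrite.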
Signed-off-by: Marc Zyngier Acked-by: Thomas Petazzoni Link: https://lkml.kernel.org/r/1455894029-17270-1-git-send-email-marc.zyngier@arm.com Signed-off-by: Jason Cooper diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index c779f83..28f047c 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -92,18 +92,6 @@ static struct msi_domain_info gicv2m_msi_domain_info = { .chip = &gicv2m_msi_irq_chip, }; -static int gicv2m_set_affinity(struct irq_data *irq_data, - const struct cpumask *mask, bool force) -{ - int ret; - - ret = irq_chip_set_affinity_parent(irq_data, mask, force); - if (ret == IRQ_SET_MASK_OK) - ret = IRQ_SET_MASK_OK_DONE; - - return ret; -} - static void gicv2m_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { struct v2m_data *v2m = irq_data_get_irq_chip_data(data); @@ -122,7 +110,7 @@ static struct irq_chip gicv2m_irq_chip = { .irq_mask = irq_chip_mask_parent, .irq_unmask = irq_chip_unmask_parent, .irq_eoi = irq_chip_eoi_parent, - .irq_set_affinity = gicv2m_set_affinity, + .irq_set_affinity = irq_chip_set_affinity_parent, .irq_compose_msi_msg = gicv2m_compose_msi_msg, }; diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 911758c..e14f2f2 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -319,7 +319,7 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, writel_relaxed(val | bit, reg); raw_spin_unlock_irqrestore(&irq_controller_lock, flags); - return IRQ_SET_MASK_OK; + return IRQ_SET_MASK_OK_DONE; } #endif diff --git a/drivers/irqchip/irq-mvebu-odmi.c b/drivers/irqchip/irq-mvebu-odmi.c index 0ae98b2..b4d3678 100644 --- a/drivers/irqchip/irq-mvebu-odmi.c +++ b/drivers/irqchip/irq-mvebu-odmi.c @@ -47,18 +47,6 @@ static unsigned int odmis_count; /* Protects odmis_bm */ static DEFINE_SPINLOCK(odmis_bm_lock); -static int odmi_set_affinity(struct irq_data *d, - const struct cpumask *mask, bool force) -{ - int ret; - - ret = irq_chip_set_affinity_parent(d, mask, force); - if (ret == IRQ_SET_MASK_OK) - ret = IRQ_SET_MASK_OK_DONE; - - return ret; -} - static void odmi_compose_msi_msg(struct irq_data *d, struct msi_msg *msg) { struct odmi_data *odmi; @@ -83,7 +71,7 @@ static struct irq_chip odmi_irq_chip = { .irq_mask = irq_chip_mask_parent, .irq_unmask = irq_chip_unmask_parent, .irq_eoi = irq_chip_eoi_parent, - .irq_set_affinity = odmi_set_affinity, + .irq_set_affinity = irq_chip_set_affinity_parent, .irq_compose_msi_msg = odmi_compose_msi_msg, }; -- cgit v0.10.2 From d9aade7fd27a604bbffd363e6a68416ef51bab88 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 18 Feb 2016 13:34:09 -0300 Subject: perf evlist: Handle -EINVAL for sample_freq > max_sample_rate in strerror_open() When running the "code reading" test we get: # perf test -v "code reading" 2>&1 | tail -5 Parsing event 'cycles:u' perf_evlist__open failed test child finished with -1 ---- end ---- Test object code reading: FAILED! # And with -vv we get the errno value, -22, i.e. -EINVAL, but we can do better and handle the case at hand, with this patch it becomes: # perf test -v "code reading" 2>&1 | tail -7 perf_evlist__open() failed! Error: Invalid argument. Hint: Check /proc/sys/kernel/perf_event_max_sample_rate. Hint: The current value is 1000 and 4000 is being requested. test child finished with -1 ---- end ---- Test object code reading: FAILED! 
# Next patch will make this 'perf test' entry to use perf_evlist__strerror() Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Steven Noonan Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-i31ai6kfefn75eapejjokfhc@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index a7eb0ea..0f57716 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1624,7 +1624,7 @@ size_t perf_evlist__fprintf(struct perf_evlist *evlist, FILE *fp) return printed + fprintf(fp, "\n"); } -int perf_evlist__strerror_open(struct perf_evlist *evlist __maybe_unused, +int perf_evlist__strerror_open(struct perf_evlist *evlist, int err, char *buf, size_t size) { int printed, value; @@ -1652,7 +1652,25 @@ int perf_evlist__strerror_open(struct perf_evlist *evlist __maybe_unused, "Hint:\tTry: 'sudo sh -c \"echo -1 > /proc/sys/kernel/perf_event_paranoid\"'\n" "Hint:\tThe current value is %d.", value); break; + case EINVAL: { + struct perf_evsel *first = perf_evlist__first(evlist); + int max_freq; + + if (sysctl__read_int("kernel/perf_event_max_sample_rate", &max_freq) < 0) + goto out_default; + + if (first->attr.sample_freq < (u64)max_freq) + goto out_default; + + printed = scnprintf(buf, size, + "Error:\t%s.\n" + "Hint:\tCheck /proc/sys/kernel/perf_event_max_sample_rate.\n" + "Hint:\tThe current value is %d and %" PRIu64 " is being requested.", + emsg, max_freq, first->attr.sample_freq); + break; + } default: +out_default: scnprintf(buf, size, "%s", emsg); break; } -- cgit v0.10.2 From 6880bbf96930ec6f8b40b5b93f21973f3297672a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 18 Feb 2016 13:40:57 -0300 Subject: perf tests: Use perf_evlist__strerror_open() to provide hints about max_freq Before: # perf test -v "code reading" 2>&1 | tail -4 perf_evlist__open failed test child finished with -1 ---- end ---- Test object code reading: FAILED! # After: # perf test -v "code reading" 2>&1 | tail -7 perf_evlist__open() failed! Error: Invalid argument. Hint: Check /proc/sys/kernel/perf_event_max_sample_rate. Hint: The current value is 1000 and 4000 is being requested. test child finished with -1 ---- end ---- Test object code reading: FAILED! 
# Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Steven Noonan Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-ifbx7vmrc38loe6317owz2jx@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 313a48c..f84339c 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -559,7 +559,13 @@ static int do_test_code_reading(bool try_kcore) evlist = NULL; continue; } - pr_debug("perf_evlist__open failed\n"); + + if (verbose) { + char errbuf[512]; + perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf)); + pr_debug("perf_evlist__open() failed!\n%s\n", errbuf); + } + goto out_put; } break; -- cgit v0.10.2 From 5243ba76a585a6481c4d7b931e7e3d98900cbdbe Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 18 Feb 2016 13:45:25 -0300 Subject: perf test: Reduce the sample_freq for the 'object code reading' test Using 4 kHz is not necessary and sometimes is more than what was auto-tuned: # dmesg | grep max_sample_rate | tail -2 [ 2499.144373] perf interrupt took too long (2501 > 2500), lowering kernel.perf_event_max_sample_rate to 50000 [ 3592.413606] perf interrupt took too long (5069 > 5000), lowering kernel.perf_event_max_sample_rate to 25000 Simulating a auto-tune of 2000 we make the test fail, as reported by Steven Noonan for one of his machines, so reduce it to 500 HZ, it is enough to get a good number of samples for this test: # perf test -v 21 2>&1 | grep '^Reading object code for memory address' | tee /tmp/out | tail -5 Reading object code for memory address: 0x479f40 Reading object code for memory address: 0x7f29b7eea80d Reading object code for memory address: 0x7f29b7eea80d Reading object code for memory address: 0x7f29b7eea800 Reading object code for memory address: 0xffffffff813b2f23 [root@jouet ~]# wc -l /tmp/out 40 /tmp/out [root@jouet ~]# For systems that auto-tune below that, the previous patches will tell the user what is happening so that he may either ignore the result of this test or bump /proc/sys/kernel/perf_event_max_sample_rate. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Steven Noonan Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-6kufyy1iprdfzrbtuqgxir70@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index f84339c..afc9ad0 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -439,7 +439,7 @@ static int do_test_code_reading(bool try_kcore) .mmap_pages = UINT_MAX, .user_freq = UINT_MAX, .user_interval = ULLONG_MAX, - .freq = 4000, + .freq = 500, .target = { .uses_mmap = true, }, -- cgit v0.10.2 From b002f3bbd321993c1a6d56b86544065420156ab9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 17 Feb 2016 14:44:00 -0800 Subject: perf stat: Handled scaled == -1 case for counters Arnaldo pointed out that the earlier cb110f471025 ("perf stat: Move noise/running printing into printout") change changed behavior for not counted counters. This patch fixes it again. 
Signed-off-by: Andi Kleen Cc: Jiri Olsa Cc: Stephane Eranian Fixes: cb110f471025 ("perf stat: Move noise/running printing into printout") Link: http://lkml.kernel.org/r/1455749045-18098-2-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 15e4fcf..86289df 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -860,7 +860,7 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval, nl = new_line_std; - if (run == 0 || ena == 0) { + if (run == 0 || ena == 0 || counter->counts->scaled == -1) { aggr_printout(counter, id, nr); fprintf(stat_config.output, "%*s%s", -- cgit v0.10.2 From 80cdce7666924158af63ba5071805a28800ebe4b Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 19 Feb 2016 11:43:51 +0000 Subject: perf bpf: Rename bpf_prog_priv__clear() to clear_prog_priv() The name of bpf_prog_priv__clear() doesn't follow perf's naming convention. bpf_prog_priv__delete() seems to be a better name. However, bpf_prog_priv__delete() should be a method of 'struct bpf_prog_priv', but its first parameter is 'struct bpf_program'. It is callback from libbpf to clear priv structures when destroying a bpf program. It is actually a method of bpf_program (libbpf object), but bpf_program__ functions should be provided by libbpf. This patch removes the prefix of that function. Signed-off-by: Wang Nan Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Cody P Schafer Cc: He Kuang Cc: Jeremie Galarneau Cc: Jiri Olsa Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1455882283-79592-4-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 540a7ef..0bdccf4 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -108,8 +108,8 @@ void bpf__clear(void) } static void -bpf_prog_priv__clear(struct bpf_program *prog __maybe_unused, - void *_priv) +clear_prog_priv(struct bpf_program *prog __maybe_unused, + void *_priv) { struct bpf_prog_priv *priv = _priv; @@ -337,7 +337,7 @@ config_bpf_program(struct bpf_program *prog) } pr_debug("bpf: config '%s' is ok\n", config_str); - err = bpf_program__set_private(prog, priv, bpf_prog_priv__clear); + err = bpf_program__set_private(prog, priv, clear_prog_priv); if (err) { pr_debug("Failed to set priv for program '%s'\n", config_str); goto errout; -- cgit v0.10.2 From 26dee028d365fbc0e3326606a8520260b4462381 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 19 Feb 2016 11:43:52 +0000 Subject: perf tools: Fix checking asprintf return value According to man pages, asprintf returns -1 when failure. This patch fixes two incorrect return value checker. 
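The contract relied on here: asprintf(3) returns the number of bytes printed on success and -1 on failure, leaving the string pointer undefined in the failure case. A minimal standalone sketch of the corrected check (illustrative only, not code from the perf tree; build_message() is a made-up helper):

	#define _GNU_SOURCE		/* for asprintf() */
	#include <stdio.h>
	#include <stdlib.h>

	/* Check for '< 0': the old '!asprintf(...)' test can never catch
	 * the -1 error return, because -1 is nonzero and thus truthy. */
	static char *build_message(const char *terms)
	{
		char *str;

		if (asprintf(&str, "valid terms: %s", terms) < 0)
			return NULL;		/* str is undefined here */

		return str;			/* caller must free() it */
	}

	int main(void)
	{
		char *s = build_message("config,name");

		if (!s)
			return 1;
		puts(s);
		free(s);
		return 0;
	}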
Signed-off-by: Wang Nan Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Cody P Schafer Cc: He Kuang Cc: Jeremie Galarneau Cc: Jiri Olsa Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Cc: stable@vger.kernel.org # v4.4+ Fixes: ffeb883e5662 ("perf tools: Show proper error message for wrong terms of hw/sw events") Link: http://lkml.kernel.org/r/1455882283-79592-5-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index e5583fd..72524c7 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2110,11 +2110,11 @@ char *parse_events_formats_error_string(char *additional_terms) /* valid terms */ if (additional_terms) { - if (!asprintf(&str, "valid terms: %s,%s", - additional_terms, static_terms)) + if (asprintf(&str, "valid terms: %s,%s", + additional_terms, static_terms) < 0) goto fail; } else { - if (!asprintf(&str, "valid terms: %s", static_terms)) + if (asprintf(&str, "valid terms: %s", static_terms) < 0) goto fail; } return str; -- cgit v0.10.2 From 17cb5f84b89fd39a143f1c899836f40420a6b799 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 19 Feb 2016 11:43:57 +0000 Subject: perf tools: Create config_term_names array config_term_names[] is introduced for future commits which will be able to retrieve the config name through the config term. Utilize this array in parse_events_formats_error_string() so the missing '{,no-}inherit' terms are added. Signed-off-by: Wang Nan Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Cody P Schafer Cc: He Kuang Cc: Jeremie Galarneau Cc: Jiri Olsa Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1455882283-79592-10-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 72524c7..fd085d5 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -746,6 +746,25 @@ static int check_type_val(struct parse_events_term *term, return -EINVAL; } +/* + * Update according to parse-events.l + */ +static const char *config_term_names[__PARSE_EVENTS__TERM_TYPE_NR] = { + [PARSE_EVENTS__TERM_TYPE_USER] = "", + [PARSE_EVENTS__TERM_TYPE_CONFIG] = "config", + [PARSE_EVENTS__TERM_TYPE_CONFIG1] = "config1", + [PARSE_EVENTS__TERM_TYPE_CONFIG2] = "config2", + [PARSE_EVENTS__TERM_TYPE_NAME] = "name", + [PARSE_EVENTS__TERM_TYPE_SAMPLE_PERIOD] = "period", + [PARSE_EVENTS__TERM_TYPE_SAMPLE_FREQ] = "freq", + [PARSE_EVENTS__TERM_TYPE_BRANCH_SAMPLE_TYPE] = "branch_type", + [PARSE_EVENTS__TERM_TYPE_TIME] = "time", + [PARSE_EVENTS__TERM_TYPE_CALLGRAPH] = "call-graph", + [PARSE_EVENTS__TERM_TYPE_STACKSIZE] = "stack-size", + [PARSE_EVENTS__TERM_TYPE_NOINHERIT] = "no-inherit", + [PARSE_EVENTS__TERM_TYPE_INHERIT] = "inherit", +}; + typedef int config_term_func_t(struct perf_event_attr *attr, struct parse_events_term *term, struct parse_events_error *err); @@ -2097,6 +2116,31 @@ void parse_events_evlist_error(struct parse_events_evlist *data, WARN_ONCE(!err->str, "WARNING: failed to allocate error string"); } +static void config_terms_list(char *buf, size_t buf_sz) +{ + int i; + bool first = true; + + buf[0] = '\0'; + for (i = 0; i < __PARSE_EVENTS__TERM_TYPE_NR; i++) { + const char *name = config_term_names[i]; + + if (!name) + continue; + 
if (name[0] == '<') + continue; + + if (strlen(buf) + strlen(name) + 2 >= buf_sz) + return; + + if (!first) + strcat(buf, ","); + else + first = false; + strcat(buf, name); + } +} + /* * Return string contains valid config terms of an event. * @additional_terms: For terms such as PMU sysfs terms. @@ -2104,10 +2148,11 @@ void parse_events_evlist_error(struct parse_events_evlist *data, char *parse_events_formats_error_string(char *additional_terms) { char *str; - static const char *static_terms = "config,config1,config2,name," - "period,freq,branch_type,time," - "call-graph,stack-size\n"; + /* "branch_type" is the longest name */ + char static_terms[__PARSE_EVENTS__TERM_TYPE_NR * + (sizeof("branch_type") - 1)]; + config_terms_list(static_terms, sizeof(static_terms)); /* valid terms */ if (additional_terms) { if (asprintf(&str, "valid terms: %s,%s", diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 53628bf..b50d50b 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -68,7 +68,8 @@ enum { PARSE_EVENTS__TERM_TYPE_CALLGRAPH, PARSE_EVENTS__TERM_TYPE_STACKSIZE, PARSE_EVENTS__TERM_TYPE_NOINHERIT, - PARSE_EVENTS__TERM_TYPE_INHERIT + PARSE_EVENTS__TERM_TYPE_INHERIT, + __PARSE_EVENTS__TERM_TYPE_NR, }; struct parse_events_term { diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 58c5831..99486e6 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -178,8 +178,7 @@ modifier_bp [rwx]{1,3} { /* - * Please update parse_events_formats_error_string any time - * new static term is added. + * Please update config_term_names when new static term is added. */ config { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG); } config1 { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CONFIG1); } -- cgit v0.10.2 From 1669e509ea25e4e3e871d913d21b1cac4a96d1e8 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 19 Feb 2016 11:43:58 +0000 Subject: perf stat: Bail out on unsupported event config modifiers 'perf stat' accepts some config terms but doesn't apply them. For example: # perf stat -e 'instructions/no-inherit/' -e 'instructions/inherit/' bash # ls # exit Performance counter stats for 'bash': 266258061 instructions/no-inherit/ 266258061 instructions/inherit/ 1.402183915 seconds time elapsed The result is confusing, because user may expect the first 'instructions' event exclude the 'ls' command. This patch forbid most of these config terms for 'perf stat'. Result: # ./perf stat -e 'instructions/no-inherit/' -e 'instructions/inherit/' bash event syntax error: 'instructions/no-inherit/' \___ 'no-inherit' is not usable in 'perf stat' ... We can add blocked config terms back when 'perf stat' really supports them. 
This patch also removes unavailable config term from error message: # ./perf stat -e 'instructions/badterm/' ls event syntax error: 'instructions/badterm/' \___ unknown term valid terms: config,config1,config2,name # ./perf stat -e 'cpu/badterm/' ls event syntax error: 'cpu/badterm/' \___ unknown term valid terms: pc,any,inv,edge,cmask,event,in_tx,ldlat,umask,in_tx_cp,offcore_rsp,config,config1,config2,name Signed-off-by: Wang Nan Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Cody P Schafer Cc: He Kuang Cc: Jeremie Galarneau Cc: Jiri Olsa Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1455882283-79592-11-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 86289df..8c0bc0f 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1831,6 +1831,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) if (evsel_list == NULL) return -ENOMEM; + parse_events__shrink_config_terms(); argc = parse_options_subcommand(argc, argv, stat_options, stat_subcommands, (const char **) stat_usage, PARSE_OPT_STOP_AT_NON_OPTION); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index fd085d5..eb5df43 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -765,6 +765,41 @@ static const char *config_term_names[__PARSE_EVENTS__TERM_TYPE_NR] = { [PARSE_EVENTS__TERM_TYPE_INHERIT] = "inherit", }; +static bool config_term_shrinked; + +static bool +config_term_avail(int term_type, struct parse_events_error *err) +{ + if (term_type < 0 || term_type >= __PARSE_EVENTS__TERM_TYPE_NR) { + err->str = strdup("Invalid term_type"); + return false; + } + if (!config_term_shrinked) + return true; + + switch (term_type) { + case PARSE_EVENTS__TERM_TYPE_CONFIG: + case PARSE_EVENTS__TERM_TYPE_CONFIG1: + case PARSE_EVENTS__TERM_TYPE_CONFIG2: + case PARSE_EVENTS__TERM_TYPE_NAME: + return true; + default: + if (!err) + return false; + + /* term_type is validated so indexing is safe */ + if (asprintf(&err->str, "'%s' is not usable in 'perf stat'", + config_term_names[term_type]) < 0) + err->str = NULL; + return false; + } +} + +void parse_events__shrink_config_terms(void) +{ + config_term_shrinked = true; +} + typedef int config_term_func_t(struct perf_event_attr *attr, struct parse_events_term *term, struct parse_events_error *err); @@ -834,6 +869,17 @@ do { \ return -EINVAL; } + /* + * Check term availbility after basic checking so + * PARSE_EVENTS__TERM_TYPE_USER can be found and filtered. + * + * If check availbility at the entry of this function, + * user will see "'' is not usable in 'perf stat'" + * if an invalid config term is provided for legacy events + * (for example, instructions/badterm/...), which is confusing. 
+ */ + if (!config_term_avail(term->type_term, err)) + return -EINVAL; return 0; #undef CHECK_TYPE_VAL } @@ -2125,6 +2171,8 @@ static void config_terms_list(char *buf, size_t buf_sz) for (i = 0; i < __PARSE_EVENTS__TERM_TYPE_NR; i++) { const char *name = config_term_names[i]; + if (!config_term_avail(i, NULL)) + continue; if (!name) continue; if (name[0] == '<') diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index b50d50b..76151f9 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -105,6 +105,7 @@ struct parse_events_terms { struct list_head *terms; }; +void parse_events__shrink_config_terms(void); int parse_events__is_hardcoded_term(struct parse_events_term *term); int parse_events_term__num(struct parse_events_term **term, int type_term, char *config, u64 num, -- cgit v0.10.2 From e814fddde18fec43fa41a27ae94c09b54772697e Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 19 Feb 2016 11:43:59 +0000 Subject: perf tools: Rename and move pmu_event_name to get_config_name Following commits will make more events obey /name=newname/ options. This patch makes pmu_event_name() a generic helper. Makes new get_config_name() accept NULL input to make life easier. Signed-off-by: Wang Nan Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Cody P Schafer Cc: He Kuang Cc: Jeremie Galarneau Cc: Jiri Olsa Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1455882283-79592-12-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index eb5df43..3243e95 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -279,7 +279,24 @@ const char *event_type(int type) return "unknown"; } +static int parse_events__is_name_term(struct parse_events_term *term) +{ + return term->type_term == PARSE_EVENTS__TERM_TYPE_NAME; +} +static char *get_config_name(struct list_head *head_terms) +{ + struct parse_events_term *term; + + if (!head_terms) + return NULL; + + list_for_each_entry(term, head_terms, list) + if (parse_events__is_name_term(term)) + return term->val.str; + + return NULL; +} static struct perf_evsel * __add_event(struct list_head *list, int *idx, @@ -1029,22 +1046,6 @@ int parse_events_add_numeric(struct parse_events_evlist *data, return add_event(list, &data->idx, &attr, NULL, &config_terms); } -static int parse_events__is_name_term(struct parse_events_term *term) -{ - return term->type_term == PARSE_EVENTS__TERM_TYPE_NAME; -} - -static char *pmu_event_name(struct list_head *head_terms) -{ - struct parse_events_term *term; - - list_for_each_entry(term, head_terms, list) - if (parse_events__is_name_term(term)) - return term->val.str; - - return NULL; -} - int parse_events_add_pmu(struct parse_events_evlist *data, struct list_head *list, char *name, struct list_head *head_config) @@ -1089,7 +1090,7 @@ int parse_events_add_pmu(struct parse_events_evlist *data, return -EINVAL; evsel = __add_event(list, &data->idx, &attr, - pmu_event_name(head_config), pmu->cpus, + get_config_name(head_config), pmu->cpus, &config_terms); if (evsel) { evsel->unit = info.unit; -- cgit v0.10.2 From 1d55e8ef340dad1ccd5aaf53071de41fc3d8dba4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 19 Feb 2016 18:45:12 -0300 Subject: perf tools: Introduce opt_event_config nonterminal To remove duplicated code that differs only in 
using the matching '/a,b,c/' part or NULL if no event configuration is done ('//' or no pair of slashes at all). Will be used by some new targets allowing the configuration of hardware events, etc. Lifted part of the 'opt_event_config' nonterminal from a patch by Wang Nan. Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Cody P Schafer Cc: He Kuang Cc: Jeremie Galarneau Cc: Jiri Olsa Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/n/tip-e3xzpx9cqsmwnaguaxyw6r42@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index c0eac88..ce68746 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -64,6 +64,7 @@ static inc_group_count(struct list_head *list, %type PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_KERNEL_PMU_EVENT %type value_sym %type event_config +%type opt_event_config %type event_term %type event_pmu %type event_legacy_symbol @@ -222,16 +223,6 @@ PE_NAME '/' event_config '/' $$ = list; } | -PE_NAME '/' '/' -{ - struct parse_events_evlist *data = _data; - struct list_head *list; - - ALLOC_LIST(list); - ABORT_ON(parse_events_add_pmu(data, list, $1, NULL)); - $$ = list; -} -| PE_KERNEL_PMU_EVENT sep_dc { struct parse_events_evlist *data = _data; @@ -378,7 +369,7 @@ PE_PREFIX_MEM PE_VALUE sep_dc } event_legacy_tracepoint: -tracepoint_name +tracepoint_name opt_event_config { struct parse_events_evlist *data = _data; struct parse_events_error *error = data->error; @@ -389,24 +380,7 @@ tracepoint_name error->idx = @1.first_column; if (parse_events_add_tracepoint(list, &data->idx, $1.sys, $1.event, - error, NULL)) - return -1; - - $$ = list; -} -| -tracepoint_name '/' event_config '/' -{ - struct parse_events_evlist *data = _data; - struct parse_events_error *error = data->error; - struct list_head *list; - - ALLOC_LIST(list); - if (error) - error->idx = @1.first_column; - - if (parse_events_add_tracepoint(list, &data->idx, $1.sys, $1.event, - error, $3)) + error, $2)) return -1; $$ = list; @@ -476,6 +450,21 @@ PE_BPF_SOURCE $$ = list; } +opt_event_config: +'/' event_config '/' +{ + $$ = $2; +} +| +'/' '/' +{ + $$ = NULL; +} +| +{ + $$ = NULL; +} + start_terms: event_config { struct parse_events_terms *data = _data; -- cgit v0.10.2 From 10bf358a1b79fa1311eb05ee31f2cefdcad01741 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 19 Feb 2016 11:44:00 +0000 Subject: perf tools: Enable config raw and numeric events This patch allows setting config terms for raw and numeric events. For example: # perf stat -e cycles/name=cyc/ ls ... 1821108 cyc ... # perf stat -e r6530160/name=event/ ls ... 1103195 event ... # perf record -e cycles -e 4:0x6530160/name=evtx,call-graph=fp/ -a sleep 1 ... # perf report --stdio ... # Samples: 124 of event 'cycles' 46.61% 0.00% swapper [kernel.vmlinux] [k] cpu_startup_entry 41.26% 0.00% swapper [kernel.vmlinux] [k] start_secondary ... # Samples: 91 of event 'evtx' ... 93.76% 0.00% swapper [kernel.vmlinux] [k] cpu_startup_entry | ---cpu_startup_entry | |--66.63%--call_cpuidle | cpuidle_enter | | ... 3 test cases are introduced to test config terms for symbol, raw and numeric events. 
Committer note: Further testing shows that we can retrieve the event name using 'perf evlist -v' and looking at the 'config' perf_event_attr field, i.e.: # perf record -e cycles -e 4:0x6530160/name=evtx,call-graph=fp/ -a sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 1.724 MB perf.data (2076 samples) ] # perf evlist cycles evtx # perf evlist -v cycles: size: 112, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CPU|PERIOD|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1 evtx: type: 4, size: 112, config: 0x6530160, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|CALLCHAIN|CPU|PERIOD|IDENTIFIER, read_format: ID, disabled: 1, inherit: 1, freq: 1, sample_id_all: 1, exclude_guest: 1 # Signed-off-by: Wang Nan Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Cody P Schafer Cc: He Kuang Cc: Jeremie Galarneau Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1455882283-79592-13-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 6648274..15e2d05 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -1271,6 +1271,31 @@ static int test__checkevent_precise_max_modifier(struct perf_evlist *evlist) return 0; } +static int test__checkevent_config_symbol(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__first(evlist); + + TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "insn") == 0); + return 0; +} + +static int test__checkevent_config_raw(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__first(evlist); + + TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "rawpmu") == 0); + return 0; +} + +static int test__checkevent_config_num(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__first(evlist); + + TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "numpmu") == 0); + return 0; +} + + static int count_tracepoints(void) { struct dirent *events_ent; @@ -1579,6 +1604,21 @@ static struct evlist_test test__events[] = { .check = test__checkevent_precise_max_modifier, .id = 47, }, + { + .name = "instructions/name=insn/", + .check = test__checkevent_config_symbol, + .id = 48, + }, + { + .name = "r1234/name=rawpmu/", + .check = test__checkevent_config_raw, + .id = 49, + }, + { + .name = "4:0x6530160/name=numpmu/", + .check = test__checkevent_config_num, + .id = 50, + }, }; static struct evlist_test test__events_pmu[] = { diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 3243e95..75576e1 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1043,7 +1043,8 @@ int parse_events_add_numeric(struct parse_events_evlist *data, return -ENOMEM; } - return add_event(list, &data->idx, &attr, NULL, &config_terms); + return add_event(list, &data->idx, &attr, + get_config_name(head_config), &config_terms); } int parse_events_add_pmu(struct parse_events_evlist *data, diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index ce68746..82029f9 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -407,24 +407,26 @@ PE_NAME ':' PE_NAME } 
event_legacy_numeric: -PE_VALUE ':' PE_VALUE +PE_VALUE ':' PE_VALUE opt_event_config { struct parse_events_evlist *data = _data; struct list_head *list; ALLOC_LIST(list); - ABORT_ON(parse_events_add_numeric(data, list, (u32)$1, $3, NULL)); + ABORT_ON(parse_events_add_numeric(data, list, (u32)$1, $3, $4)); + parse_events_terms__delete($4); $$ = list; } event_legacy_raw: -PE_RAW +PE_RAW opt_event_config { struct parse_events_evlist *data = _data; struct list_head *list; ALLOC_LIST(list); - ABORT_ON(parse_events_add_numeric(data, list, PERF_TYPE_RAW, $1, NULL)); + ABORT_ON(parse_events_add_numeric(data, list, PERF_TYPE_RAW, $1, $2)); + parse_events_terms__delete($2); $$ = list; } -- cgit v0.10.2 From 43d0b97817a41b274aaec0476e912dae3ae1f93d Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 19 Feb 2016 11:44:01 +0000 Subject: perf tools: Enable config and setting names for legacy cache events This patch allows setting config terms for legacy cache events. For example: # perf stat -e L1-icache-misses/name=valA/ -e branches/name=valB/ ls ... Performance counter stats for 'ls': 11299 valA 451605 valB 0.000779091 seconds time elapsed # perf record -e cache-misses/name=inh/ -e cache-misses/name=noinh,no-inherit/ bash # ls # exit [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.023 MB perf.data (131 samples) ] # perf report --stdio | grep -B 1 'Event count' # Samples: 105 of event 'inh' # Event count (approx.): 109118 -- # Samples: 26 of event 'noinh' # Event count (approx.): 48302 A test case is introduced to test this feature. Signed-off-by: Wang Nan Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Cody P Schafer Cc: He Kuang Cc: Jeremie Galarneau Cc: Jiri Olsa Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1455882283-79592-14-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index 15e2d05..7865f68 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -1295,6 +1295,13 @@ static int test__checkevent_config_num(struct perf_evlist *evlist) return 0; } +static int test__checkevent_config_cache(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = perf_evlist__first(evlist); + + TEST_ASSERT_VAL("wrong name setting", strcmp(evsel->name, "cachepmu") == 0); + return 0; +} static int count_tracepoints(void) { @@ -1619,6 +1626,11 @@ static struct evlist_test test__events[] = { .check = test__checkevent_config_num, .id = 50, }, + { + .name = "L1-dcache-misses/name=cachepmu/", + .check = test__checkevent_config_cache, + .id = 51, + }, }; static struct evlist_test test__events_pmu[] = { diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 75576e1..2996aa4 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -350,11 +350,25 @@ static int parse_aliases(char *str, const char *names[][PERF_EVSEL__MAX_ALIASES] return -1; } +typedef int config_term_func_t(struct perf_event_attr *attr, + struct parse_events_term *term, + struct parse_events_error *err); +static int config_term_common(struct perf_event_attr *attr, + struct parse_events_term *term, + struct parse_events_error *err); +static int config_attr(struct perf_event_attr *attr, + struct list_head *head, + struct parse_events_error *err, + config_term_func_t config_term); + 
int parse_events_add_cache(struct list_head *list, int *idx, - char *type, char *op_result1, char *op_result2) + char *type, char *op_result1, char *op_result2, + struct parse_events_error *error, + struct list_head *head_config) { struct perf_event_attr attr; - char name[MAX_NAME_LEN]; + LIST_HEAD(config_terms); + char name[MAX_NAME_LEN], *config_name; int cache_type = -1, cache_op = -1, cache_result = -1; char *op_result[2] = { op_result1, op_result2 }; int i, n; @@ -368,6 +382,7 @@ int parse_events_add_cache(struct list_head *list, int *idx, if (cache_type == -1) return -EINVAL; + config_name = get_config_name(head_config); n = snprintf(name, MAX_NAME_LEN, "%s", type); for (i = 0; (i < 2) && (op_result[i]); i++) { @@ -408,7 +423,16 @@ int parse_events_add_cache(struct list_head *list, int *idx, memset(&attr, 0, sizeof(attr)); attr.config = cache_type | (cache_op << 8) | (cache_result << 16); attr.type = PERF_TYPE_HW_CACHE; - return add_event(list, idx, &attr, name, NULL); + + if (head_config) { + if (config_attr(&attr, head_config, error, + config_term_common)) + return -EINVAL; + + if (get_config_terms(head_config, &config_terms)) + return -ENOMEM; + } + return add_event(list, idx, &attr, config_name ? : name, &config_terms); } static void tracepoint_error(struct parse_events_error *e, int err, diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 76151f9..d5eb2af 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -140,7 +140,9 @@ int parse_events_add_numeric(struct parse_events_evlist *data, u32 type, u64 config, struct list_head *head_config); int parse_events_add_cache(struct list_head *list, int *idx, - char *type, char *op_result1, char *op_result2); + char *type, char *op_result1, char *op_result2, + struct parse_events_error *error, + struct list_head *head_config); int parse_events_add_breakpoint(struct list_head *list, int *idx, void *ptr, char *type, u64 len); int parse_events_add_pmu(struct parse_events_evlist *data, diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 82029f9..6a2d006 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -293,33 +293,39 @@ value_sym sep_slash_dc } event_legacy_cache: -PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT '-' PE_NAME_CACHE_OP_RESULT +PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT '-' PE_NAME_CACHE_OP_RESULT opt_event_config { struct parse_events_evlist *data = _data; + struct parse_events_error *error = data->error; struct list_head *list; ALLOC_LIST(list); - ABORT_ON(parse_events_add_cache(list, &data->idx, $1, $3, $5)); + ABORT_ON(parse_events_add_cache(list, &data->idx, $1, $3, $5, error, $6)); + parse_events_terms__delete($6); $$ = list; } | -PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT +PE_NAME_CACHE_TYPE '-' PE_NAME_CACHE_OP_RESULT opt_event_config { struct parse_events_evlist *data = _data; + struct parse_events_error *error = data->error; struct list_head *list; ALLOC_LIST(list); - ABORT_ON(parse_events_add_cache(list, &data->idx, $1, $3, NULL)); + ABORT_ON(parse_events_add_cache(list, &data->idx, $1, $3, NULL, error, $4)); + parse_events_terms__delete($4); $$ = list; } | -PE_NAME_CACHE_TYPE +PE_NAME_CACHE_TYPE opt_event_config { struct parse_events_evlist *data = _data; + struct parse_events_error *error = data->error; struct list_head *list; ALLOC_LIST(list); - ABORT_ON(parse_events_add_cache(list, &data->idx, $1, NULL, NULL)); + ABORT_ON(parse_events_add_cache(list, &data->idx, $1, NULL, NULL, 
error, $2)); + parse_events_terms__delete($2); $$ = list; } -- cgit v0.10.2 From 467ef10c68b90b940412390dcd14bbfe8cc40e73 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 16 Feb 2016 23:08:19 +0900 Subject: perf hists browser: Fix percentage update on key press Currently 'perf top --tui' decrements percentage of all entries on any key press. This is because it adds total period as new samples are added to hists. As perf-top does it currently but added samples are not passed to the display thread, the percentages are decresing continuously. So separate total period stat into a different variable so that it cannot affect the output total period. This new total period stats are used only for calcualating callchain percent limit. Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Fixes: 0f58474ec835 ("perf hists: Update hists' total period when adding entries") Link: http://lkml.kernel.org/r/1455631723-17345-2-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 561e947..a856617 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -405,6 +405,16 @@ static u8 symbol__parent_filter(const struct symbol *parent) return 0; } +static void hist_entry__add_callchain_period(struct hist_entry *he, u64 period) +{ + if (!symbol_conf.use_callchain) + return; + + he->hists->callchain_period += period; + if (!he->filtered) + he->hists->callchain_non_filtered_period += period; +} + static struct hist_entry *hists__findnew_entry(struct hists *hists, struct hist_entry *entry, struct addr_location *al, @@ -434,9 +444,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, if (!cmp) { if (sample_self) { he_stat__add_period(&he->stat, period, weight); - hists->stats.total_period += period; - if (!he->filtered) - hists->stats.total_non_filtered_period += period; + hist_entry__add_callchain_period(he, period); } if (symbol_conf.cumulate_callchain) he_stat__add_period(he->stat_acc, period, weight); @@ -471,9 +479,8 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, return NULL; if (sample_self) - hists__inc_stats(hists, he); - else - hists->nr_entries++; + hist_entry__add_callchain_period(he, period); + hists->nr_entries++; rb_link_node(&he->rb_node_in, parent, p); rb_insert_color(&he->rb_node_in, hists->entries_in); @@ -1227,9 +1234,14 @@ static void output_resort(struct hists *hists, struct ui_progress *prog, struct rb_root *root; struct rb_node *next; struct hist_entry *n; + u64 callchain_total; u64 min_callchain_hits; - min_callchain_hits = hists__total_period(hists) * (callchain_param.min_percent / 100); + callchain_total = hists->callchain_period; + if (symbol_conf.filter_relative) + callchain_total = hists->callchain_non_filtered_period; + + min_callchain_hits = callchain_total * (callchain_param.min_percent / 100); if (sort__need_collapse) root = &hists->entries_collapsed; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 840b6d6..045a9e7 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -66,6 +66,8 @@ struct hists { struct rb_root entries_collapsed; u64 nr_entries; u64 nr_non_filtered_entries; + u64 callchain_period; + u64 callchain_non_filtered_period; struct thread *thread_filter; const struct dso *dso_filter; const char *uid_filter_str; -- cgit v0.10.2 From 7565bd39c1a63c82350d26a66ea1a1f1bb49ad2e Mon Sep 17 00:00:00 
2001 From: Namhyung Kim Date: Tue, 16 Feb 2016 23:08:20 +0900 Subject: perf callchain: Check return value of add_child() The create_child() in add_child() can return NULL in case of memory allocation failure. So check the return value and bail out. The proper error handling will be added later. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1455631723-17345-3-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 53c43eb..134d88b 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -453,6 +453,9 @@ add_child(struct callchain_node *parent, struct callchain_node *new; new = create_child(parent, false); + if (new == NULL) + return NULL; + fill_node(new, cursor); new->children_hit = 0; @@ -524,6 +527,8 @@ split_add_child(struct callchain_node *parent, node = callchain_cursor_current(cursor); new = add_child(parent, cursor, period); + if (new == NULL) + return; /* * This is second child since we moved parent's children @@ -585,6 +590,9 @@ append_chain_children(struct callchain_node *root, } /* nothing in children, add to the current node */ rnode = add_child(root, cursor, period); + if (rnode == NULL) + return; + rb_link_node(&rnode->rb_node_in, parent, p); rb_insert_color(&rnode->rb_node_in, &root->rb_root_in); -- cgit v0.10.2 From 8451cbb9b174a9b6e016d7f1bff81ff12dbd1990 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 16 Feb 2016 23:08:21 +0900 Subject: perf callchain: Check return value of fill_node() Memory allocation in the fill_node() can fail so change its return type to int and check it in add_child() too. 
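
Both this fill_node() change and the add_child() fix just above apply the same idiom: a constructor whose fill step can fail partway must report the failure and unwind whatever it has already allocated, instead of silently continuing with a half-built node. The following is a minimal user-space sketch of that idiom, assuming hypothetical stand-in types and helpers; it is not the perf callchain code.

#include <stdio.h>
#include <stdlib.h>

struct call { struct call *next; unsigned long ip; };
struct node { struct call *val; };

static int fill_node(struct node *node, const unsigned long *ips, int nr)
{
	for (int i = 0; i < nr; i++) {
		struct call *call = calloc(1, sizeof(*call));

		if (!call)
			return -1;	/* report the failure instead of returning void */
		call->ip = ips[i];
		call->next = node->val;
		node->val = call;
	}
	return 0;
}

static struct node *add_child(const unsigned long *ips, int nr)
{
	struct node *new = calloc(1, sizeof(*new));

	if (!new)
		return NULL;
	if (fill_node(new, ips, nr) < 0) {
		struct call *c = new->val;

		while (c) {		/* free the partially filled list */
			struct call *tmp = c->next;
			free(c);
			c = tmp;
		}
		free(new);
		return NULL;
	}
	return new;
}

int main(void)
{
	unsigned long ips[] = { 0x1000, 0x2000, 0x3000 };
	struct node *n = add_child(ips, 3);

	printf("add_child %s\n", n ? "succeeded" : "failed");
	return 0;
}
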
Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1455631723-17345-4-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 134d88b..a82ea6f 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -416,7 +416,7 @@ create_child(struct callchain_node *parent, bool inherit_children) /* * Fill the node with callchain values */ -static void +static int fill_node(struct callchain_node *node, struct callchain_cursor *cursor) { struct callchain_cursor_node *cursor_node; @@ -433,7 +433,7 @@ fill_node(struct callchain_node *node, struct callchain_cursor *cursor) call = zalloc(sizeof(*call)); if (!call) { perror("not enough memory for the code path tree"); - return; + return -1; } call->ip = cursor_node->ip; call->ms.sym = cursor_node->sym; @@ -443,6 +443,7 @@ fill_node(struct callchain_node *node, struct callchain_cursor *cursor) callchain_cursor_advance(cursor); cursor_node = callchain_cursor_current(cursor); } + return 0; } static struct callchain_node * @@ -456,7 +457,16 @@ add_child(struct callchain_node *parent, if (new == NULL) return NULL; - fill_node(new, cursor); + if (fill_node(new, cursor) < 0) { + struct callchain_list *call, *tmp; + + list_for_each_entry_safe(call, tmp, &new->val, list) { + list_del(&call->list); + free(call); + } + free(new); + return NULL; + } new->children_hit = 0; new->hit = period; -- cgit v0.10.2 From 2d713b809d89a3d10c6a85162bf7cce0468e45d9 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 16 Feb 2016 23:08:22 +0900 Subject: perf callchain: Add enum match_result for match_chain() The append_chain() might return either result of match_chain() or other (error) code. But match_chain() can return any value in s64 type so it's hard to check the error case. Add new enum match_result and make match_chain() return non-negative values only so that we can check the error cases. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1455631723-17345-5-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index a82ea6f..dab2c1f 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -475,16 +475,32 @@ add_child(struct callchain_node *parent, return new; } -static s64 match_chain(struct callchain_cursor_node *node, - struct callchain_list *cnode) +enum match_result { + MATCH_ERROR = -1, + MATCH_EQ, + MATCH_LT, + MATCH_GT, +}; + +static enum match_result match_chain(struct callchain_cursor_node *node, + struct callchain_list *cnode) { struct symbol *sym = node->sym; + u64 left, right; if (cnode->ms.sym && sym && - callchain_param.key == CCKEY_FUNCTION) - return cnode->ms.sym->start - sym->start; - else - return cnode->ip - node->ip; + callchain_param.key == CCKEY_FUNCTION) { + left = cnode->ms.sym->start; + right = sym->start; + } else { + left = cnode->ip; + right = node->ip; + } + + if (left == right) + return MATCH_EQ; + + return left > right ? 
MATCH_GT : MATCH_LT; } /* @@ -549,7 +565,7 @@ split_add_child(struct callchain_node *parent, cnode = list_first_entry(&first->val, struct callchain_list, list); - if (match_chain(node, cnode) < 0) + if (match_chain(node, cnode) == MATCH_LT) pp = &p->rb_left; else pp = &p->rb_right; @@ -562,7 +578,7 @@ split_add_child(struct callchain_node *parent, } } -static int +static enum match_result append_chain(struct callchain_node *root, struct callchain_cursor *cursor, u64 period); @@ -583,17 +599,17 @@ append_chain_children(struct callchain_node *root, /* lookup in childrens */ while (*p) { - s64 ret; + enum match_result ret; parent = *p; rnode = rb_entry(parent, struct callchain_node, rb_node_in); /* If at least first entry matches, rely to children */ ret = append_chain(rnode, cursor, period); - if (ret == 0) + if (ret == MATCH_EQ) goto inc_children_hit; - if (ret < 0) + if (ret == MATCH_LT) p = &parent->rb_left; else p = &parent->rb_right; @@ -611,7 +627,7 @@ inc_children_hit: root->children_count++; } -static int +static enum match_result append_chain(struct callchain_node *root, struct callchain_cursor *cursor, u64 period) @@ -620,7 +636,7 @@ append_chain(struct callchain_node *root, u64 start = cursor->pos; bool found = false; u64 matches; - int cmp = 0; + enum match_result cmp = MATCH_ERROR; /* * Lookup in the current node @@ -636,7 +652,7 @@ append_chain(struct callchain_node *root, break; cmp = match_chain(node, cnode); - if (cmp) + if (cmp != MATCH_EQ) break; found = true; @@ -646,7 +662,7 @@ append_chain(struct callchain_node *root, /* matches not, relay no the parent */ if (!found) { - WARN_ONCE(!cmp, "Chain comparison error\n"); + WARN_ONCE(cmp == MATCH_ERROR, "Chain comparison error\n"); return cmp; } @@ -655,20 +671,20 @@ append_chain(struct callchain_node *root, /* we match only a part of the node. Split it and add the new chain */ if (matches < root->val_nr) { split_add_child(root, cursor, cnode, start, matches, period); - return 0; + return MATCH_EQ; } /* we match 100% of the path, increment the hit */ if (matches == root->val_nr && cursor->pos == cursor->nr) { root->hit += period; root->count++; - return 0; + return MATCH_EQ; } /* We match the node and still have a part remaining */ append_chain_children(root, cursor, period); - return 0; + return MATCH_EQ; } int callchain_append(struct callchain_root *root, -- cgit v0.10.2 From f2bb4c5af4fe16d8b1e4ae371e1ceaa817380a88 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 16 Feb 2016 23:08:23 +0900 Subject: perf callchain: Check return value of split_add_child() Now create_child() and add_child() return errors so check and pass it to the caller. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1455631723-17345-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index dab2c1f..5259379 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -508,7 +508,7 @@ static enum match_result match_chain(struct callchain_cursor_node *node, * give a part of its callchain to the created child. 
* Then create another child to host the given callchain of new branch */ -static void +static int split_add_child(struct callchain_node *parent, struct callchain_cursor *cursor, struct callchain_list *to_split, @@ -520,6 +520,8 @@ split_add_child(struct callchain_node *parent, /* split */ new = create_child(parent, true); + if (new == NULL) + return -1; /* split the callchain and move a part to the new child */ old_tail = parent->val.prev; @@ -554,7 +556,7 @@ split_add_child(struct callchain_node *parent, node = callchain_cursor_current(cursor); new = add_child(parent, cursor, period); if (new == NULL) - return; + return -1; /* * This is second child since we moved parent's children @@ -576,6 +578,7 @@ split_add_child(struct callchain_node *parent, parent->hit = period; parent->count = 1; } + return 0; } static enum match_result @@ -670,7 +673,10 @@ append_chain(struct callchain_node *root, /* we match only a part of the node. Split it and add the new chain */ if (matches < root->val_nr) { - split_add_child(root, cursor, cnode, start, matches, period); + if (split_add_child(root, cursor, cnode, start, matches, + period) < 0) + return MATCH_ERROR; + return MATCH_EQ; } -- cgit v0.10.2 From dca0d122e498c054b117bd4aa5568ce90ee142d5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 16 Feb 2016 23:08:24 +0900 Subject: perf callchain: Check return value of append_chain_children() Now it can check the error case, so check and pass it to the caller. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1455631723-17345-7-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 5259379..24b4bd0 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -586,7 +586,7 @@ append_chain(struct callchain_node *root, struct callchain_cursor *cursor, u64 period); -static void +static int append_chain_children(struct callchain_node *root, struct callchain_cursor *cursor, u64 period) @@ -598,7 +598,7 @@ append_chain_children(struct callchain_node *root, node = callchain_cursor_current(cursor); if (!node) - return; + return -1; /* lookup in childrens */ while (*p) { @@ -611,6 +611,8 @@ append_chain_children(struct callchain_node *root, ret = append_chain(rnode, cursor, period); if (ret == MATCH_EQ) goto inc_children_hit; + if (ret == MATCH_ERROR) + return -1; if (ret == MATCH_LT) p = &parent->rb_left; @@ -620,7 +622,7 @@ append_chain_children(struct callchain_node *root, /* nothing in children, add to the current node */ rnode = add_child(root, cursor, period); if (rnode == NULL) - return; + return -1; rb_link_node(&rnode->rb_node_in, parent, p); rb_insert_color(&rnode->rb_node_in, &root->rb_root_in); @@ -628,6 +630,7 @@ append_chain_children(struct callchain_node *root, inc_children_hit: root->children_hit += period; root->children_count++; + return 0; } static enum match_result @@ -688,7 +691,8 @@ append_chain(struct callchain_node *root, } /* We match the node and still have a part remaining */ - append_chain_children(root, cursor, period); + if (append_chain_children(root, cursor, period) < 0) + return MATCH_ERROR; return MATCH_EQ; } @@ -702,7 +706,8 @@ int callchain_append(struct callchain_root *root, callchain_cursor_commit(cursor); - append_chain_children(&root->node, cursor, period); + if (append_chain_children(&root->node, cursor, period) < 0) + return 
-1; if (cursor->nr > root->max_depth) root->max_depth = cursor->nr; @@ -730,7 +735,8 @@ merge_chain_branch(struct callchain_cursor *cursor, if (src->hit) { callchain_cursor_commit(cursor); - append_chain_children(dst, cursor, src->hit); + if (append_chain_children(dst, cursor, src->hit) < 0) + return -1; } n = rb_first(&src->rb_root_in); -- cgit v0.10.2 From bba58cdfaace2eb96d2b3cabc610d2ba033371c8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 16 Feb 2016 23:08:25 +0900 Subject: perf hists: Return error from hists__collapse_resort() Currently hists__collapse_resort() and hists__collapse_insert_entry() don't return an error code. Now that callchain_merge() can check for errors, abort and pass the error to the user. A later patch can add more work which also can fail. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1455631723-17345-8-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index a856617..827c6cb 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1046,8 +1046,8 @@ int hist_entry__snprintf_alignment(struct hist_entry *he, struct perf_hpp *hpp, * collapse the histogram */ -bool hists__collapse_insert_entry(struct hists *hists __maybe_unused, - struct rb_root *root, struct hist_entry *he) +int hists__collapse_insert_entry(struct hists *hists, struct rb_root *root, + struct hist_entry *he) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -1061,18 +1061,21 @@ bool hists__collapse_insert_entry(struct hists *hists __maybe_unused, cmp = hist_entry__collapse(iter, he); if (!cmp) { + int ret = 0; + he_stat__add_stat(&iter->stat, &he->stat); if (symbol_conf.cumulate_callchain) he_stat__add_stat(iter->stat_acc, he->stat_acc); if (symbol_conf.use_callchain) { callchain_cursor_reset(&callchain_cursor); - callchain_merge(&callchain_cursor, - iter->callchain, - he->callchain); + if (callchain_merge(&callchain_cursor, + iter->callchain, + he->callchain) < 0) + ret = -1; } hist_entry__delete(he); - return false; + return ret; } if (cmp < 0) @@ -1084,7 +1087,7 @@ bool hists__collapse_insert_entry(struct hists *hists __maybe_unused, rb_link_node(&he->rb_node_in, parent, p); rb_insert_color(&he->rb_node_in, root); - return true; + return 1; } struct rb_root *hists__get_rotate_entries_in(struct hists *hists) @@ -1110,14 +1113,15 @@ static void hists__apply_filters(struct hists *hists, struct hist_entry *he) hists__filter_entry_by_socket(hists, he); } -void hists__collapse_resort(struct hists *hists, struct ui_progress *prog) +int hists__collapse_resort(struct hists *hists, struct ui_progress *prog) { struct rb_root *root; struct rb_node *next; struct hist_entry *n; + int ret; if (!sort__need_collapse) - return; + return 0; hists->nr_entries = 0; @@ -1132,7 +1136,11 @@ void hists__collapse_resort(struct hists *hists, struct ui_progress *prog) next = rb_next(&n->rb_node_in); rb_erase(&n->rb_node_in, root); - if (hists__collapse_insert_entry(hists, &hists->entries_collapsed, n)) { + ret = hists__collapse_insert_entry(hists, &hists->entries_collapsed, n); + if (ret < 0) + return -1; + + if (ret) { /* * If it wasn't combined with one of the entries already * collapsed, we need to apply the filters that may have @@ -1143,6 +1151,7 @@ void hists__collapse_resort(struct hists *hists, struct ui_progress *prog) if (prog) ui_progress__update(prog, 1); } + return 0; } 
static int hist_entry__sort(struct hist_entry *a, struct hist_entry *b) diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 045a9e7..97baa1d 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -138,7 +138,7 @@ void hist_entry__delete(struct hist_entry *he); void perf_evsel__output_resort(struct perf_evsel *evsel, struct ui_progress *prog); void hists__output_resort(struct hists *hists, struct ui_progress *prog); -void hists__collapse_resort(struct hists *hists, struct ui_progress *prog); +int hists__collapse_resort(struct hists *hists, struct ui_progress *prog); void hists__decay_entries(struct hists *hists, bool zap_user, bool zap_kernel); void hists__delete_entries(struct hists *hists); @@ -197,7 +197,7 @@ int hists__init(void); int __hists__init(struct hists *hists, struct perf_hpp_list *hpp_list); struct rb_root *hists__get_rotate_entries_in(struct hists *hists); -bool hists__collapse_insert_entry(struct hists *hists __maybe_unused, +int hists__collapse_insert_entry(struct hists *hists, struct rb_root *root, struct hist_entry *he); struct perf_hpp { -- cgit v0.10.2 From 5b2ea6f2f6ac81a230e6cc68e1473e796a583f00 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 16 Feb 2016 23:08:26 +0900 Subject: perf report: Check error during report__collapse_hists() If it returns an error, warn user and bail out instead of silently ignoring it. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1455631723-17345-9-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 1eab50a..760e886 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -469,10 +469,11 @@ static int report__browse_hists(struct report *rep) return ret; } -static void report__collapse_hists(struct report *rep) +static int report__collapse_hists(struct report *rep) { struct ui_progress prog; struct perf_evsel *pos; + int ret = 0; ui_progress__init(&prog, rep->nr_entries, "Merging related events..."); @@ -484,7 +485,9 @@ static void report__collapse_hists(struct report *rep) hists->socket_filter = rep->socket_filter; - hists__collapse_resort(hists, &prog); + ret = hists__collapse_resort(hists, &prog); + if (ret < 0) + break; /* Non-group events are considered as leader */ if (symbol_conf.event_group && @@ -497,6 +500,7 @@ static void report__collapse_hists(struct report *rep) } ui_progress__finish(); + return ret; } static void report__output_resort(struct report *rep) @@ -564,7 +568,11 @@ static int __cmd_report(struct report *rep) } } - report__collapse_hists(rep); + ret = report__collapse_hists(rep); + if (ret) { + ui__error("failed to process hist entry\n"); + return ret; + } if (session_done()) return 0; -- cgit v0.10.2 From b176862fca8625b0a8bee207bca9b611413e5e24 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 18 Feb 2016 21:00:41 +0100 Subject: x86/mm/ptdump: Remove paravirt_enabled() is_hypervisor_range() can simply check if the PGD index is within ffff800000000000 - ffff87ffffffffff which is the range reserved for a hypervisor. That range is practically an ABI, see Documentation/x86/x86_64/mm.txt. Tested-by: Boris Ostrovsky # Under Xen, as PV guest Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455825641-19585-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 4a6f1d9..99bfb19 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -358,20 +358,19 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, #define pgd_none(a) pud_none(__pud(pgd_val(a))) #endif -#ifdef CONFIG_X86_64 static inline bool is_hypervisor_range(int idx) { +#ifdef CONFIG_X86_64 /* * ffff800000000000 - ffff87ffffffffff is reserved for * the hypervisor. */ - return paravirt_enabled() && - (idx >= pgd_index(__PAGE_OFFSET) - 16) && - (idx < pgd_index(__PAGE_OFFSET)); -} + return (idx >= pgd_index(__PAGE_OFFSET) - 16) && + (idx < pgd_index(__PAGE_OFFSET)); #else -static inline bool is_hypervisor_range(int idx) { return false; } + return false; #endif +} static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, bool checkwx) -- cgit v0.10.2 From c25323c07345a843a56a294047b130dfd9250fad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 18 Feb 2016 20:53:43 +0100 Subject: x86/tsc: Use topology functions It's simpler to look at the topology mask than iterating over all online cpus to find a cpu on the same package. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3d743da..5a6cb46 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1246,14 +1246,14 @@ void __init tsc_init(void) */ unsigned long calibrate_delay_is_known(void) { - int i, cpu = smp_processor_id(); + int sibling, cpu = smp_processor_id(); if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) return 0; - for_each_online_cpu(i) - if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id) - return cpu_data(i).loops_per_jiffy; + sibling = cpumask_any_but(topology_core_cpumask(cpu), cpu); + if (sibling < nr_cpu_ids) + return cpu_data(sibling).loops_per_jiffy; return 0; } #endif -- cgit v0.10.2 From 56e5fd8feb286ab71f4ca7674505b0d17967376d Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sun, 21 Feb 2016 11:35:00 -0300 Subject: ASoC: fsl_ssi: Go back to explicit register defaults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 5c408fee2546 ("ASoC: fsl_ssi: remove explicit register defaults") causes the driver to fail to probe: fsl-ssi-dai 2028000.ssi: No cache defaults, reading back from HW fsl-ssi-dai 2028000.ssi: Failed to init register map fsl-ssi-dai: probe of 2028000.ssi failed with error -22 , so revert this commit. 
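
A short aside on the failure mode behind the revert: without a static reg_defaults table, the regmap cache has to obtain its defaults by reading them back from the device at probe time ("No cache defaults, reading back from HW"), and that read-back is what fails for this SSI. The toy program below models the difference in plain C; it is not the regmap API, the helper names are invented, and only the SIER default value 0x00003003 is taken from the diff.

#include <stdio.h>

#define NREGS 4

struct reg_default { unsigned int reg; unsigned int def; };

static const struct reg_default defaults[] = {
	{ 0, 0x00000000 }, { 1, 0x00003003 }, { 2, 0x00000200 }, { 3, 0x00040000 },
};

/* pretend the hardware cannot be read yet, as during this driver's probe */
static int hw_read(unsigned int reg, unsigned int *val)
{
	(void)reg; (void)val;
	return -1;
}

static int init_cache_from_hw(unsigned int *cache)
{
	for (unsigned int r = 0; r < NREGS; r++)
		if (hw_read(r, &cache[r]) < 0)
			return -1;	/* analogue of "Failed to init register map" */
	return 0;
}

static void init_cache_from_table(unsigned int *cache)
{
	for (unsigned int i = 0; i < NREGS; i++)
		cache[defaults[i].reg] = defaults[i].def;
}

int main(void)
{
	unsigned int cache[NREGS] = { 0 };

	if (init_cache_from_hw(cache) < 0)
		printf("read-back of defaults failed (the probe error above)\n");

	init_cache_from_table(cache);	/* explicit defaults need no device access */
	printf("SIER default seeded as 0x%08x\n", cache[1]);
	return 0;
}
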
Reported-by: Mika Penttilä Signed-off-by: Fabio Estevam Signed-off-by: Mark Brown diff --git a/sound/soc/fsl/fsl_ssi.c b/sound/soc/fsl/fsl_ssi.c index ed8de10..40dfd8a 100644 --- a/sound/soc/fsl/fsl_ssi.c +++ b/sound/soc/fsl/fsl_ssi.c @@ -112,6 +112,20 @@ struct fsl_ssi_rxtx_reg_val { struct fsl_ssi_reg_val tx; }; +static const struct reg_default fsl_ssi_reg_defaults[] = { + {CCSR_SSI_SCR, 0x00000000}, + {CCSR_SSI_SIER, 0x00003003}, + {CCSR_SSI_STCR, 0x00000200}, + {CCSR_SSI_SRCR, 0x00000200}, + {CCSR_SSI_STCCR, 0x00040000}, + {CCSR_SSI_SRCCR, 0x00040000}, + {CCSR_SSI_SACNT, 0x00000000}, + {CCSR_SSI_STMSK, 0x00000000}, + {CCSR_SSI_SRMSK, 0x00000000}, + {CCSR_SSI_SACCEN, 0x00000000}, + {CCSR_SSI_SACCDIS, 0x00000000}, +}; + static bool fsl_ssi_readable_reg(struct device *dev, unsigned int reg) { switch (reg) { @@ -176,7 +190,8 @@ static const struct regmap_config fsl_ssi_regconfig = { .val_bits = 32, .reg_stride = 4, .val_format_endian = REGMAP_ENDIAN_NATIVE, - .num_reg_defaults_raw = CCSR_SSI_SACCDIS / sizeof(uint32_t) + 1, + .reg_defaults = fsl_ssi_reg_defaults, + .num_reg_defaults = ARRAY_SIZE(fsl_ssi_reg_defaults), .readable_reg = fsl_ssi_readable_reg, .volatile_reg = fsl_ssi_volatile_reg, .precious_reg = fsl_ssi_precious_reg, @@ -186,7 +201,6 @@ static const struct regmap_config fsl_ssi_regconfig = { struct fsl_ssi_soc_data { bool imx; - bool imx21regs; /* imx21-class SSI - no SACC{ST,EN,DIS} regs */ bool offline_config; u32 sisr_write_mask; }; @@ -289,7 +303,6 @@ static struct fsl_ssi_soc_data fsl_ssi_mpc8610 = { static struct fsl_ssi_soc_data fsl_ssi_imx21 = { .imx = true, - .imx21regs = true, .offline_config = true, .sisr_write_mask = 0, }; @@ -573,12 +586,8 @@ static void fsl_ssi_setup_ac97(struct fsl_ssi_private *ssi_private) */ regmap_write(regs, CCSR_SSI_SACNT, CCSR_SSI_SACNT_AC97EN | CCSR_SSI_SACNT_FV); - - /* no SACC{ST,EN,DIS} regs on imx21-class SSI */ - if (!ssi_private->soc->imx21regs) { - regmap_write(regs, CCSR_SSI_SACCDIS, 0xff); - regmap_write(regs, CCSR_SSI_SACCEN, 0x300); - } + regmap_write(regs, CCSR_SSI_SACCDIS, 0xff); + regmap_write(regs, CCSR_SSI_SACCEN, 0x300); /* * Enable SSI, Transmit and Receive. AC97 has to communicate with the @@ -1388,7 +1397,6 @@ static int fsl_ssi_probe(struct platform_device *pdev) struct resource *res; void __iomem *iomem; char name[64]; - struct regmap_config regconfig = fsl_ssi_regconfig; of_id = of_match_device(fsl_ssi_ids, &pdev->dev); if (!of_id || !of_id->data) @@ -1436,25 +1444,15 @@ static int fsl_ssi_probe(struct platform_device *pdev) return PTR_ERR(iomem); ssi_private->ssi_phys = res->start; - if (ssi_private->soc->imx21regs) { - /* - * According to datasheet imx21-class SSI - * don't have SACC{ST,EN,DIS} regs. 
- */ - regconfig.max_register = CCSR_SSI_SRMSK; - regconfig.num_reg_defaults_raw = - CCSR_SSI_SRMSK / sizeof(uint32_t) + 1; - } - ret = of_property_match_string(np, "clock-names", "ipg"); if (ret < 0) { ssi_private->has_ipg_clk_name = false; ssi_private->regs = devm_regmap_init_mmio(&pdev->dev, iomem, - ®config); + &fsl_ssi_regconfig); } else { ssi_private->has_ipg_clk_name = true; ssi_private->regs = devm_regmap_init_mmio_clk(&pdev->dev, - "ipg", iomem, ®config); + "ipg", iomem, &fsl_ssi_regconfig); } if (IS_ERR(ssi_private->regs)) { dev_err(&pdev->dev, "Failed to init register map\n"); -- cgit v0.10.2 From e267d97b83d9cecc16c54825f9f3ac7f72dc1e1e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:12 -0800 Subject: asm-generic: Consolidate mark_rodata_ro() Instead of defining mark_rodata_ro() in each architecture, consolidate it. Signed-off-by: Kees Cook Acked-by: Will Deacon Cc: Andrew Morton Cc: Andy Gross Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Ashok Kumar Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Catalin Marinas Cc: Dan Williams Cc: David Brown Cc: David Hildenbrand Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Helge Deller Cc: James E.J. Bottomley Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Marc Zyngier Cc: Mark Rutland Cc: Mathias Krause Cc: Michael Ellerman Cc: Nicolas Pitre Cc: PaX Team Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Ross Zwisler Cc: Russell King Cc: Rusty Russell Cc: Stephen Boyd Cc: Thomas Gleixner Cc: Toshi Kani Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: linux-parisc@vger.kernel.org Link: http://lkml.kernel.org/r/1455748879-21872-2-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index d5525bf..9156fc3 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -491,7 +491,6 @@ static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } #endif #ifdef CONFIG_DEBUG_RODATA -void mark_rodata_ro(void); void set_kernel_text_rw(void); void set_kernel_text_ro(void); #else diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 7fc294c..22dda61 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -156,8 +156,4 @@ int set_memory_rw(unsigned long addr, int numpages); int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); -#ifdef CONFIG_DEBUG_RODATA -void mark_rodata_ro(void); -#endif - #endif diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 845272c..7bd69bd 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -121,10 +121,6 @@ flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vma } } -#ifdef CONFIG_DEBUG_RODATA -void mark_rodata_ro(void); -#endif - #include #define ARCH_HAS_KMAP diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index e63aa38..c8cff75 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -92,7 +92,6 @@ void clflush_cache_range(void *addr, unsigned int size); #define mmio_flush_range(addr, size) clflush_cache_range(addr, size) #ifdef CONFIG_DEBUG_RODATA -void mark_rodata_ro(void); extern const int rodata_test_data; extern int kernel_set_to_readonly; 
void set_kernel_text_rw(void); diff --git a/include/linux/init.h b/include/linux/init.h index b449f37..aedb254 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -142,6 +142,10 @@ void prepare_namespace(void); void __init load_default_modules(void); int __init init_rootfs(void); +#ifdef CONFIG_DEBUG_RODATA +void mark_rodata_ro(void); +#endif + extern void (*late_time_init)(void); extern bool initcall_debug; -- cgit v0.10.2 From d2aa1acad22f1bdd0cfa67b3861800e392254454 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:13 -0800 Subject: mm/init: Add 'rodata=off' boot cmdline parameter to disable read-only kernel mappings It may be useful to debug writes to the readonly sections of memory, so provide a cmdline "rodata=off" to allow for this. This can be expanded in the future to support "log" and "write" modes, but that will need to be architecture-specific. This also makes KDB software breakpoints more usable, as read-only mappings can now be disabled on any kernel. Suggested-by: H. Peter Anvin Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-3-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9a53c92..0003367 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3491,6 +3491,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. ro [KNL] Mount root device read-only on boot + rodata= [KNL] + on Mark read-only kernel memory as read-only (default). + off Leave read-only kernel memory writable for debugging. + root= [KNL] Root filesystem See name_to_dev_t comment in init/do_mounts.c. 
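
The parameter is reduced to a single boolean in init/main.c (next hunk) via strtobool(). As a rough illustration of that style of parsing, here is a small user-space sketch; parse_bool() is an invented helper for the example, not the kernel's strtobool(), whose accepted spellings differ slightly.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static int parse_bool(const char *s, bool *res)
{
	if (!strcmp(s, "1") || !strcmp(s, "y") || !strcmp(s, "on"))
		*res = true;
	else if (!strcmp(s, "0") || !strcmp(s, "n") || !strcmp(s, "off"))
		*res = false;
	else
		return -1;	/* unknown value: keep the default */
	return 0;
}

int main(void)
{
	bool rodata_enabled = true;	/* default: protection on */

	if (parse_bool("off", &rodata_enabled) == 0 && !rodata_enabled)
		printf("Kernel memory protection disabled.\n");
	else
		printf("marking rodata read-only\n");
	return 0;
}
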
diff --git a/init/main.c b/init/main.c index 58c9e37..928a343 100644 --- a/init/main.c +++ b/init/main.c @@ -93,9 +93,6 @@ static int kernel_init(void *); extern void init_IRQ(void); extern void fork_init(void); extern void radix_tree_init(void); -#ifndef CONFIG_DEBUG_RODATA -static inline void mark_rodata_ro(void) { } -#endif /* * Debug helper: via this flag we know that we are in 'early bootup code' @@ -929,6 +926,28 @@ static int try_to_run_init_process(const char *init_filename) static noinline void __init kernel_init_freeable(void); +#ifdef CONFIG_DEBUG_RODATA +static bool rodata_enabled = true; +static int __init set_debug_rodata(char *str) +{ + return strtobool(str, &rodata_enabled); +} +__setup("rodata=", set_debug_rodata); + +static void mark_readonly(void) +{ + if (rodata_enabled) + mark_rodata_ro(); + else + pr_info("Kernel memory protection disabled.\n"); +} +#else +static inline void mark_readonly(void) +{ + pr_warn("This architecture does not have kernel memory protection.\n"); +} +#endif + static int __ref kernel_init(void *unused) { int ret; @@ -937,7 +956,7 @@ static int __ref kernel_init(void *unused) /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); free_initmem(); - mark_rodata_ro(); + mark_readonly(); system_state = SYSTEM_RUNNING; numa_default_policy(); diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index e1dbf4a..90ff129 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -153,13 +153,11 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp) } else { kdb_printf("%s: failed to set breakpoint at 0x%lx\n", __func__, bp->bp_addr); -#ifdef CONFIG_DEBUG_RODATA if (!bp->bp_type) { kdb_printf("Software breakpoints are unavailable.\n" - " Change the kernel CONFIG_DEBUG_RODATA=n\n" + " Boot the kernel with rodata=off\n" " OR use hw breaks: help bph\n"); } -#endif return 1; } return 0; -- cgit v0.10.2 From 9ccaf77cf05915f51231d158abfd5448aedde758 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:14 -0800 Subject: x86/mm: Always enable CONFIG_DEBUG_RODATA and remove the Kconfig option This removes the CONFIG_DEBUG_RODATA option and makes it always enabled. This simplifies the code and also makes it clearer that read-only mapped memory is just as fundamental a security feature in kernel-space as it is in user-space. Suggested-by: Ingo Molnar Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-4-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c46662f..b105105 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -303,6 +303,9 @@ config ARCH_SUPPORTS_UPROBES config FIX_EARLYCON_MEM def_bool y +config DEBUG_RODATA + def_bool y + config PGTABLE_LEVELS int default 4 if X86_64 diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 9b18ed9..7816b7b 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -74,28 +74,16 @@ config EFI_PGT_DUMP issues with the mapping of the EFI runtime regions into that table. 
-config DEBUG_RODATA - bool "Write protect kernel read-only data structures" - default y - depends on DEBUG_KERNEL - ---help--- - Mark the kernel read-only data as write-protected in the pagetables, - in order to catch accidental (and incorrect) writes to such const - data. This is recommended so that we can catch kernel bugs sooner. - If in doubt, say "Y". - config DEBUG_RODATA_TEST - bool "Testcase for the DEBUG_RODATA feature" - depends on DEBUG_RODATA + bool "Testcase for the marking rodata read-only" default y ---help--- - This option enables a testcase for the DEBUG_RODATA - feature as well as for the change_page_attr() infrastructure. + This option enables a testcase for the setting rodata read-only + as well as for the change_page_attr() infrastructure. If in doubt, say "N" config DEBUG_WX bool "Warn on W+X mappings at boot" - depends on DEBUG_RODATA select X86_PTDUMP_CORE ---help--- Generate a warning if any W+X mappings are found at boot. diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index c8cff75..61518cf 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -91,15 +91,10 @@ void clflush_cache_range(void *addr, unsigned int size); #define mmio_flush_range(addr, size) clflush_cache_range(addr, size) -#ifdef CONFIG_DEBUG_RODATA extern const int rodata_test_data; extern int kernel_set_to_readonly; void set_kernel_text_rw(void); void set_kernel_text_ro(void); -#else -static inline void set_kernel_text_rw(void) { } -static inline void set_kernel_text_ro(void) { } -#endif #ifdef CONFIG_DEBUG_RODATA_TEST int rodata_test(void); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index c1adf33..bc62e7c 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -17,15 +17,8 @@ static inline bool kvm_check_and_clear_guest_paused(void) } #endif /* CONFIG_KVM_GUEST */ -#ifdef CONFIG_DEBUG_RODATA #define KVM_HYPERCALL \ ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL) -#else -/* On AMD processors, vmcall will generate a trap that we will - * then rewrite to the appropriate instruction. - */ -#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" -#endif /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall * instruction. The hypervisor may replace it with something else but only the diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 0a52424..13b6cdd 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -7,7 +7,7 @@ extern char __brk_base[], __brk_limit[]; extern struct exception_table_entry __stop___ex_table[]; -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) extern char __end_rodata_hpage_align[]; #endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 29408d6..05c9e3f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -81,9 +81,9 @@ within(unsigned long addr, unsigned long start, unsigned long end) static unsigned long text_ip_addr(unsigned long ip) { /* - * On x86_64, kernel text mappings are mapped read-only with - * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead - * of the kernel text mapping to modify the kernel text. + * On x86_64, kernel text mappings are mapped read-only, so we use + * the kernel identity mapping instead of the kernel text mapping + * to modify the kernel text. 
* * For 32bit kernels, these mappings are same and we can use * kernel identity mapping to modify code. diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 44256a6..ed15cd48 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -750,9 +750,7 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { int err; -#ifdef CONFIG_DEBUG_RODATA char opc[BREAK_INSTR_SIZE]; -#endif /* CONFIG_DEBUG_RODATA */ bpt->type = BP_BREAKPOINT; err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, @@ -761,7 +759,6 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) return err; err = probe_kernel_write((char *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); -#ifdef CONFIG_DEBUG_RODATA if (!err) return err; /* @@ -778,13 +775,12 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE)) return -EINVAL; bpt->type = BP_POKE_BREAKPOINT; -#endif /* CONFIG_DEBUG_RODATA */ + return err; } int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) { -#ifdef CONFIG_DEBUG_RODATA int err; char opc[BREAK_INSTR_SIZE]; @@ -801,8 +797,8 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE)) goto knl_write; return err; + knl_write: -#endif /* CONFIG_DEBUG_RODATA */ return probe_kernel_write((char *)bpt->bpt_addr, (char *)bpt->saved_instr, BREAK_INSTR_SIZE); } diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c index 3f92ce0..27538f1 100644 --- a/arch/x86/kernel/test_nx.c +++ b/arch/x86/kernel/test_nx.c @@ -142,7 +142,6 @@ static int test_NX(void) * by the error message */ -#ifdef CONFIG_DEBUG_RODATA /* Test 3: Check if the .rodata section is executable */ if (rodata_test_data != 0xC3) { printk(KERN_ERR "test_nx: .rodata marker has invalid value\n"); @@ -151,7 +150,6 @@ static int test_NX(void) printk(KERN_ERR "test_nx: .rodata section is executable\n"); ret = -ENODEV; } -#endif #if 0 /* Test 4: Check if the .data section of a module is executable */ diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c index 5ecbfe5..cb4a01b 100644 --- a/arch/x86/kernel/test_rodata.c +++ b/arch/x86/kernel/test_rodata.c @@ -76,5 +76,5 @@ int rodata_test(void) } MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure"); +MODULE_DESCRIPTION("Testcase for marking rodata as read-only"); MODULE_AUTHOR("Arjan van de Ven "); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 74e4bf1..fe133b7 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -41,29 +41,28 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) /* - * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA - * we retain large page mappings for boundaries spanning kernel text, rodata - * and data sections. + * On 64-bit, align RODATA to 2MB so we retain large page mappings for + * boundaries spanning kernel text, rodata and data sections. * * However, kernel identity mappings will have different RWX permissions * to the pages mapping to text and to the pages padding (which are freed) the * text section. Hence kernel identity mappings will be broken to smaller * pages. 
For 64-bit, kernel text and kernel identity mappings are different, - * so we can enable protection checks that come with CONFIG_DEBUG_RODATA, - * as well as retain 2MB large page mappings for kernel text. + * so we can enable protection checks as well as retain 2MB large page + * mappings for kernel text. */ -#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); +#define X64_ALIGN_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); -#define X64_ALIGN_DEBUG_RODATA_END \ +#define X64_ALIGN_RODATA_END \ . = ALIGN(HPAGE_SIZE); \ __end_rodata_hpage_align = .; #else -#define X64_ALIGN_DEBUG_RODATA_BEGIN -#define X64_ALIGN_DEBUG_RODATA_END +#define X64_ALIGN_RODATA_BEGIN +#define X64_ALIGN_RODATA_END #endif @@ -112,13 +111,11 @@ SECTIONS EXCEPTION_TABLE(16) :text = 0x9090 -#if defined(CONFIG_DEBUG_RODATA) /* .text should occupy whole number of pages */ . = ALIGN(PAGE_SIZE); -#endif - X64_ALIGN_DEBUG_RODATA_BEGIN + X64_ALIGN_RODATA_BEGIN RO_DATA(PAGE_SIZE) - X64_ALIGN_DEBUG_RODATA_END + X64_ALIGN_RODATA_END /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index cb4ef3d..2ebfbaf 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -871,7 +871,6 @@ static noinline int do_test_wp_bit(void) return flag; } -#ifdef CONFIG_DEBUG_RODATA const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); @@ -960,5 +959,3 @@ void mark_rodata_ro(void) if (__supported_pte_mask & _PAGE_NX) debug_checkwx(); } -#endif - diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5488d21..a40b755 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1074,7 +1074,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -#ifdef CONFIG_DEBUG_RODATA const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); @@ -1166,8 +1165,6 @@ void mark_rodata_ro(void) debug_checkwx(); } -#endif - int kern_addr_valid(unsigned long addr) { unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2440814..2450488 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -283,7 +283,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, __pa_symbol(__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; -#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +#if defined(CONFIG_X86_64) /* * Once the kernel maps the text as RO (kernel_set_to_readonly is set), * kernel text mappings for the large page aligned text, rodata sections -- cgit v0.10.2 From c74ba8b3480da6ddaea17df2263ec09b869ac496 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:15 -0800 Subject: arch: Introduce post-init read-only memory One of the easiest ways to protect the kernel from attack is to reduce the internal attack surface exposed when a "write" flaw is available. By making as much of the kernel read-only as possible, we reduce the attack surface. Many things are written to only during __init, and never changed again. These cannot be made "const" since the compiler will do the wrong thing (we do actually need to write to them). Instead, move these items into a memory region that will be made read-only during mark_rodata_ro() which happens after all kernel __init code has finished. This introduces __ro_after_init as a way to mark such memory, and adds some documentation about the existing __read_mostly marking. 
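
Marking data __ro_after_init only takes an annotation at the definition site; the macro added below in include/linux/cache.h places the object in a dedicated .data..ro_after_init section, and mark_rodata_ro() later write-protects that section. The sketch below is a user-space model (ELF toolchain assumed, variable name invented) that shows just the section placement, not the protection step.

#include <stdio.h>

/* same expansion as the include/linux/cache.h fallback in this patch */
#define __ro_after_init __attribute__((__section__(".data..ro_after_init")))

static int boot_config __ro_after_init;	/* written once during "init" */

static void init_phase(void)
{
	boot_config = 42;	/* still writable before protection is applied */
}

int main(void)
{
	init_phase();
	printf("boot_config = %d\n", boot_config);
	return 0;
}
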
This improves the security of the Linux kernel by marking formerly read-write memory regions as read-only on a fully booted up system. Based on work by PaX Team and Brad Spengler. Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brad Spengler Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-5-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/parisc/include/asm/cache.h b/arch/parisc/include/asm/cache.h index 3d0e17b..df0f52b 100644 --- a/arch/parisc/include/asm/cache.h +++ b/arch/parisc/include/asm/cache.h @@ -22,6 +22,9 @@ #define __read_mostly __attribute__((__section__(".data..read_mostly"))) +/* Read-only memory is marked before mark_rodata_ro() is called. */ +#define __ro_after_init __read_mostly + void parisc_cache_init(void); /* initializes cache-flushing */ void disable_sr_hashing_asm(int); /* low level support for above */ void disable_sr_hashing(void); /* turns off space register hashing */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c4bd0e2..772c784 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -256,6 +256,7 @@ .rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_rodata) = .; \ *(.rodata) *(.rodata.*) \ + *(.data..ro_after_init) /* Read only after init */ \ *(__vermagic) /* Kernel version magic */ \ . = ALIGN(8); \ VMLINUX_SYMBOL(__start___tracepoints_ptrs) = .; \ diff --git a/include/linux/cache.h b/include/linux/cache.h index 17e7e82..1be04f8 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -12,10 +12,24 @@ #define SMP_CACHE_BYTES L1_CACHE_BYTES #endif +/* + * __read_mostly is used to keep rarely changing variables out of frequently + * updated cachelines. If an architecture doesn't support it, ignore the + * hint. + */ #ifndef __read_mostly #define __read_mostly #endif +/* + * __ro_after_init is used to mark things that are read-only after init (i.e. + * after mark_rodata_ro() has been called). These are effectively read-only, + * but may get written to during init, so can't live in .rodata (via "const"). + */ +#ifndef __ro_after_init +#define __ro_after_init __attribute__((__section__(".data..ro_after_init"))) +#endif + #ifndef ____cacheline_aligned #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) #endif -- cgit v0.10.2 From 7cca071ccbd2a293ea69168ace6abbcdce53098e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:16 -0800 Subject: lkdtm: Verify that '__ro_after_init' works correctly The new __ro_after_init section should be writable before init, but not after. Validate that it gets updated at init and can't be written to afterwards. Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-6-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/drivers/misc/lkdtm.c b/drivers/misc/lkdtm.c index 11fdadc..2a6eaf1 100644 --- a/drivers/misc/lkdtm.c +++ b/drivers/misc/lkdtm.c @@ -103,6 +103,7 @@ enum ctype { CT_EXEC_USERSPACE, CT_ACCESS_USERSPACE, CT_WRITE_RO, + CT_WRITE_RO_AFTER_INIT, CT_WRITE_KERN, }; @@ -140,6 +141,7 @@ static char* cp_type[] = { "EXEC_USERSPACE", "ACCESS_USERSPACE", "WRITE_RO", + "WRITE_RO_AFTER_INIT", "WRITE_KERN", }; @@ -162,6 +164,7 @@ static DEFINE_SPINLOCK(lock_me_up); static u8 data_area[EXEC_SIZE]; static const unsigned long rodata = 0xAA55AA55; +static unsigned long ro_after_init __ro_after_init = 0x55AA5500; module_param(recur_count, int, 0644); MODULE_PARM_DESC(recur_count, " Recursion level for the stack overflow test"); @@ -503,11 +506,28 @@ static void lkdtm_do_action(enum ctype which) break; } case CT_WRITE_RO: { - unsigned long *ptr; + /* Explicitly cast away "const" for the test. */ + unsigned long *ptr = (unsigned long *)&rodata; - ptr = (unsigned long *)&rodata; + pr_info("attempting bad rodata write at %p\n", ptr); + *ptr ^= 0xabcd1234; - pr_info("attempting bad write at %p\n", ptr); + break; + } + case CT_WRITE_RO_AFTER_INIT: { + unsigned long *ptr = &ro_after_init; + + /* + * Verify we were written to during init. Since an Oops + * is considered a "success", a failure is to just skip the + * real test. + */ + if ((*ptr & 0xAA) != 0xAA) { + pr_info("%p was NOT written during init!?\n", ptr); + break; + } + + pr_info("attempting bad ro_after_init write at %p\n", ptr); *ptr ^= 0xabcd1234; break; @@ -817,6 +837,9 @@ static int __init lkdtm_module_init(void) int n_debugfs_entries = 1; /* Assume only the direct entry */ int i; + /* Make sure we can write to __ro_after_init values during __init */ + ro_after_init |= 0xAA; + /* Register debugfs interface */ lkdtm_debugfs_root = debugfs_create_dir("provoke-crash", NULL); if (!lkdtm_debugfs_root) { -- cgit v0.10.2 From 018ef8dcf3de5f62e2cc1a9273cc27e1c6ba8de5 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 17 Feb 2016 14:41:17 -0800 Subject: x86/vdso: Mark the vDSO code read-only after init The vDSO does not need to be writable after __init, so mark it as __ro_after_init. 
The result kills the exploit method of writing to the vDSO from kernel space resulting in userspace executing the modified code, as shown here to bypass SMEP restrictions: http://itszn.com/blog/?p=21 The memory map (with added vDSO address reporting) shows the vDSO moving into read-only memory: Before: [ 0.143067] vDSO @ ffffffff82004000 [ 0.143551] vDSO @ ffffffff82006000 ---[ High Kernel Mapping ]--- 0xffffffff80000000-0xffffffff81000000 16M pmd 0xffffffff81000000-0xffffffff81800000 8M ro PSE GLB x pmd 0xffffffff81800000-0xffffffff819f3000 1996K ro GLB x pte 0xffffffff819f3000-0xffffffff81a00000 52K ro NX pte 0xffffffff81a00000-0xffffffff81e00000 4M ro PSE GLB NX pmd 0xffffffff81e00000-0xffffffff81e05000 20K ro GLB NX pte 0xffffffff81e05000-0xffffffff82000000 2028K ro NX pte 0xffffffff82000000-0xffffffff8214f000 1340K RW GLB NX pte 0xffffffff8214f000-0xffffffff82281000 1224K RW NX pte 0xffffffff82281000-0xffffffff82400000 1532K RW GLB NX pte 0xffffffff82400000-0xffffffff83200000 14M RW PSE GLB NX pmd 0xffffffff83200000-0xffffffffc0000000 974M pmd After: [ 0.145062] vDSO @ ffffffff81da1000 [ 0.146057] vDSO @ ffffffff81da4000 ---[ High Kernel Mapping ]--- 0xffffffff80000000-0xffffffff81000000 16M pmd 0xffffffff81000000-0xffffffff81800000 8M ro PSE GLB x pmd 0xffffffff81800000-0xffffffff819f3000 1996K ro GLB x pte 0xffffffff819f3000-0xffffffff81a00000 52K ro NX pte 0xffffffff81a00000-0xffffffff81e00000 4M ro PSE GLB NX pmd 0xffffffff81e00000-0xffffffff81e0b000 44K ro GLB NX pte 0xffffffff81e0b000-0xffffffff82000000 2004K ro NX pte 0xffffffff82000000-0xffffffff8214c000 1328K RW GLB NX pte 0xffffffff8214c000-0xffffffff8227e000 1224K RW NX pte 0xffffffff8227e000-0xffffffff82400000 1544K RW GLB NX pte 0xffffffff82400000-0xffffffff83200000 14M RW PSE GLB NX pmd 0xffffffff83200000-0xffffffffc0000000 974M pmd Based on work by PaX Team and Brad Spengler. Signed-off-by: Kees Cook Acked-by: Andy Lutomirski Acked-by: H. Peter Anvin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brad Spengler Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Link: http://lkml.kernel.org/r/1455748879-21872-7-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 0224987..3f69326 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -140,7 +140,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, fprintf(outfile, "#include \n"); fprintf(outfile, "\n"); fprintf(outfile, - "static unsigned char raw_data[%lu] __page_aligned_data = {", + "static unsigned char raw_data[%lu] __ro_after_init __aligned(PAGE_SIZE) = {", mapping_size); for (j = 0; j < stripped_len; j++) { if (j % 10 == 0) -- cgit v0.10.2 From 11bf9b865898961cee60a41c483c9f27ec76e12e Mon Sep 17 00:00:00 2001 From: David Brown Date: Wed, 17 Feb 2016 14:41:18 -0800 Subject: ARM/vdso: Mark the vDSO code read-only after init Although the ARM vDSO is cleanly separated by code/data with the code being read-only in userspace mappings, the code page is still writable from the kernel. There have been exploits (such as http://itszn.com/blog/?p=21) that take advantage of this on x86 to go from a bad kernel write to full root. 
Prevent this specific exploit class on ARM as well by putting the vDSO code page in post-init read-only memory as well. Before: vdso: 1 text pages at base 80927000 root@Vexpress:/ cat /sys/kernel/debug/kernel_page_tables ---[ Modules ]--- ---[ Kernel Mapping ]--- 0x80000000-0x80100000 1M RW NX SHD 0x80100000-0x80600000 5M ro x SHD 0x80600000-0x80800000 2M ro NX SHD 0x80800000-0xbe000000 984M RW NX SHD After: vdso: 1 text pages at base 8072b000 root@Vexpress:/ cat /sys/kernel/debug/kernel_page_tables ---[ Modules ]--- ---[ Kernel Mapping ]--- 0x80000000-0x80100000 1M RW NX SHD 0x80100000-0x80600000 5M ro x SHD 0x80600000-0x80800000 2M ro NX SHD 0x80800000-0xbe000000 984M RW NX SHD Inspired by https://lkml.org/lkml/2016/1/19/494 based on work by the PaX Team, Brad Spengler, and Kees Cook. Signed-off-by: David Brown Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brad Spengler Cc: Brian Gerst Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: Nathan Lynch Cc: PaX Team Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Cc: linux-arch Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1455748879-21872-8-git-send-email-keescook@chromium.org Signed-off-by: Ingo Molnar diff --git a/arch/arm/vdso/vdso.S b/arch/arm/vdso/vdso.S index b2b97e3..a62a7b6 100644 --- a/arch/arm/vdso/vdso.S +++ b/arch/arm/vdso/vdso.S @@ -23,9 +23,8 @@ #include #include - __PAGE_ALIGNED_DATA - .globl vdso_start, vdso_end + .section .data..ro_after_init .balign PAGE_SIZE vdso_start: .incbin "arch/arm/vdso/vdso.so" -- cgit v0.10.2 From c7edd7f99cdaec65e9303b4c9edc19bb595564ed Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Fri, 19 Feb 2016 11:04:45 +0100 Subject: ARM: 8534/1: virt: fix hyp-stub build for pre-ARMv7 CPUs ARMv6 CPUs do not have virtualisation extensions, but hyp-stub.S is still included into the image to keep it generic. In order to use ARMv7 instructions during HYP initialisation, add -march=armv7-a flag to hyp-stub's build. On an ARMv6 CPU, __hyp_stub_install returns as soon as it detects that the mode isn't HYP, so we will never reach those instructions. 
Signed-off-by: Jean-Philippe Brucker Signed-off-by: Russell King diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 7a6a58e..43788b1 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -195,5 +195,7 @@ CFLAGS_font.o := -Dstatic= $(obj)/font.c: $(FONTC) $(call cmd,shipped) +AFLAGS_hyp-stub.o := -Wa,-march=armv7-a + $(obj)/hyp-stub.S: $(srctree)/arch/$(SRCARCH)/kernel/hyp-stub.S $(call cmd,shipped) diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 2c5f160..ad325a8 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -88,6 +88,7 @@ obj-$(CONFIG_DEBUG_LL) += debug.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_ARM_VIRT_EXT) += hyp-stub.o +AFLAGS_hyp-stub.o :=-Wa,-march=armv7-a ifeq ($(CONFIG_ARM_PSCI),y) obj-$(CONFIG_SMP) += psci_smp.o endif -- cgit v0.10.2 From 2c97b0d4a757eec7b83acfe3895d94ad4db13827 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 19 Feb 2016 19:47:04 -0300 Subject: perf tools: Fix build on older systems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In RHEL 6.7: CC /tmp/build/perf/util/parse-events.o cc1: warnings being treated as errors util/parse-events.c: In function ‘parse_events_add_cache’: util/parse-events.c:366: error: declaration of ‘error’ shadows a global declaration util/util.h:136: error: shadowed declaration is here Rename it to 'err'. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Fixes: 43d0b97817a4 ("perf tools: Enable config and setting names for legacy cache events") Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 2996aa4..2b87708 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -363,7 +363,7 @@ static int config_attr(struct perf_event_attr *attr, int parse_events_add_cache(struct list_head *list, int *idx, char *type, char *op_result1, char *op_result2, - struct parse_events_error *error, + struct parse_events_error *err, struct list_head *head_config) { struct perf_event_attr attr; @@ -425,7 +425,7 @@ int parse_events_add_cache(struct list_head *list, int *idx, attr.type = PERF_TYPE_HW_CACHE; if (head_config) { - if (config_attr(&attr, head_config, error, + if (config_attr(&attr, head_config, err, config_term_common)) return -EINVAL; -- cgit v0.10.2 From 58de6ed0a9a32e4b1cf22cc0c46ca16056763f19 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 19 Feb 2016 19:51:13 -0300 Subject: perf tools: Remove duplicate typedef config_term_func_t definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Older compilers don't like this, for instance, on RHEL6.7: CC /tmp/build/perf/util/parse-events.o util/parse-events.c:844: error: redefinition of typedef ‘config_term_func_t’ util/parse-events.c:353: note: previous declaration of ‘config_term_func_t’ was here So remove the second definition, that should've been just moved in 43d0b97817a4 ("perf tools: Enable config and setting names for legacy cache events"), not copied. 
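The failure mode is easy to reproduce outside of perf: redeclaring an identical typedef in the same scope only became legal with C11, so compilers of that vintage (gcc 4.4 on RHEL 6, as in the log above) reject the second declaration outright. A self-contained illustration follows; my_func_t is an arbitrary name, not a perf type.

/*
 * Accepted by a C11-aware gcc, but gcc 4.4 fails with
 * "error: redefinition of typedef 'my_func_t'".
 */
typedef int my_func_t(int arg);
typedef int my_func_t(int arg);	/* the kind of duplicate this patch removes */

int main(void)
{
	return 0;
}
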
Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Fixes: 43d0b97817a4 ("perf tools: Enable config and setting names for legacy cache events") Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 2b87708..b0b3295 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -841,10 +841,6 @@ void parse_events__shrink_config_terms(void) config_term_shrinked = true; } -typedef int config_term_func_t(struct perf_event_attr *attr, - struct parse_events_term *term, - struct parse_events_error *err); - static int config_term_common(struct perf_event_attr *attr, struct parse_events_term *term, struct parse_events_error *err) -- cgit v0.10.2 From 665aa75700edda07bd7f05acab86cef1a1a1ea66 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 21 Feb 2016 23:22:35 +0900 Subject: perf tools: Fix segfault on dynamic entries A dynamic entry is created for each tracepoint event. When it sets up the sort key, it checks with existing keys using ->equal() callback. But it missed to set the ->equal for dynamic entries. The following segfault was due to the missing ->equal() callback. (gdb) bt #0 0x0000000000140003 in ?? () #1 0x0000000000537769 in fmt_equal (b=0x2106980, a=0x21067a0) at ui/hist.c:548 #2 perf_hpp__setup_output_field (list=0x8c6d80 ) at ui/hist.c:560 #3 0x00000000004e927e in setup_sorting (evlist=) at util/sort.c:2642 #4 0x000000000043cf50 in cmd_report (argc=, argv=, prefix=) at builtin-report.c:932 #5 0x00000000004865a1 in run_builtin (p=p@entry=0x8bbce0 , argc=argc@entry=7, argv=argv@entry=0x7ffd24d56ce0) at perf.c:390 #6 0x000000000042dc1f in handle_internal_command (argv=0x7ffd24d56ce0, argc=7) at perf.c:451 #7 run_argv (argv=0x7ffd24d56a70, argcp=0x7ffd24d56a7c) at perf.c:495 #8 main (argc=7, argv=0x7ffd24d56ce0) at perf.c:620 Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1456064558-13086-2-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index de71575..7daea71 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1835,6 +1835,20 @@ bool perf_hpp__is_dynamic_entry(struct perf_hpp_fmt *fmt) return fmt->cmp == __sort__hde_cmp; } +static bool __sort__hde_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b) +{ + struct hpp_dynamic_entry *hde_a; + struct hpp_dynamic_entry *hde_b; + + if (!perf_hpp__is_dynamic_entry(a) || !perf_hpp__is_dynamic_entry(b)) + return false; + + hde_a = container_of(a, struct hpp_dynamic_entry, hpp); + hde_b = container_of(b, struct hpp_dynamic_entry, hpp); + + return hde_a->field == hde_b->field; +} + static void hde_free(struct perf_hpp_fmt *fmt) { struct hpp_dynamic_entry *hde; @@ -1867,6 +1881,7 @@ __alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field) hde->hpp.cmp = __sort__hde_cmp; hde->hpp.collapse = __sort__hde_cmp; hde->hpp.sort = __sort__hde_cmp; + hde->hpp.equal = __sort__hde_equal; hde->hpp.free = hde_free; INIT_LIST_HEAD(&hde->hpp.list); -- cgit v0.10.2 From cecaec635de3719ef56a9261c10cd8f2f74ebdb1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 22 Feb 2016 09:31:51 +0900 Subject: perf tools: Update srcline/file if needed Normally the hist entry's srcline and/or srcfile is set during sorting. However sometime it's possible to a hist entry's srcline is not set yet after the sorting. 
This is because the entry is so unique and other sort keys already make it distinct. Then the srcline/file sort didn't have a chance to be called during the sorting. In that case it has NULL srcline/srcfile field and shows nothing. Before: $ perf report -s comm,sym,srcline ... Overhead Command Symbol ----------------------------------------------------------------- 34.42% swapper [k] intel_idle intel_idle.c:0 2.44% perf [.] __poll_nocancel (null) 1.70% gnome-shell [k] fw_domains_get (null) 1.04% Xorg [k] sock_poll (null) After: 34.42% swapper [k] intel_idle intel_idle.c:0 2.44% perf [.] __poll_nocancel .:0 1.70% gnome-shell [k] fw_domains_get fw_domains_get+42 1.04% Xorg [k] sock_poll socket.c:0 Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1456101111-14400-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 7daea71..6f4605b 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -286,35 +286,34 @@ struct sort_entry sort_sym = { /* --sort srcline */ +static char *hist_entry__get_srcline(struct hist_entry *he) +{ + struct map *map = he->ms.map; + + if (!map) + return SRCLINE_UNKNOWN; + + return get_srcline(map->dso, map__rip_2objdump(map, he->ip), + he->ms.sym, true); +} + static int64_t sort__srcline_cmp(struct hist_entry *left, struct hist_entry *right) { - if (!left->srcline) { - if (!left->ms.map) - left->srcline = SRCLINE_UNKNOWN; - else { - struct map *map = left->ms.map; - left->srcline = get_srcline(map->dso, - map__rip_2objdump(map, left->ip), - left->ms.sym, true); - } - } - if (!right->srcline) { - if (!right->ms.map) - right->srcline = SRCLINE_UNKNOWN; - else { - struct map *map = right->ms.map; - right->srcline = get_srcline(map->dso, - map__rip_2objdump(map, right->ip), - right->ms.sym, true); - } - } + if (!left->srcline) + left->srcline = hist_entry__get_srcline(left); + if (!right->srcline) + right->srcline = hist_entry__get_srcline(right); + return strcmp(right->srcline, left->srcline); } static int hist_entry__srcline_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width) { + if (!he->srcline) + he->srcline = hist_entry__get_srcline(he); + return repsep_snprintf(bf, size, "%-*.*s", width, width, he->srcline); } @@ -329,11 +328,14 @@ struct sort_entry sort_srcline = { static char no_srcfile[1]; -static char *get_srcfile(struct hist_entry *e) +static char *hist_entry__get_srcfile(struct hist_entry *e) { char *sf, *p; struct map *map = e->ms.map; + if (!map) + return no_srcfile; + sf = __get_srcline(map->dso, map__rip_2objdump(map, e->ip), e->ms.sym, false, true); if (!strcmp(sf, SRCLINE_UNKNOWN)) @@ -350,24 +352,20 @@ static char *get_srcfile(struct hist_entry *e) static int64_t sort__srcfile_cmp(struct hist_entry *left, struct hist_entry *right) { - if (!left->srcfile) { - if (!left->ms.map) - left->srcfile = no_srcfile; - else - left->srcfile = get_srcfile(left); - } - if (!right->srcfile) { - if (!right->ms.map) - right->srcfile = no_srcfile; - else - right->srcfile = get_srcfile(right); - } + if (!left->srcfile) + left->srcfile = hist_entry__get_srcfile(left); + if (!right->srcfile) + right->srcfile = hist_entry__get_srcfile(right); + return strcmp(right->srcfile, left->srcfile); } static int hist_entry__srcfile_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width) { + if (!he->srcfile) + he->srcfile = 
hist_entry__get_srcfile(he); + return repsep_snprintf(bf, size, "%-*.*s", width, width, he->srcfile); } -- cgit v0.10.2 From 2960ed6f8d6794dcb39ba48c3e515e5be18ee9e1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 22 Feb 2016 09:32:33 +0900 Subject: perf tools: Fix alignment on some sort keys The srcline, srcfile and trace sort keys can have long entries. With commit 89fee7094323 ("perf hists: Do column alignment on the format iterator"), it now aligns output with hist_entry__snprintf_alignment(). So each (possibly long) sort entries don't need to do it themselves. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1456101153-14519-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 6f4605b..a7d73e5 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -314,7 +314,7 @@ static int hist_entry__srcline_snprintf(struct hist_entry *he, char *bf, if (!he->srcline) he->srcline = hist_entry__get_srcline(he); - return repsep_snprintf(bf, size, "%-*.*s", width, width, he->srcline); + return repsep_snprintf(bf, size, "%-.*s", width, he->srcline); } struct sort_entry sort_srcline = { @@ -366,7 +366,7 @@ static int hist_entry__srcfile_snprintf(struct hist_entry *he, char *bf, if (!he->srcfile) he->srcfile = hist_entry__get_srcfile(he); - return repsep_snprintf(bf, size, "%-*.*s", width, width, he->srcfile); + return repsep_snprintf(bf, size, "%-.*s", width, he->srcfile); } struct sort_entry sort_srcfile = { @@ -496,11 +496,11 @@ static int hist_entry__trace_snprintf(struct hist_entry *he, char *bf, evsel = hists_to_evsel(he->hists); if (evsel->attr.type != PERF_TYPE_TRACEPOINT) - return scnprintf(bf, size, "%-*.*s", width, width, "N/A"); + return scnprintf(bf, size, "%-.*s", width, "N/A"); if (he->trace_output == NULL) he->trace_output = get_trace_output(he); - return repsep_snprintf(bf, size, "%-*.*s", width, width, he->trace_output); + return repsep_snprintf(bf, size, "%-.*s", width, he->trace_output); } struct sort_entry sort_trace = { -- cgit v0.10.2 From 0c0af78d472f96efe04daaaccede7522b2394b76 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 21 Feb 2016 23:22:38 +0900 Subject: perf tools: Fix column width setting on 'trace' sort key It missed to update column length of the 'trace' sort key in the hists__calc_col_len() so it might truncate the output. It calculated the column length in the ->cmp() callback originally but it doesn't guarantee it's called always. 
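A standalone illustration of why relying on ->cmp() is fragile (plain libc code, not perf internals): a sort routine is free to skip the comparator entirely when there is nothing to compare, so bookkeeping hidden inside it, here tracking the widest string, may simply never run. Moving the width update into hists__calc_col_len(), which is called for every entry, avoids that.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static size_t max_width;	/* column width tracked as a comparator side effect */

static int cmp(const void *a, const void *b)
{
	const char *sa = *(const char * const *)a;
	const char *sb = *(const char * const *)b;
	size_t la = strlen(sa), lb = strlen(sb);

	/* Side effect hidden in the comparator -- the pattern the patch removes. */
	if (la > max_width)
		max_width = la;
	if (lb > max_width)
		max_width = lb;
	return strcmp(sa, sb);
}

int main(void)
{
	const char *rows[] = { "a-single-row-needs-no-comparisons" };

	qsort(rows, 1, sizeof(rows[0]), cmp);
	/* Stays 0 here: with a single element there is nothing to compare,
	 * so cmp() is never invoked and the width is never updated. */
	printf("max_width = %zu\n", max_width);
	return 0;
}
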
Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1456064558-13086-5-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 827c6cb..017eb5c 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -179,6 +179,9 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) if (h->transaction) hists__new_col_len(hists, HISTC_TRANSACTION, hist_entry__transaction_len()); + + if (h->trace_output) + hists__new_col_len(hists, HISTC_TRACE, strlen(h->trace_output)); } void hists__output_recalc_col_len(struct hists *hists, int max_rows) diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index a7d73e5..6d0f858 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -483,9 +483,6 @@ sort__trace_cmp(struct hist_entry *left, struct hist_entry *right) if (right->trace_output == NULL) right->trace_output = get_trace_output(right); - hists__new_col_len(left->hists, HISTC_TRACE, strlen(left->trace_output)); - hists__new_col_len(right->hists, HISTC_TRACE, strlen(right->trace_output)); - return strcmp(right->trace_output, left->trace_output); } -- cgit v0.10.2 From dd42baf1f64d7257258fa4f20064aee5160df369 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 21 Feb 2016 23:22:34 +0900 Subject: perf tools: Fix assertion failure on dynamic entry The dynamic entry is created for each field in a tracepoint event. Since they have no fixed hpp format index, it should skip when perf_hpp__reset_width() is called. This caused following assertion failure.. $ perf record -e sched:sched_switch -a sleep 1 $ perf report -s comm,next_pid --stdio perf: ui/hist.c:651: perf_hpp__reset_width: Assertion `!(fmt->idx >= PERF_HPP__MAX_INDEX)' failed. Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1456064558-13086-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 1ba4117..12223d7 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -645,6 +645,9 @@ void perf_hpp__reset_width(struct perf_hpp_fmt *fmt, struct hists *hists) if (perf_hpp__is_sort_entry(fmt)) return perf_hpp__reset_sort_width(fmt, hists); + if (perf_hpp__is_dynamic_entry(fmt)) + return; + BUG_ON(fmt->idx >= PERF_HPP__MAX_INDEX); switch (fmt->idx) { -- cgit v0.10.2 From 066dacbf2a32defb4de23ea4c1af9e77578b5ac2 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 22 Feb 2016 09:10:30 +0000 Subject: perf bpf: Add API to set values to map entries in a bpf object bpf__config_obj() is introduced as a core API to config BPF object after loading. One configuration option of maps is introduced. After this patch BPF object can accept assignments like: map:my_map.value=1234 (map.my_map.value looks pretty. However, there's a small but hard to fix problem related to flex's greedy matching. Please see [1]. Choose ':' to avoid it in a simpler way.) This patch is more complex than the work it does because the consideration of extension. In designing BPF map configuration, the following things should be considered: 1. Array indices selection: perf should allow user setting different value for different slots in an array, with syntax like: map:my_map.value[0,3...6]=1234; 2. A map should be set by different config terms, each for a part of it. For example, set each slot to the pid of a thread; 3. 
Type of value: integer is not the only valid value type. A perf counter can also be put into a map after commit 35578d798400 ("bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU counter") 4. For a hash table, it should be possible to use a string or other value as a key; 5. It is possible that map configuration is unable to be setup during parsing. A perf counter is an example. Therefore, this patch does the following: 1. Instead of updating map element during parsing, this patch stores map config options in 'struct bpf_map_priv'. Following patches will apply those configs at an appropriate time; 2. Link map operations in a list so a map can have multiple config terms attached, so different parts can be configured separately; 3. Make 'struct bpf_map_priv' extensible so that the following patches can add new types of keys and operations; 4. Use bpf_obj_config__map_funcs array to support more map config options. Since the patch changing the event parser to parse BPF object config is relative large, I've put it in another commit. Code in this patch can be tested after applying the next patch. [1] http://lkml.kernel.org/g/564ED621.4050500@huawei.com Signed-off-by: Wang Nan Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Cody P Schafer Cc: He Kuang Cc: Jeremie Galarneau Cc: Jiri Olsa Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456132275-98875-4-git-send-email-wangnan0@huawei.com Signed-off-by: He Kuang [ Changes "maps:my_map.value" to "map:my_map.value", improved error messages ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 0bdccf4..caeef9e 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -739,6 +739,261 @@ int bpf__foreach_tev(struct bpf_object *obj, return 0; } +enum bpf_map_op_type { + BPF_MAP_OP_SET_VALUE, +}; + +enum bpf_map_key_type { + BPF_MAP_KEY_ALL, +}; + +struct bpf_map_op { + struct list_head list; + enum bpf_map_op_type op_type; + enum bpf_map_key_type key_type; + union { + u64 value; + } v; +}; + +struct bpf_map_priv { + struct list_head ops_list; +}; + +static void +bpf_map_op__delete(struct bpf_map_op *op) +{ + if (!list_empty(&op->list)) + list_del(&op->list); + free(op); +} + +static void +bpf_map_priv__purge(struct bpf_map_priv *priv) +{ + struct bpf_map_op *pos, *n; + + list_for_each_entry_safe(pos, n, &priv->ops_list, list) { + list_del_init(&pos->list); + bpf_map_op__delete(pos); + } +} + +static void +bpf_map_priv__clear(struct bpf_map *map __maybe_unused, + void *_priv) +{ + struct bpf_map_priv *priv = _priv; + + bpf_map_priv__purge(priv); + free(priv); +} + +static struct bpf_map_op * +bpf_map_op__new(void) +{ + struct bpf_map_op *op; + + op = zalloc(sizeof(*op)); + if (!op) { + pr_debug("Failed to alloc bpf_map_op\n"); + return ERR_PTR(-ENOMEM); + } + INIT_LIST_HEAD(&op->list); + + op->key_type = BPF_MAP_KEY_ALL; + return op; +} + +static int +bpf_map__add_op(struct bpf_map *map, struct bpf_map_op *op) +{ + struct bpf_map_priv *priv; + const char *map_name; + int err; + + map_name = bpf_map__get_name(map); + err = bpf_map__get_private(map, (void **)&priv); + if (err) { + pr_debug("Failed to get private from map %s\n", map_name); + return err; + } + + if (!priv) { + priv = zalloc(sizeof(*priv)); + if (!priv) { + pr_debug("No enough memory to alloc map private\n"); + return -ENOMEM; + } + 
INIT_LIST_HEAD(&priv->ops_list); + + if (bpf_map__set_private(map, priv, bpf_map_priv__clear)) { + free(priv); + return -BPF_LOADER_ERRNO__INTERNAL; + } + } + + list_add_tail(&op->list, &priv->ops_list); + return 0; +} + +static int +__bpf_map__config_value(struct bpf_map *map, + struct parse_events_term *term) +{ + struct bpf_map_def def; + struct bpf_map_op *op; + const char *map_name; + int err; + + map_name = bpf_map__get_name(map); + + err = bpf_map__get_def(map, &def); + if (err) { + pr_debug("Unable to get map definition from '%s'\n", + map_name); + return -BPF_LOADER_ERRNO__INTERNAL; + } + + if (def.type != BPF_MAP_TYPE_ARRAY) { + pr_debug("Map %s type is not BPF_MAP_TYPE_ARRAY\n", + map_name); + return -BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE; + } + if (def.key_size < sizeof(unsigned int)) { + pr_debug("Map %s has incorrect key size\n", map_name); + return -BPF_LOADER_ERRNO__OBJCONF_MAP_KEYSIZE; + } + switch (def.value_size) { + case 1: + case 2: + case 4: + case 8: + break; + default: + pr_debug("Map %s has incorrect value size\n", map_name); + return -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUESIZE; + } + + op = bpf_map_op__new(); + if (IS_ERR(op)) + return PTR_ERR(op); + op->op_type = BPF_MAP_OP_SET_VALUE; + op->v.value = term->val.num; + + err = bpf_map__add_op(map, op); + if (err) + bpf_map_op__delete(op); + return err; +} + +static int +bpf_map__config_value(struct bpf_map *map, + struct parse_events_term *term, + struct perf_evlist *evlist __maybe_unused) +{ + if (!term->err_val) { + pr_debug("Config value not set\n"); + return -BPF_LOADER_ERRNO__OBJCONF_CONF; + } + + if (term->type_val != PARSE_EVENTS__TERM_TYPE_NUM) { + pr_debug("ERROR: wrong value type\n"); + return -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE; + } + + return __bpf_map__config_value(map, term); +} + +struct bpf_obj_config__map_func { + const char *config_opt; + int (*config_func)(struct bpf_map *, struct parse_events_term *, + struct perf_evlist *); +}; + +struct bpf_obj_config__map_func bpf_obj_config__map_funcs[] = { + {"value", bpf_map__config_value}, +}; + +static int +bpf__obj_config_map(struct bpf_object *obj, + struct parse_events_term *term, + struct perf_evlist *evlist, + int *key_scan_pos) +{ + /* key is "map:." 
*/ + char *map_name = strdup(term->config + sizeof("map:") - 1); + struct bpf_map *map; + int err = -BPF_LOADER_ERRNO__OBJCONF_OPT; + char *map_opt; + size_t i; + + if (!map_name) + return -ENOMEM; + + map_opt = strchr(map_name, '.'); + if (!map_opt) { + pr_debug("ERROR: Invalid map config: %s\n", map_name); + goto out; + } + + *map_opt++ = '\0'; + if (*map_opt == '\0') { + pr_debug("ERROR: Invalid map option: %s\n", term->config); + goto out; + } + + map = bpf_object__get_map_by_name(obj, map_name); + if (!map) { + pr_debug("ERROR: Map %s doesn't exist\n", map_name); + err = -BPF_LOADER_ERRNO__OBJCONF_MAP_NOTEXIST; + goto out; + } + + *key_scan_pos += map_opt - map_name; + for (i = 0; i < ARRAY_SIZE(bpf_obj_config__map_funcs); i++) { + struct bpf_obj_config__map_func *func = + &bpf_obj_config__map_funcs[i]; + + if (strcmp(map_opt, func->config_opt) == 0) { + err = func->config_func(map, term, evlist); + goto out; + } + } + + pr_debug("ERROR: Invalid map config option '%s'\n", map_opt); + err = -BPF_LOADER_ERRNO__OBJCONF_MAP_OPT; +out: + free(map_name); + if (!err) + key_scan_pos += strlen(map_opt); + return err; +} + +int bpf__config_obj(struct bpf_object *obj, + struct parse_events_term *term, + struct perf_evlist *evlist, + int *error_pos) +{ + int key_scan_pos = 0; + int err; + + if (!obj || !term || !term->config) + return -EINVAL; + + if (!prefixcmp(term->config, "map:")) { + key_scan_pos = sizeof("map:") - 1; + err = bpf__obj_config_map(obj, term, evlist, &key_scan_pos); + goto out; + } + err = -BPF_LOADER_ERRNO__OBJCONF_OPT; +out: + if (error_pos) + *error_pos = key_scan_pos; + return err; + +} + #define ERRNO_OFFSET(e) ((e) - __BPF_LOADER_ERRNO__START) #define ERRCODE_OFFSET(c) ERRNO_OFFSET(BPF_LOADER_ERRNO__##c) #define NR_ERRNO (__BPF_LOADER_ERRNO__END - __BPF_LOADER_ERRNO__START) @@ -753,6 +1008,14 @@ static const char *bpf_loader_strerror_table[NR_ERRNO] = { [ERRCODE_OFFSET(PROLOGUE)] = "Failed to generate prologue", [ERRCODE_OFFSET(PROLOGUE2BIG)] = "Prologue too big for program", [ERRCODE_OFFSET(PROLOGUEOOB)] = "Offset out of bound for prologue", + [ERRCODE_OFFSET(OBJCONF_OPT)] = "Invalid object config option", + [ERRCODE_OFFSET(OBJCONF_CONF)] = "Config value not set (missing '=')", + [ERRCODE_OFFSET(OBJCONF_MAP_OPT)] = "Invalid object map config option", + [ERRCODE_OFFSET(OBJCONF_MAP_NOTEXIST)] = "Target map doesn't exist", + [ERRCODE_OFFSET(OBJCONF_MAP_VALUE)] = "Incorrect value type for map", + [ERRCODE_OFFSET(OBJCONF_MAP_TYPE)] = "Incorrect map type", + [ERRCODE_OFFSET(OBJCONF_MAP_KEYSIZE)] = "Incorrect map key size", + [ERRCODE_OFFSET(OBJCONF_MAP_VALUESIZE)] = "Incorrect map value size", }; static int @@ -872,3 +1135,16 @@ int bpf__strerror_load(struct bpf_object *obj, bpf__strerror_end(buf, size); return 0; } + +int bpf__strerror_config_obj(struct bpf_object *obj __maybe_unused, + struct parse_events_term *term __maybe_unused, + struct perf_evlist *evlist __maybe_unused, + int *error_pos __maybe_unused, int err, + char *buf, size_t size) +{ + bpf__strerror_head(err, buf, size); + bpf__strerror_entry(BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE, + "Can't use this config term with this map type"); + bpf__strerror_end(buf, size); + return 0; +} diff --git a/tools/perf/util/bpf-loader.h b/tools/perf/util/bpf-loader.h index 6fdc045..cc46a07 100644 --- a/tools/perf/util/bpf-loader.h +++ b/tools/perf/util/bpf-loader.h @@ -10,6 +10,7 @@ #include #include #include "probe-event.h" +#include "evlist.h" #include "debug.h" enum bpf_loader_errno { @@ -24,10 +25,19 @@ enum bpf_loader_errno { 
BPF_LOADER_ERRNO__PROLOGUE, /* Failed to generate prologue */ BPF_LOADER_ERRNO__PROLOGUE2BIG, /* Prologue too big for program */ BPF_LOADER_ERRNO__PROLOGUEOOB, /* Offset out of bound for prologue */ + BPF_LOADER_ERRNO__OBJCONF_OPT, /* Invalid object config option */ + BPF_LOADER_ERRNO__OBJCONF_CONF, /* Config value not set (lost '=')) */ + BPF_LOADER_ERRNO__OBJCONF_MAP_OPT, /* Invalid object map config option */ + BPF_LOADER_ERRNO__OBJCONF_MAP_NOTEXIST, /* Target map not exist */ + BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE, /* Incorrect value type for map */ + BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE, /* Incorrect map type */ + BPF_LOADER_ERRNO__OBJCONF_MAP_KEYSIZE, /* Incorrect map key size */ + BPF_LOADER_ERRNO__OBJCONF_MAP_VALUESIZE,/* Incorrect map value size */ __BPF_LOADER_ERRNO__END, }; struct bpf_object; +struct parse_events_term; #define PERF_BPF_PROBE_GROUP "perf_bpf_probe" typedef int (*bpf_prog_iter_callback_t)(struct probe_trace_event *tev, @@ -53,6 +63,14 @@ int bpf__strerror_load(struct bpf_object *obj, int err, char *buf, size_t size); int bpf__foreach_tev(struct bpf_object *obj, bpf_prog_iter_callback_t func, void *arg); + +int bpf__config_obj(struct bpf_object *obj, struct parse_events_term *term, + struct perf_evlist *evlist, int *error_pos); +int bpf__strerror_config_obj(struct bpf_object *obj, + struct parse_events_term *term, + struct perf_evlist *evlist, + int *error_pos, int err, char *buf, + size_t size); #else static inline struct bpf_object * bpf__prepare_load(const char *filename __maybe_unused, @@ -84,6 +102,15 @@ bpf__foreach_tev(struct bpf_object *obj __maybe_unused, } static inline int +bpf__config_obj(struct bpf_object *obj __maybe_unused, + struct parse_events_term *term __maybe_unused, + struct perf_evlist *evlist __maybe_unused, + int *error_pos __maybe_unused) +{ + return 0; +} + +static inline int __bpf_strerror(char *buf, size_t size) { if (!size) @@ -118,5 +145,16 @@ static inline int bpf__strerror_load(struct bpf_object *obj __maybe_unused, { return __bpf_strerror(buf, size); } + +static inline int +bpf__strerror_config_obj(struct bpf_object *obj __maybe_unused, + struct parse_events_term *term __maybe_unused, + struct perf_evlist *evlist __maybe_unused, + int *error_pos __maybe_unused, + int err __maybe_unused, + char *buf, size_t size) +{ + return __bpf_strerror(buf, size); +} #endif #endif -- cgit v0.10.2 From a34f3be70cdf986850552e62b9f22d659bfbcef3 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 22 Feb 2016 09:10:31 +0000 Subject: perf tools: Enable BPF object configure syntax This patch adds the final step for BPF map configuration. A new syntax is appended into parser so user can config BPF objects through '/' '/' enclosed config terms. After this patch, following syntax is available: # perf record -e ./test_bpf_map_1.c/map:channel.value=10/ ... It would takes effect after appling following commits. Test result: # cat ./test_bpf_map_1.c /************************ BEGIN **************************/ #include #define SEC(NAME) __attribute__((section(NAME), used)) struct bpf_map_def { unsigned int type; unsigned int key_size; unsigned int value_size; unsigned int max_entries; }; static void *(*map_lookup_elem)(struct bpf_map_def *, void *) = (void *)BPF_FUNC_map_lookup_elem; static int (*trace_printk)(const char *fmt, int fmt_size, ...) 
= (void *)BPF_FUNC_trace_printk; struct bpf_map_def SEC("maps") channel = { .type = BPF_MAP_TYPE_ARRAY, .key_size = sizeof(int), .value_size = sizeof(int), .max_entries = 1, }; SEC("func=sys_nanosleep") int func(void *ctx) { int key = 0; char fmt[] = "%d\n"; int *pval = map_lookup_elem(&channel, &key); if (!pval) return 0; trace_printk(fmt, sizeof(fmt), *pval); return 0; } char _license[] SEC("license") = "GPL"; int _version SEC("version") = LINUX_VERSION_CODE; /************************* END ***************************/ - Normal case: # ./perf record -e './test_bpf_map_1.c/map:channel.value=10/' usleep 10 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.012 MB perf.data ] - Error case: # ./perf record -e './test_bpf_map_1.c/map:channel.value/' usleep 10 event syntax error: '..ps:channel:value/' \___ Config value not set (missing '=') Hint: Valid config term: map:[]:value=[value] (add -v to see detail) Run 'perf list' for a list of valid events Usage: perf record [] [] or: perf record [] -- [] -e, --event event selector. use 'perf list' to list available events # ./perf record -e './test_bpf_map_1.c/xmap:channel.value=10/' usleep 10 event syntax error: '..pf_map_1.c/xmap:channel.value=10/' \___ Invalid object config option [SNIP] # ./perf record -e './test_bpf_map_1.c/map:xchannel.value=10/' usleep 10 event syntax error: '..p_1.c/map:xchannel.value=10/' \___ Target map not exist [SNIP] # ./perf record -e './test_bpf_map_1.c/map:channel.xvalue=10/' usleep 10 event syntax error: '..ps:channel.xvalue=10/' \___ Invalid object map config option [SNIP] # ./perf record -e './test_bpf_map_1.c/map:channel.value=x10/' usleep 10 event syntax error: '..nnel.value=x10/' \___ Incorrect value type for map [SNIP] Change BPF_MAP_TYPE_ARRAY to '1' in test_bpf_map_1.c: # ./perf record -e './test_bpf_map_1.c/map:channel.value=10/' usleep 10 event syntax error: '..ps:channel.value=10/' \___ Can't use this config term to this type of map Hint: Valid config term: map:[].value=[value] (add -v to see detail) Signed-off-by: Wang Nan [for parser part] Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Cody P Schafer Cc: He Kuang Cc: Jeremie Galarneau Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456132275-98875-5-git-send-email-wangnan0@huawei.com Signed-off-by: He Kuang Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index b0b3295..a5dd670 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -672,17 +672,63 @@ errout: return err; } +static int +parse_events_config_bpf(struct parse_events_evlist *data, + struct bpf_object *obj, + struct list_head *head_config) +{ + struct parse_events_term *term; + int error_pos; + + if (!head_config || list_empty(head_config)) + return 0; + + list_for_each_entry(term, head_config, list) { + char errbuf[BUFSIZ]; + int err; + + if (term->type_term != PARSE_EVENTS__TERM_TYPE_USER) { + snprintf(errbuf, sizeof(errbuf), + "Invalid config term for BPF object"); + errbuf[BUFSIZ - 1] = '\0'; + + data->error->idx = term->err_term; + data->error->str = strdup(errbuf); + return -EINVAL; + } + + err = bpf__config_obj(obj, term, NULL, &error_pos); + if (err) { + bpf__strerror_config_obj(obj, term, NULL, + &error_pos, err, errbuf, + sizeof(errbuf)); + data->error->help = strdup( 
+"Hint:\tValid config term:\n" +" \tmap:[].value=[value]\n" +" \t(add -v to see detail)"); + data->error->str = strdup(errbuf); + if (err == -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE) + data->error->idx = term->err_val; + else + data->error->idx = term->err_term + error_pos; + return err; + } + } + return 0; +} + int parse_events_load_bpf(struct parse_events_evlist *data, struct list_head *list, char *bpf_file_name, - bool source) + bool source, + struct list_head *head_config) { struct bpf_object *obj; + int err; obj = bpf__prepare_load(bpf_file_name, source); if (IS_ERR(obj)) { char errbuf[BUFSIZ]; - int err; err = PTR_ERR(obj); @@ -700,7 +746,10 @@ int parse_events_load_bpf(struct parse_events_evlist *data, return err; } - return parse_events_load_bpf_obj(data, list, obj); + err = parse_events_load_bpf_obj(data, list, obj); + if (err) + return err; + return parse_events_config_bpf(data, obj, head_config); } static int diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index d5eb2af..c48377a 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -129,7 +129,8 @@ int parse_events_add_tracepoint(struct list_head *list, int *idx, int parse_events_load_bpf(struct parse_events_evlist *data, struct list_head *list, char *bpf_file_name, - bool source); + bool source, + struct list_head *head_config); /* Provide this function for perf test */ struct bpf_object; int parse_events_load_bpf_obj(struct parse_events_evlist *data, diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 99486e6..0cc6b84 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -122,7 +122,7 @@ num_dec [0-9]+ num_hex 0x[a-fA-F0-9]+ num_raw_hex [a-fA-F0-9]+ name [a-zA-Z_*?][a-zA-Z0-9_*?.]* -name_minus [a-zA-Z_*?][a-zA-Z0-9\-_*?.]* +name_minus [a-zA-Z_*?][a-zA-Z0-9\-_*?.:]* /* If you add a modifier you need to update check_modifier() */ modifier_event [ukhpPGHSDI]+ modifier_bp [rwx]{1,3} diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 6a2d006..0e2d433 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -437,24 +437,26 @@ PE_RAW opt_event_config } event_bpf_file: -PE_BPF_OBJECT +PE_BPF_OBJECT opt_event_config { struct parse_events_evlist *data = _data; struct parse_events_error *error = data->error; struct list_head *list; ALLOC_LIST(list); - ABORT_ON(parse_events_load_bpf(data, list, $1, false)); + ABORT_ON(parse_events_load_bpf(data, list, $1, false, $2)); + parse_events_terms__delete($2); $$ = list; } | -PE_BPF_SOURCE +PE_BPF_SOURCE opt_event_config { struct parse_events_evlist *data = _data; struct list_head *list; ALLOC_LIST(list); - ABORT_ON(parse_events_load_bpf(data, list, $1, true)); + ABORT_ON(parse_events_load_bpf(data, list, $1, true, $2)); + parse_events_terms__delete($2); $$ = list; } -- cgit v0.10.2 From 8690a2a773703e4ad2a07a7f3912ea6b131307cc Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 22 Feb 2016 09:10:32 +0000 Subject: perf record: Apply config to BPF objects before recording bpf__apply_obj_config() is introduced as the core API to apply object config options to all BPF objects. This patch also does the real work for setting values for BPF_MAP_TYPE_PERF_ARRAY maps by inserting value stored in map's private field into the BPF map. This patch is required because we are not always able to set all BPF config during parsing. 
Further patch will set events created by perf to BPF_MAP_TYPE_PERF_EVENT_ARRAY maps, which is not exist until perf_evsel__open(). bpf_map_foreach_key() is introduced to iterate over each key needs to be configured. This function would be extended to support more map types and different key settings. In perf record, before start recording, call bpf__apply_config() to turn on all BPF config options. Test result: # cat ./test_bpf_map_1.c /************************ BEGIN **************************/ #include #define SEC(NAME) __attribute__((section(NAME), used)) struct bpf_map_def { unsigned int type; unsigned int key_size; unsigned int value_size; unsigned int max_entries; }; static void *(*map_lookup_elem)(struct bpf_map_def *, void *) = (void *)BPF_FUNC_map_lookup_elem; static int (*trace_printk)(const char *fmt, int fmt_size, ...) = (void *)BPF_FUNC_trace_printk; struct bpf_map_def SEC("maps") channel = { .type = BPF_MAP_TYPE_ARRAY, .key_size = sizeof(int), .value_size = sizeof(int), .max_entries = 1, }; SEC("func=sys_nanosleep") int func(void *ctx) { int key = 0; char fmt[] = "%d\n"; int *pval = map_lookup_elem(&channel, &key); if (!pval) return 0; trace_printk(fmt, sizeof(fmt), *pval); return 0; } char _license[] SEC("license") = "GPL"; int _version SEC("version") = LINUX_VERSION_CODE; /************************* END ***************************/ # echo "" > /sys/kernel/debug/tracing/trace # ./perf record -e './test_bpf_map_1.c/map:channel.value=11/' usleep 10 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.012 MB perf.data ] # cat /sys/kernel/debug/tracing/trace # tracer: nop # # entries-in-buffer/entries-written: 1/1 #P:8 [SNIP] # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | usleep-18593 [007] d... 2394714.395539: : 11 # ./perf record -e './test_bpf_map_1.c/map:channel.value=101/' usleep 10 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.012 MB perf.data ] # cat /sys/kernel/debug/tracing/trace # tracer: nop # # entries-in-buffer/entries-written: 1/1 #P:8 [SNIP] # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | usleep-18593 [007] d... 2394714.395539: : 11 usleep-19000 [006] d... 2394831.057840: : 101 Signed-off-by: Wang Nan Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Cody P Schafer Cc: He Kuang Cc: Jeremie Galarneau Cc: Jiri Olsa Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456132275-98875-6-git-send-email-wangnan0@huawei.com Signed-off-by: He Kuang Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index cf3a28d..7d11162 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -32,6 +32,7 @@ #include "util/parse-branch-options.h" #include "util/parse-regs-options.h" #include "util/llvm-utils.h" +#include "util/bpf-loader.h" #include #include @@ -536,6 +537,16 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) goto out_child; } + err = bpf__apply_obj_config(); + if (err) { + char errbuf[BUFSIZ]; + + bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf)); + pr_err("ERROR: Apply config to BPF failed: %s\n", + errbuf); + goto out_child; + } + /* * Normally perf_session__new would do this, but it doesn't have the * evlist. 
diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index caeef9e..dbbd17c 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include "perf.h" @@ -994,6 +995,182 @@ out: } +typedef int (*map_config_func_t)(const char *name, int map_fd, + struct bpf_map_def *pdef, + struct bpf_map_op *op, + void *pkey, void *arg); + +static int +foreach_key_array_all(map_config_func_t func, + void *arg, const char *name, + int map_fd, struct bpf_map_def *pdef, + struct bpf_map_op *op) +{ + unsigned int i; + int err; + + for (i = 0; i < pdef->max_entries; i++) { + err = func(name, map_fd, pdef, op, &i, arg); + if (err) { + pr_debug("ERROR: failed to insert value to %s[%u]\n", + name, i); + return err; + } + } + return 0; +} + +static int +bpf_map_config_foreach_key(struct bpf_map *map, + map_config_func_t func, + void *arg) +{ + int err, map_fd; + const char *name; + struct bpf_map_op *op; + struct bpf_map_def def; + struct bpf_map_priv *priv; + + name = bpf_map__get_name(map); + + err = bpf_map__get_private(map, (void **)&priv); + if (err) { + pr_debug("ERROR: failed to get private from map %s\n", name); + return -BPF_LOADER_ERRNO__INTERNAL; + } + if (!priv || list_empty(&priv->ops_list)) { + pr_debug("INFO: nothing to config for map %s\n", name); + return 0; + } + + err = bpf_map__get_def(map, &def); + if (err) { + pr_debug("ERROR: failed to get definition from map %s\n", name); + return -BPF_LOADER_ERRNO__INTERNAL; + } + map_fd = bpf_map__get_fd(map); + if (map_fd < 0) { + pr_debug("ERROR: failed to get fd from map %s\n", name); + return map_fd; + } + + list_for_each_entry(op, &priv->ops_list, list) { + switch (def.type) { + case BPF_MAP_TYPE_ARRAY: + switch (op->key_type) { + case BPF_MAP_KEY_ALL: + err = foreach_key_array_all(func, arg, name, + map_fd, &def, op); + if (err) + return err; + break; + default: + pr_debug("ERROR: keytype for map '%s' invalid\n", + name); + return -BPF_LOADER_ERRNO__INTERNAL; + } + break; + default: + pr_debug("ERROR: type of '%s' incorrect\n", name); + return -BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE; + } + } + + return 0; +} + +static int +apply_config_value_for_key(int map_fd, void *pkey, + size_t val_size, u64 val) +{ + int err = 0; + + switch (val_size) { + case 1: { + u8 _val = (u8)(val); + err = bpf_map_update_elem(map_fd, pkey, &_val, BPF_ANY); + break; + } + case 2: { + u16 _val = (u16)(val); + err = bpf_map_update_elem(map_fd, pkey, &_val, BPF_ANY); + break; + } + case 4: { + u32 _val = (u32)(val); + err = bpf_map_update_elem(map_fd, pkey, &_val, BPF_ANY); + break; + } + case 8: { + err = bpf_map_update_elem(map_fd, pkey, &val, BPF_ANY); + break; + } + default: + pr_debug("ERROR: invalid value size\n"); + return -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUESIZE; + } + if (err && errno) + err = -errno; + return err; +} + +static int +apply_obj_config_map_for_key(const char *name, int map_fd, + struct bpf_map_def *pdef __maybe_unused, + struct bpf_map_op *op, + void *pkey, void *arg __maybe_unused) +{ + int err; + + switch (op->op_type) { + case BPF_MAP_OP_SET_VALUE: + err = apply_config_value_for_key(map_fd, pkey, + pdef->value_size, + op->v.value); + break; + default: + pr_debug("ERROR: unknown value type for '%s'\n", name); + err = -BPF_LOADER_ERRNO__INTERNAL; + } + return err; +} + +static int +apply_obj_config_map(struct bpf_map *map) +{ + return bpf_map_config_foreach_key(map, + apply_obj_config_map_for_key, + NULL); +} + +static int +apply_obj_config_object(struct 
bpf_object *obj) +{ + struct bpf_map *map; + int err; + + bpf_map__for_each(map, obj) { + err = apply_obj_config_map(map); + if (err) + return err; + } + return 0; +} + +int bpf__apply_obj_config(void) +{ + struct bpf_object *obj, *tmp; + int err; + + bpf_object__for_each_safe(obj, tmp) { + err = apply_obj_config_object(obj); + if (err) + return err; + } + + return 0; +} + #define ERRNO_OFFSET(e) ((e) - __BPF_LOADER_ERRNO__START) #define ERRCODE_OFFSET(c) ERRNO_OFFSET(BPF_LOADER_ERRNO__##c) #define NR_ERRNO (__BPF_LOADER_ERRNO__END - __BPF_LOADER_ERRNO__START) @@ -1148,3 +1325,10 @@ int bpf__strerror_config_obj(struct bpf_object *obj __maybe_unused, bpf__strerror_end(buf, size); return 0; } + +int bpf__strerror_apply_obj_config(int err, char *buf, size_t size) +{ + bpf__strerror_head(err, buf, size); + bpf__strerror_end(buf, size); + return 0; +} diff --git a/tools/perf/util/bpf-loader.h b/tools/perf/util/bpf-loader.h index cc46a07..5d3b931 100644 --- a/tools/perf/util/bpf-loader.h +++ b/tools/perf/util/bpf-loader.h @@ -71,6 +71,8 @@ int bpf__strerror_config_obj(struct bpf_object *obj, struct perf_evlist *evlist, int *error_pos, int err, char *buf, size_t size); +int bpf__apply_obj_config(void); +int bpf__strerror_apply_obj_config(int err, char *buf, size_t size); #else static inline struct bpf_object * bpf__prepare_load(const char *filename __maybe_unused, @@ -111,6 +113,12 @@ bpf__config_obj(struct bpf_object *obj __maybe_unused, } static inline int +bpf__apply_obj_config(void) +{ + return 0; +} + +static inline int __bpf_strerror(char *buf, size_t size) { if (!size) @@ -156,5 +164,12 @@ bpf__strerror_config_obj(struct bpf_object *obj __maybe_unused, { return __bpf_strerror(buf, size); } + +static inline int +bpf__strerror_apply_obj_config(int err __maybe_unused, + char *buf, size_t size) +{ + return __bpf_strerror(buf, size); +} #endif #endif -- cgit v0.10.2 From 7630b3e28dd827fffad13cc0aada14b00ec524d9 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 22 Feb 2016 09:10:33 +0000 Subject: perf tools: Enable passing event to BPF object A new syntax is added to the parser so that the user can access predefined perf events in BPF objects. After this patch, BPF programs for perf are finally able to utilize bpf_perf_event_read() introduced in commit 35578d798400 ("bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU counter"). Test result: # cat test_bpf_map_2.c /************************ BEGIN **************************/ #include #define SEC(NAME) __attribute__((section(NAME), used)) struct bpf_map_def { unsigned int type; unsigned int key_size; unsigned int value_size; unsigned int max_entries; }; static int (*trace_printk)(const char *fmt, int fmt_size, ...) 
= (void *)BPF_FUNC_trace_printk; static int (*get_smp_processor_id)(void) = (void *)BPF_FUNC_get_smp_processor_id; static int (*perf_event_read)(struct bpf_map_def *, int) = (void *)BPF_FUNC_perf_event_read; struct bpf_map_def SEC("maps") pmu_map = { .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, .key_size = sizeof(int), .value_size = sizeof(int), .max_entries = __NR_CPUS__, }; SEC("func_write=sys_write") int func_write(void *ctx) { unsigned long long val; char fmt[] = "sys_write: pmu=%llu\n"; val = perf_event_read(&pmu_map, get_smp_processor_id()); trace_printk(fmt, sizeof(fmt), val); return 0; } SEC("func_write_return=sys_write%return") int func_write_return(void *ctx) { unsigned long long val = 0; char fmt[] = "sys_write_return: pmu=%llu\n"; val = perf_event_read(&pmu_map, get_smp_processor_id()); trace_printk(fmt, sizeof(fmt), val); return 0; } char _license[] SEC("license") = "GPL"; int _version SEC("version") = LINUX_VERSION_CODE; /************************* END ***************************/ Normal case: # echo "" > /sys/kernel/debug/tracing/trace # perf record -i -e cycles -e './test_bpf_map_2.c/map:pmu_map.event=cycles/' ls / [SNIP] [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.013 MB perf.data (7 samples) ] # cat /sys/kernel/debug/tracing/trace | grep ls ls-17066 [000] d... 938449.863301: : sys_write: pmu=1157327 ls-17066 [000] dN.. 938449.863342: : sys_write_return: pmu=1225218 ls-17066 [000] d... 938449.863349: : sys_write: pmu=1241922 ls-17066 [000] dN.. 938449.863369: : sys_write_return: pmu=1267445 Normal case (system wide): # echo "" > /sys/kernel/debug/tracing/trace # perf record -i -e cycles -e './test_bpf_map_2.c/map:pmu_map.event=cycles/' -a ^C[ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.811 MB perf.data (120 samples) ] # cat /sys/kernel/debug/tracing/trace | grep -v '18446744073709551594' | grep -v perf | head -n 20 [SNIP] # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | gmain-30828 [002] d... 2740551.068992: : sys_write: pmu=84373 gmain-30828 [002] d... 2740551.068992: : sys_write_return: pmu=87696 gmain-30828 [002] d... 2740551.068996: : sys_write: pmu=100658 gmain-30828 [002] d... 2740551.068997: : sys_write_return: pmu=102572 Error case 1: # perf record -e './test_bpf_map_2.c' ls / [SNIP] [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.014 MB perf.data ] # cat /sys/kernel/debug/tracing/trace | grep ls ls-17115 [007] d... 2724279.665625: : sys_write: pmu=18446744073709551614 ls-17115 [007] dN.. 2724279.665651: : sys_write_return: pmu=18446744073709551614 ls-17115 [007] d... 2724279.665658: : sys_write: pmu=18446744073709551614 ls-17115 [007] dN.. 
2724279.665677: : sys_write_return: pmu=18446744073709551614 (18446744073709551614 is 0xfffffffffffffffe (-2)) Error case 2: # perf record -e cycles -e './test_bpf_map_2.c/map:pmu_map.event=evt/' -a event syntax error: '..ps:pmu_map.event=evt/' \___ Event not found for map setting Hint: Valid config terms: map:[].value=[value] map:[].event=[event] [SNIP] Error case 3: # ls /proc/2348/task/ 2348 2505 2506 2507 2508 # perf record -i -e cycles -e './test_bpf_map_2.c/map:pmu_map.event=cycles/' -p 2348 ERROR: Apply config to BPF failed: Cannot set event to BPF map in multi-thread tracing Error case 4: # perf record -e cycles -e './test_bpf_map_2.c/map:pmu_map.event=cycles/' ls / ERROR: Apply config to BPF failed: Doesn't support inherit event (Hint: use -i to turn off inherit) Error case 5: # perf record -i -e raw_syscalls:sys_enter -e './test_bpf_map_2.c/map:pmu_map.event=raw_syscalls:sys_enter/' ls ERROR: Apply config to BPF failed: Can only put raw, hardware and BPF output event into a BPF map Error case 6: # perf record -i -e './test_bpf_map_2.c/map:pmu_map.event=123/' ls / event syntax error: '.._map.event=123/' \___ Incorrect value type for map [SNIP] Signed-off-by: Wang Nan Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Cody P Schafer Cc: He Kuang Cc: Jeremie Galarneau Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456132275-98875-7-git-send-email-wangnan0@huawei.com Signed-off-by: He Kuang Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index dbbd17c..deacb95 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -742,6 +742,7 @@ int bpf__foreach_tev(struct bpf_object *obj, enum bpf_map_op_type { BPF_MAP_OP_SET_VALUE, + BPF_MAP_OP_SET_EVSEL, }; enum bpf_map_key_type { @@ -754,6 +755,7 @@ struct bpf_map_op { enum bpf_map_key_type key_type; union { u64 value; + struct perf_evsel *evsel; } v; }; @@ -838,6 +840,24 @@ bpf_map__add_op(struct bpf_map *map, struct bpf_map_op *op) return 0; } +static struct bpf_map_op * +bpf_map__add_newop(struct bpf_map *map) +{ + struct bpf_map_op *op; + int err; + + op = bpf_map_op__new(); + if (IS_ERR(op)) + return op; + + err = bpf_map__add_op(map, op); + if (err) { + bpf_map_op__delete(op); + return ERR_PTR(err); + } + return op; +} + static int __bpf_map__config_value(struct bpf_map *map, struct parse_events_term *term) @@ -876,16 +896,12 @@ __bpf_map__config_value(struct bpf_map *map, return -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUESIZE; } - op = bpf_map_op__new(); + op = bpf_map__add_newop(map); if (IS_ERR(op)) return PTR_ERR(op); op->op_type = BPF_MAP_OP_SET_VALUE; op->v.value = term->val.num; - - err = bpf_map__add_op(map, op); - if (err) - bpf_map_op__delete(op); - return err; + return 0; } static int @@ -899,13 +915,75 @@ bpf_map__config_value(struct bpf_map *map, } if (term->type_val != PARSE_EVENTS__TERM_TYPE_NUM) { - pr_debug("ERROR: wrong value type\n"); + pr_debug("ERROR: wrong value type for 'value'\n"); return -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE; } return __bpf_map__config_value(map, term); } +static int +__bpf_map__config_event(struct bpf_map *map, + struct parse_events_term *term, + struct perf_evlist *evlist) +{ + struct perf_evsel *evsel; + struct bpf_map_def def; + struct bpf_map_op *op; + const char *map_name; + int err; + + map_name = bpf_map__get_name(map); + evsel = 
perf_evlist__find_evsel_by_str(evlist, term->val.str); + if (!evsel) { + pr_debug("Event (for '%s') '%s' doesn't exist\n", + map_name, term->val.str); + return -BPF_LOADER_ERRNO__OBJCONF_MAP_NOEVT; + } + + err = bpf_map__get_def(map, &def); + if (err) { + pr_debug("Unable to get map definition from '%s'\n", + map_name); + return err; + } + + /* + * No need to check key_size and value_size: + * kernel has already checked them. + */ + if (def.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) { + pr_debug("Map %s type is not BPF_MAP_TYPE_PERF_EVENT_ARRAY\n", + map_name); + return -BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE; + } + + op = bpf_map__add_newop(map); + if (IS_ERR(op)) + return PTR_ERR(op); + op->op_type = BPF_MAP_OP_SET_EVSEL; + op->v.evsel = evsel; + return 0; +} + +static int +bpf_map__config_event(struct bpf_map *map, + struct parse_events_term *term, + struct perf_evlist *evlist) +{ + if (!term->err_val) { + pr_debug("Config value not set\n"); + return -BPF_LOADER_ERRNO__OBJCONF_CONF; + } + + if (term->type_val != PARSE_EVENTS__TERM_TYPE_STR) { + pr_debug("ERROR: wrong value type for 'event'\n"); + return -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE; + } + + return __bpf_map__config_event(map, term, evlist); +} + struct bpf_obj_config__map_func { const char *config_opt; int (*config_func)(struct bpf_map *, struct parse_events_term *, @@ -914,6 +992,7 @@ struct bpf_obj_config__map_func { struct bpf_obj_config__map_func bpf_obj_config__map_funcs[] = { {"value", bpf_map__config_value}, + {"event", bpf_map__config_event}, }; static int @@ -1057,6 +1136,7 @@ bpf_map_config_foreach_key(struct bpf_map *map, list_for_each_entry(op, &priv->ops_list, list) { switch (def.type) { case BPF_MAP_TYPE_ARRAY: + case BPF_MAP_TYPE_PERF_EVENT_ARRAY: switch (op->key_type) { case BPF_MAP_KEY_ALL: err = foreach_key_array_all(func, arg, name, @@ -1115,6 +1195,60 @@ apply_config_value_for_key(int map_fd, void *pkey, } static int +apply_config_evsel_for_key(const char *name, int map_fd, void *pkey, + struct perf_evsel *evsel) +{ + struct xyarray *xy = evsel->fd; + struct perf_event_attr *attr; + unsigned int key, events; + bool check_pass = false; + int *evt_fd; + int err; + + if (!xy) { + pr_debug("ERROR: evsel not ready for map %s\n", name); + return -BPF_LOADER_ERRNO__INTERNAL; + } + + if (xy->row_size / xy->entry_size != 1) { + pr_debug("ERROR: Dimension of target event is incorrect for map %s\n", + name); + return -BPF_LOADER_ERRNO__OBJCONF_MAP_EVTDIM; + } + + attr = &evsel->attr; + if (attr->inherit) { + pr_debug("ERROR: Can't put inherit event into map %s\n", name); + return -BPF_LOADER_ERRNO__OBJCONF_MAP_EVTINH; + } + + if (attr->type == PERF_TYPE_RAW) + check_pass = true; + if (attr->type == PERF_TYPE_HARDWARE) + check_pass = true; + if (attr->type == PERF_TYPE_SOFTWARE && + attr->config == PERF_COUNT_SW_BPF_OUTPUT) + check_pass = true; + if (!check_pass) { + pr_debug("ERROR: Event type is wrong for map %s\n", name); + return -BPF_LOADER_ERRNO__OBJCONF_MAP_EVTTYPE; + } + + events = xy->entries / (xy->row_size / xy->entry_size); + key = *((unsigned int *)pkey); + if (key >= events) { + pr_debug("ERROR: there is no event %d for map %s\n", + key, name); + return -BPF_LOADER_ERRNO__OBJCONF_MAP_MAPSIZE; + } + evt_fd = xyarray__entry(xy, key, 0); + err = bpf_map_update_elem(map_fd, pkey, evt_fd, BPF_ANY); + if (err && errno) + err = -errno; + return err; +} + +static int apply_obj_config_map_for_key(const char *name, int map_fd, struct bpf_map_def *pdef __maybe_unused, struct bpf_map_op *op, @@ -1128,6 +1262,10 @@ 
apply_obj_config_map_for_key(const char *name, int map_fd, pdef->value_size, op->v.value); break; + case BPF_MAP_OP_SET_EVSEL: + err = apply_config_evsel_for_key(name, map_fd, pkey, + op->v.evsel); + break; default: pr_debug("ERROR: unknown value type for '%s'\n", name); err = -BPF_LOADER_ERRNO__INTERNAL; @@ -1193,6 +1331,11 @@ static const char *bpf_loader_strerror_table[NR_ERRNO] = { [ERRCODE_OFFSET(OBJCONF_MAP_TYPE)] = "Incorrect map type", [ERRCODE_OFFSET(OBJCONF_MAP_KEYSIZE)] = "Incorrect map key size", [ERRCODE_OFFSET(OBJCONF_MAP_VALUESIZE)] = "Incorrect map value size", + [ERRCODE_OFFSET(OBJCONF_MAP_NOEVT)] = "Event not found for map setting", + [ERRCODE_OFFSET(OBJCONF_MAP_MAPSIZE)] = "Invalid map size for event setting", + [ERRCODE_OFFSET(OBJCONF_MAP_EVTDIM)] = "Event dimension too large", + [ERRCODE_OFFSET(OBJCONF_MAP_EVTINH)] = "Doesn't support inherit event", + [ERRCODE_OFFSET(OBJCONF_MAP_EVTTYPE)] = "Wrong event type for map", }; static int @@ -1329,6 +1472,12 @@ int bpf__strerror_config_obj(struct bpf_object *obj __maybe_unused, int bpf__strerror_apply_obj_config(int err, char *buf, size_t size) { bpf__strerror_head(err, buf, size); + bpf__strerror_entry(BPF_LOADER_ERRNO__OBJCONF_MAP_EVTDIM, + "Cannot set event to BPF map in multi-thread tracing"); + bpf__strerror_entry(BPF_LOADER_ERRNO__OBJCONF_MAP_EVTINH, + "%s (Hint: use -i to turn off inherit)", emsg); + bpf__strerror_entry(BPF_LOADER_ERRNO__OBJCONF_MAP_EVTTYPE, + "Can only put raw, hardware and BPF output event into a BPF map"); bpf__strerror_end(buf, size); return 0; } diff --git a/tools/perf/util/bpf-loader.h b/tools/perf/util/bpf-loader.h index 5d3b931..7c7689f 100644 --- a/tools/perf/util/bpf-loader.h +++ b/tools/perf/util/bpf-loader.h @@ -33,6 +33,11 @@ enum bpf_loader_errno { BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE, /* Incorrect map type */ BPF_LOADER_ERRNO__OBJCONF_MAP_KEYSIZE, /* Incorrect map key size */ BPF_LOADER_ERRNO__OBJCONF_MAP_VALUESIZE,/* Incorrect map value size */ + BPF_LOADER_ERRNO__OBJCONF_MAP_NOEVT, /* Event not found for map setting */ + BPF_LOADER_ERRNO__OBJCONF_MAP_MAPSIZE, /* Invalid map size for event setting */ + BPF_LOADER_ERRNO__OBJCONF_MAP_EVTDIM, /* Event dimension too large */ + BPF_LOADER_ERRNO__OBJCONF_MAP_EVTINH, /* Doesn't support inherit event */ + BPF_LOADER_ERRNO__OBJCONF_MAP_EVTTYPE, /* Wrong event type for map */ __BPF_LOADER_ERRNO__END, }; diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 0f57716..c42e196 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1741,3 +1741,19 @@ void perf_evlist__set_tracking_event(struct perf_evlist *evlist, tracking_evsel->tracking = true; } + +struct perf_evsel * +perf_evlist__find_evsel_by_str(struct perf_evlist *evlist, + const char *str) +{ + struct perf_evsel *evsel; + + evlist__for_each(evlist, evsel) { + if (!evsel->name) + continue; + if (strcmp(str, evsel->name) == 0) + return evsel; + } + + return NULL; +} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 7c4d9a2..a0d1522 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -294,4 +294,7 @@ void perf_evlist__set_tracking_event(struct perf_evlist *evlist, struct perf_evsel *tracking_evsel); void perf_event_attr__set_max_precise_ip(struct perf_event_attr *attr); + +struct perf_evsel * +perf_evlist__find_evsel_by_str(struct perf_evlist *evlist, const char *str); #endif /* __PERF_EVLIST_H */ diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index a5dd670..5909fd2 100644 --- 
a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -697,14 +697,16 @@ parse_events_config_bpf(struct parse_events_evlist *data, return -EINVAL; } - err = bpf__config_obj(obj, term, NULL, &error_pos); + err = bpf__config_obj(obj, term, data->evlist, &error_pos); if (err) { - bpf__strerror_config_obj(obj, term, NULL, + bpf__strerror_config_obj(obj, term, data->evlist, &error_pos, err, errbuf, sizeof(errbuf)); data->error->help = strdup( -"Hint:\tValid config term:\n" +"Hint:\tValid config terms:\n" " \tmap:[].value=[value]\n" +" \tmap:[].event=[event]\n" +"\n" " \t(add -v to see detail)"); data->error->str = strdup(errbuf); if (err == -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE) @@ -1530,9 +1532,10 @@ int parse_events(struct perf_evlist *evlist, const char *str, struct parse_events_error *err) { struct parse_events_evlist data = { - .list = LIST_HEAD_INIT(data.list), - .idx = evlist->nr_entries, - .error = err, + .list = LIST_HEAD_INIT(data.list), + .idx = evlist->nr_entries, + .error = err, + .evlist = evlist, }; int ret; diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index c48377a..e036969 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -99,6 +99,7 @@ struct parse_events_evlist { int idx; int nr_groups; struct parse_events_error *error; + struct perf_evlist *evlist; }; struct parse_events_terms { -- cgit v0.10.2 From 2d055bf253c0d606c5de3fe7749e3188080780ad Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 22 Feb 2016 09:10:34 +0000 Subject: perf tools: Support setting different slots in a BPF map separately This patch introduces basic facilities to support config different slots in a BPF map one by one. array.nr_ranges and array.ranges are introduced into 'struct parse_events_term', where ranges is an array of indices range (start, length) which will be configured by this config term. nr_ranges is the size of the array. The array is passed to 'struct bpf_map_priv'. To indicate the new type of configuration, BPF_MAP_KEY_RANGES is added as a new key type. bpf_map_config_foreach_key() is extended to iterate over those indices instead of all possible keys. Code in this commit will be enabled by following commit which enables the indices syntax for array configuration. 
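For illustration, a minimal sketch (with invented names, not the perf source) of how a list of (start, length) ranges expands into the individual array keys that get configured; e.g. the indices "[0,3...5]" become the ranges {0,1} and {3,3}, visiting keys 0, 3, 4 and 5:

#include <stddef.h>

struct idx_range {
	unsigned int start;
	size_t length;
};

static void for_each_index(const struct idx_range *ranges, size_t nr_ranges,
			   void (*apply)(unsigned int idx, void *arg), void *arg)
{
	size_t i, j;

	for (i = 0; i < nr_ranges; i++)
		for (j = 0; j < ranges[i].length; j++)
			apply(ranges[i].start + j, arg);	/* configure one slot */
}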
Signed-off-by: Wang Nan Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Cody P Schafer Cc: He Kuang Cc: Jeremie Galarneau Cc: Jiri Olsa Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456132275-98875-8-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index deacb95..44824e3 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -17,6 +17,7 @@ #include "llvm-utils.h" #include "probe-event.h" #include "probe-finder.h" // for MAX_PROBES +#include "parse-events.h" #include "llvm-utils.h" #define DEFINE_PRINT_FN(name, level) \ @@ -747,6 +748,7 @@ enum bpf_map_op_type { enum bpf_map_key_type { BPF_MAP_KEY_ALL, + BPF_MAP_KEY_RANGES, }; struct bpf_map_op { @@ -754,6 +756,9 @@ struct bpf_map_op { enum bpf_map_op_type op_type; enum bpf_map_key_type key_type; union { + struct parse_events_array array; + } k; + union { u64 value; struct perf_evsel *evsel; } v; @@ -768,6 +773,8 @@ bpf_map_op__delete(struct bpf_map_op *op) { if (!list_empty(&op->list)) list_del(&op->list); + if (op->key_type == BPF_MAP_KEY_RANGES) + parse_events__clear_array(&op->k.array); free(op); } @@ -792,10 +799,33 @@ bpf_map_priv__clear(struct bpf_map *map __maybe_unused, free(priv); } +static int +bpf_map_op_setkey(struct bpf_map_op *op, struct parse_events_term *term) +{ + op->key_type = BPF_MAP_KEY_ALL; + if (!term) + return 0; + + if (term->array.nr_ranges) { + size_t memsz = term->array.nr_ranges * + sizeof(op->k.array.ranges[0]); + + op->k.array.ranges = memdup(term->array.ranges, memsz); + if (!op->k.array.ranges) { + pr_debug("No enough memory to alloc indices for map\n"); + return -ENOMEM; + } + op->key_type = BPF_MAP_KEY_RANGES; + op->k.array.nr_ranges = term->array.nr_ranges; + } + return 0; +} + static struct bpf_map_op * -bpf_map_op__new(void) +bpf_map_op__new(struct parse_events_term *term) { struct bpf_map_op *op; + int err; op = zalloc(sizeof(*op)); if (!op) { @@ -804,7 +834,11 @@ bpf_map_op__new(void) } INIT_LIST_HEAD(&op->list); - op->key_type = BPF_MAP_KEY_ALL; + err = bpf_map_op_setkey(op, term); + if (err) { + free(op); + return ERR_PTR(err); + } return op; } @@ -841,12 +875,12 @@ bpf_map__add_op(struct bpf_map *map, struct bpf_map_op *op) } static struct bpf_map_op * -bpf_map__add_newop(struct bpf_map *map) +bpf_map__add_newop(struct bpf_map *map, struct parse_events_term *term) { struct bpf_map_op *op; int err; - op = bpf_map_op__new(); + op = bpf_map_op__new(term); if (IS_ERR(op)) return op; @@ -896,7 +930,7 @@ __bpf_map__config_value(struct bpf_map *map, return -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUESIZE; } - op = bpf_map__add_newop(map); + op = bpf_map__add_newop(map, term); if (IS_ERR(op)) return PTR_ERR(op); op->op_type = BPF_MAP_OP_SET_VALUE; @@ -958,7 +992,7 @@ __bpf_map__config_event(struct bpf_map *map, return -BPF_LOADER_ERRNO__OBJCONF_MAP_TYPE; } - op = bpf_map__add_newop(map); + op = bpf_map__add_newop(map, term); if (IS_ERR(op)) return PTR_ERR(op); op->op_type = BPF_MAP_OP_SET_EVSEL; @@ -996,6 +1030,44 @@ struct bpf_obj_config__map_func bpf_obj_config__map_funcs[] = { }; static int +config_map_indices_range_check(struct parse_events_term *term, + struct bpf_map *map, + const char *map_name) +{ + struct parse_events_array *array = &term->array; + struct bpf_map_def def; + unsigned int i; + int err; + + if (!array->nr_ranges) + return 0; + if (!array->ranges) 
{ + pr_debug("ERROR: map %s: array->nr_ranges is %d but range array is NULL\n", + map_name, (int)array->nr_ranges); + return -BPF_LOADER_ERRNO__INTERNAL; + } + + err = bpf_map__get_def(map, &def); + if (err) { + pr_debug("ERROR: Unable to get map definition from '%s'\n", + map_name); + return -BPF_LOADER_ERRNO__INTERNAL; + } + + for (i = 0; i < array->nr_ranges; i++) { + unsigned int start = array->ranges[i].start; + size_t length = array->ranges[i].length; + unsigned int idx = start + length - 1; + + if (idx >= def.max_entries) { + pr_debug("ERROR: index %d too large\n", idx); + return -BPF_LOADER_ERRNO__OBJCONF_MAP_IDX2BIG; + } + } + return 0; +} + +static int bpf__obj_config_map(struct bpf_object *obj, struct parse_events_term *term, struct perf_evlist *evlist, @@ -1030,7 +1102,12 @@ bpf__obj_config_map(struct bpf_object *obj, goto out; } - *key_scan_pos += map_opt - map_name; + *key_scan_pos += strlen(map_opt); + err = config_map_indices_range_check(term, map, map_name); + if (err) + goto out; + *key_scan_pos -= strlen(map_opt); + for (i = 0; i < ARRAY_SIZE(bpf_obj_config__map_funcs); i++) { struct bpf_obj_config__map_func *func = &bpf_obj_config__map_funcs[i]; @@ -1100,6 +1177,33 @@ foreach_key_array_all(map_config_func_t func, } static int +foreach_key_array_ranges(map_config_func_t func, void *arg, + const char *name, int map_fd, + struct bpf_map_def *pdef, + struct bpf_map_op *op) +{ + unsigned int i, j; + int err; + + for (i = 0; i < op->k.array.nr_ranges; i++) { + unsigned int start = op->k.array.ranges[i].start; + size_t length = op->k.array.ranges[i].length; + + for (j = 0; j < length; j++) { + unsigned int idx = start + j; + + err = func(name, map_fd, pdef, op, &idx, arg); + if (err) { + pr_debug("ERROR: failed to insert value to %s[%u]\n", + name, idx); + return err; + } + } + } + return 0; +} + +static int bpf_map_config_foreach_key(struct bpf_map *map, map_config_func_t func, void *arg) @@ -1141,14 +1245,19 @@ bpf_map_config_foreach_key(struct bpf_map *map, case BPF_MAP_KEY_ALL: err = foreach_key_array_all(func, arg, name, map_fd, &def, op); - if (err) - return err; + break; + case BPF_MAP_KEY_RANGES: + err = foreach_key_array_ranges(func, arg, name, + map_fd, &def, + op); break; default: pr_debug("ERROR: keytype for map '%s' invalid\n", name); return -BPF_LOADER_ERRNO__INTERNAL; } + if (err) + return err; break; default: pr_debug("ERROR: type of '%s' incorrect\n", name); @@ -1336,6 +1445,7 @@ static const char *bpf_loader_strerror_table[NR_ERRNO] = { [ERRCODE_OFFSET(OBJCONF_MAP_EVTDIM)] = "Event dimension too large", [ERRCODE_OFFSET(OBJCONF_MAP_EVTINH)] = "Doesn't support inherit event", [ERRCODE_OFFSET(OBJCONF_MAP_EVTTYPE)] = "Wrong event type for map", + [ERRCODE_OFFSET(OBJCONF_MAP_IDX2BIG)] = "Index too large", }; static int diff --git a/tools/perf/util/bpf-loader.h b/tools/perf/util/bpf-loader.h index 7c7689f..be43119 100644 --- a/tools/perf/util/bpf-loader.h +++ b/tools/perf/util/bpf-loader.h @@ -38,6 +38,7 @@ enum bpf_loader_errno { BPF_LOADER_ERRNO__OBJCONF_MAP_EVTDIM, /* Event dimension too large */ BPF_LOADER_ERRNO__OBJCONF_MAP_EVTINH, /* Doesn't support inherit event */ BPF_LOADER_ERRNO__OBJCONF_MAP_EVTTYPE, /* Wrong event type for map */ + BPF_LOADER_ERRNO__OBJCONF_MAP_IDX2BIG, /* Index too large */ __BPF_LOADER_ERRNO__END, }; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 5909fd2..697d350 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2211,6 +2211,8 @@ void 
parse_events_terms__purge(struct list_head *terms) struct parse_events_term *term, *h; list_for_each_entry_safe(term, h, terms, list) { + if (term->array.nr_ranges) + free(term->array.ranges); list_del_init(&term->list); free(term); } @@ -2224,6 +2226,11 @@ void parse_events_terms__delete(struct list_head *terms) free(terms); } +void parse_events__clear_array(struct parse_events_array *a) +{ + free(a->ranges); +} + void parse_events_evlist_error(struct parse_events_evlist *data, int idx, const char *str) { diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index e036969..e445622 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -72,8 +72,17 @@ enum { __PARSE_EVENTS__TERM_TYPE_NR, }; +struct parse_events_array { + size_t nr_ranges; + struct { + unsigned int start; + size_t length; + } *ranges; +}; + struct parse_events_term { char *config; + struct parse_events_array array; union { char *str; u64 num; @@ -120,6 +129,7 @@ int parse_events_term__clone(struct parse_events_term **new, struct parse_events_term *term); void parse_events_terms__delete(struct list_head *terms); void parse_events_terms__purge(struct list_head *terms); +void parse_events__clear_array(struct parse_events_array *a); int parse_events__modifier_event(struct list_head *list, char *str, bool add); int parse_events__modifier_group(struct list_head *list, char *event_mod); int parse_events_name(struct list_head *list, char *name); -- cgit v0.10.2 From e571e029bdbf59f485fe67740b7a4ef421e1d55d Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 22 Feb 2016 09:10:35 +0000 Subject: perf tools: Enable indices setting syntax for BPF map This patch introduces a new syntax to perf event parser: # perf record -e './test_bpf_map_3.c/map:channel.value[0,1,2,3...5]=101/' usleep 2 By utilizing the basic facilities in bpf-loader.c which allow setting different slots in a BPF map separately, the newly introduced syntax allows perf to control specific elements in a BPF map. Test result: # cat ./test_bpf_map_3.c /************************ BEGIN **************************/ #include #define SEC(NAME) __attribute__((section(NAME), used)) struct bpf_map_def { unsigned int type; unsigned int key_size; unsigned int value_size; unsigned int max_entries; }; static void *(*map_lookup_elem)(struct bpf_map_def *, void *) = (void *)BPF_FUNC_map_lookup_elem; static int (*trace_printk)(const char *fmt, int fmt_size, ...) = (void *)BPF_FUNC_trace_printk; struct bpf_map_def SEC("maps") channel = { .type = BPF_MAP_TYPE_ARRAY, .key_size = sizeof(int), .value_size = sizeof(unsigned char), .max_entries = 100, }; SEC("func=hrtimer_nanosleep rqtp->tv_nsec") int func(void *ctx, int err, long nsec) { char fmt[] = "%ld\n"; long usec = nsec * 0x10624dd3 >> 38; // nsec / 1000 int key = (int)usec; unsigned char *pval = map_lookup_elem(&channel, &key); if (!pval) return 0; trace_printk(fmt, sizeof(fmt), (unsigned char)*pval); return 0; } char _license[] SEC("license") = "GPL"; int _version SEC("version") = LINUX_VERSION_CODE; /************************* END ***************************/ Normal case: # echo "" > /sys/kernel/debug/tracing/trace # ./perf record -e './test_bpf_map_3.c/map:channel.value[0,1,2,3...5]=101/' usleep 2 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.012 MB perf.data ] # cat /sys/kernel/debug/tracing/trace | grep usleep usleep-405 [004] d... 
2745423.547822: : 101 # ./perf record -e './test_bpf_map_3.c/map:channel.value[0...9,20...29]=102,map:channel.value[10...19]=103/' usleep 3 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.012 MB perf.data ] # ./perf record -e './test_bpf_map_3.c/map:channel.value[0...9,20...29]=102,map:channel.value[10...19]=103/' usleep 15 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.012 MB perf.data ] # cat /sys/kernel/debug/tracing/trace | grep usleep usleep-405 [004] d... 2745423.547822: : 101 usleep-655 [006] d... 2745434.122814: : 102 usleep-904 [006] d... 2745439.916264: : 103 # ./perf record -e './test_bpf_map_3.c/map:channel.value[all]=104/' usleep 99 # cat /sys/kernel/debug/tracing/trace | grep usleep usleep-405 [004] d... 2745423.547822: : 101 usleep-655 [006] d... 2745434.122814: : 102 usleep-904 [006] d... 2745439.916264: : 103 usleep-1537 [003] d... 2745538.053737: : 104 Error case: # ./perf record -e './test_bpf_map_3.c/map:channel.value[10...1000]=104/' usleep 99 event syntax error: '..annel.value[10...1000]=104/' \___ Index too large Hint: Valid config terms: map:[].value=[value] map:[].event=[event] where is something like [0,3...5] or [all] (add -v to see detail) Run 'perf list' for a list of valid events Usage: perf record [] [] or: perf record [] -- [] -e, --event event selector. use 'perf list' to list available events Signed-off-by: Wang Nan Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Cody P Schafer Cc: He Kuang Cc: Jeremie Galarneau Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456132275-98875-9-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 697d350..6e2f203 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -704,9 +704,10 @@ parse_events_config_bpf(struct parse_events_evlist *data, sizeof(errbuf)); data->error->help = strdup( "Hint:\tValid config terms:\n" -" \tmap:[].value=[value]\n" -" \tmap:[].event=[event]\n" +" \tmap:[].value=[value]\n" +" \tmap:[].event=[event]\n" "\n" +" \twhere is something like [0,3...5] or [all]\n" " \t(add -v to see detail)"); data->error->str = strdup(errbuf); if (err == -BPF_LOADER_ERRNO__OBJCONF_MAP_VALUE) diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 0cc6b84..fb85d03 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -9,8 +9,8 @@ %{ #include #include "../perf.h" -#include "parse-events-bison.h" #include "parse-events.h" +#include "parse-events-bison.h" char *parse_events_get_text(yyscan_t yyscanner); YYSTYPE *parse_events_get_lval(yyscan_t yyscanner); @@ -111,6 +111,7 @@ do { \ %x mem %s config %x event +%x array group [^,{}/]*[{][^}]*[}][^,{}/]* event_pmu [^,{}/]+[/][^/]*[/][^,{}/]* @@ -176,6 +177,14 @@ modifier_bp [rwx]{1,3} } +{ +"]" { BEGIN(config); return ']'; } +{num_dec} { return value(yyscanner, 10); } +{num_hex} { return value(yyscanner, 16); } +, { return ','; } +"\.\.\." { return PE_ARRAY_RANGE; } +} + { /* * Please update config_term_names when new static term is added. 
@@ -195,6 +204,8 @@ no-inherit { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NOINHERIT); } , { return ','; } "/" { BEGIN(INITIAL); return '/'; } {name_minus} { return str(yyscanner, PE_NAME); } +\[all\] { return PE_ARRAY_ALL; } +"[" { BEGIN(array); return '['; } } { diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 0e2d433..d1fbcab 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -48,6 +48,7 @@ static inc_group_count(struct list_head *list, %token PE_PREFIX_MEM PE_PREFIX_RAW PE_PREFIX_GROUP %token PE_ERROR %token PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_KERNEL_PMU_EVENT +%token PE_ARRAY_ALL PE_ARRAY_RANGE %type PE_VALUE %type PE_VALUE_SYM_HW %type PE_VALUE_SYM_SW @@ -83,6 +84,9 @@ static inc_group_count(struct list_head *list, %type group_def %type group %type groups +%type array +%type array_term +%type array_terms %union { @@ -94,6 +98,7 @@ static inc_group_count(struct list_head *list, char *sys; char *event; } tracepoint_name; + struct parse_events_array array; } %% @@ -572,6 +577,86 @@ PE_TERM ABORT_ON(parse_events_term__num(&term, (int)$1, NULL, 1, &@1, NULL)); $$ = term; } +| +PE_NAME array '=' PE_NAME +{ + struct parse_events_term *term; + int i; + + ABORT_ON(parse_events_term__str(&term, PARSE_EVENTS__TERM_TYPE_USER, + $1, $4, &@1, &@4)); + + term->array = $2; + $$ = term; +} +| +PE_NAME array '=' PE_VALUE +{ + struct parse_events_term *term; + + ABORT_ON(parse_events_term__num(&term, PARSE_EVENTS__TERM_TYPE_USER, + $1, $4, &@1, &@4)); + term->array = $2; + $$ = term; +} + +array: +'[' array_terms ']' +{ + $$ = $2; +} +| +PE_ARRAY_ALL +{ + $$.nr_ranges = 0; + $$.ranges = NULL; +} + +array_terms: +array_terms ',' array_term +{ + struct parse_events_array new_array; + + new_array.nr_ranges = $1.nr_ranges + $3.nr_ranges; + new_array.ranges = malloc(sizeof(new_array.ranges[0]) * + new_array.nr_ranges); + ABORT_ON(!new_array.ranges); + memcpy(&new_array.ranges[0], $1.ranges, + $1.nr_ranges * sizeof(new_array.ranges[0])); + memcpy(&new_array.ranges[$1.nr_ranges], $3.ranges, + $3.nr_ranges * sizeof(new_array.ranges[0])); + free($1.ranges); + free($3.ranges); + $$ = new_array; +} +| +array_term + +array_term: +PE_VALUE +{ + struct parse_events_array array; + + array.nr_ranges = 1; + array.ranges = malloc(sizeof(array.ranges[0])); + ABORT_ON(!array.ranges); + array.ranges[0].start = $1; + array.ranges[0].length = 1; + $$ = array; +} +| +PE_VALUE PE_ARRAY_RANGE PE_VALUE +{ + struct parse_events_array array; + + ABORT_ON($3 < $1); + array.nr_ranges = 1; + array.ranges = malloc(sizeof(array.ranges[0])); + ABORT_ON(!array.ranges); + array.ranges[0].start = $1; + array.ranges[0].length = $3 - $1 + 1; + $$ = array; +} sep_dc: ':' | -- cgit v0.10.2 From 95088a591e197610bd03f4059f5fdbe9e376425b Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 22 Feb 2016 09:10:36 +0000 Subject: perf tools: Apply tracepoint event definition options to BPF script Users can pass options to tracepoints defined in the BPF script. For example: # perf record -e ./test.c/no-inherit/ bash # dd if=/dev/zero of=/dev/null count=10000 # exit [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.022 MB perf.data (139 samples) ] (no-inherit works, only the sys_read issued by bash are captured, at least 10000 sys_read issued by dd are skipped.) 
test.c: #define SEC(NAME) __attribute__((section(NAME), used)) SEC("func=sys_read") int bpf_func__sys_read(void *ctx) { return 1; } char _license[] SEC("license") = "GPL"; int _version SEC("version") = LINUX_VERSION_CODE; no-inherit is applied to the kprobe event defined in test.c. Signed-off-by: Wang Nan Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Cody P Schafer Cc: He Kuang Cc: Jeremie Galarneau Cc: Jiri Olsa Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456132275-98875-10-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index 4aed5cb..199501c 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -112,7 +112,7 @@ static int do_test(struct bpf_object *obj, int (*func)(void), parse_evlist.error = &parse_error; INIT_LIST_HEAD(&parse_evlist.list); - err = parse_events_load_bpf_obj(&parse_evlist, &parse_evlist.list, obj); + err = parse_events_load_bpf_obj(&parse_evlist, &parse_evlist.list, obj, NULL); if (err || list_empty(&parse_evlist.list)) { pr_debug("Failed to add events selected by BPF\n"); return TEST_FAIL; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 6e2f203..4c19d5e 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -581,6 +581,7 @@ static int add_tracepoint_multi_sys(struct list_head *list, int *idx, struct __add_bpf_event_param { struct parse_events_evlist *data; struct list_head *list; + struct list_head *head_config; }; static int add_bpf_event(struct probe_trace_event *tev, int fd, @@ -597,7 +598,8 @@ static int add_bpf_event(struct probe_trace_event *tev, int fd, tev->group, tev->event, fd); err = parse_events_add_tracepoint(&new_evsels, &evlist->idx, tev->group, - tev->event, evlist->error, NULL); + tev->event, evlist->error, + param->head_config); if (err) { struct perf_evsel *evsel, *tmp; @@ -622,11 +624,12 @@ static int add_bpf_event(struct probe_trace_event *tev, int fd, int parse_events_load_bpf_obj(struct parse_events_evlist *data, struct list_head *list, - struct bpf_object *obj) + struct bpf_object *obj, + struct list_head *head_config) { int err; char errbuf[BUFSIZ]; - struct __add_bpf_event_param param = {data, list}; + struct __add_bpf_event_param param = {data, list, head_config}; static bool registered_unprobe_atexit = false; if (IS_ERR(obj) || !obj) { @@ -720,14 +723,47 @@ parse_events_config_bpf(struct parse_events_evlist *data, return 0; } +/* + * Split config terms: + * perf record -e bpf.c/call-graph=fp,map:array.value[0]=1/ ... + * 'call-graph=fp' is 'evt config', should be applied to each + * events in bpf.c. + * 'map:array.value[0]=1' is 'obj config', should be processed + * with parse_events_config_bpf. + * + * Move object config terms from the first list to obj_head_config. + */ +static void +split_bpf_config_terms(struct list_head *evt_head_config, + struct list_head *obj_head_config) +{ + struct parse_events_term *term, *temp; + + /* + * Currectly, all possible user config term + * belong to bpf object. parse_events__is_hardcoded_term() + * happends to be a good flag. + * + * See parse_events_config_bpf() and + * config_term_tracepoint(). 
+ */ + list_for_each_entry_safe(term, temp, evt_head_config, list) + if (!parse_events__is_hardcoded_term(term)) + list_move_tail(&term->list, obj_head_config); +} + int parse_events_load_bpf(struct parse_events_evlist *data, struct list_head *list, char *bpf_file_name, bool source, struct list_head *head_config) { - struct bpf_object *obj; int err; + struct bpf_object *obj; + LIST_HEAD(obj_head_config); + + if (head_config) + split_bpf_config_terms(head_config, &obj_head_config); obj = bpf__prepare_load(bpf_file_name, source); if (IS_ERR(obj)) { @@ -749,10 +785,18 @@ int parse_events_load_bpf(struct parse_events_evlist *data, return err; } - err = parse_events_load_bpf_obj(data, list, obj); + err = parse_events_load_bpf_obj(data, list, obj, head_config); if (err) return err; - return parse_events_config_bpf(data, obj, head_config); + err = parse_events_config_bpf(data, obj, &obj_head_config); + + /* + * Caller doesn't know anything about obj_head_config, + * so combine them together again before returnning. + */ + if (head_config) + list_splice_tail(&obj_head_config, head_config); + return err; } static int diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index e445622..67e4930 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -146,7 +146,8 @@ int parse_events_load_bpf(struct parse_events_evlist *data, struct bpf_object; int parse_events_load_bpf_obj(struct parse_events_evlist *data, struct list_head *list, - struct bpf_object *obj); + struct bpf_object *obj, + struct list_head *head_config); int parse_events_add_numeric(struct parse_events_evlist *data, struct list_head *list, u32 type, u64 config, -- cgit v0.10.2 From 03e0a7df3efd959e40cd7ff40b1fabddc234ec5a Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Mon, 22 Feb 2016 09:10:37 +0000 Subject: perf tools: Introduce bpf-output event Commit a43eec304259 ("bpf: introduce bpf_perf_event_output() helper") adds a helper to enable a BPF program to output data to a perf ring buffer through a new type of perf event, PERF_COUNT_SW_BPF_OUTPUT. This patch enables perf to create events of that type. Now a perf user can use the following cmdline to receive output data from BPF programs: # perf record -a -e bpf-output/no-inherit,name=evt/ \ -e ./test_bpf_output.c/map:channel.event=evt/ ls / # perf script perf 1560 [004] 347747.086295: evt: ffffffff811fd201 sys_write ... perf 1560 [004] 347747.086300: evt: ffffffff811fd201 sys_write ... perf 1560 [004] 347747.086315: evt: ffffffff811fd201 sys_write ... ... Test result: # cat test_bpf_output.c /************************ BEGIN **************************/ #include struct bpf_map_def { unsigned int type; unsigned int key_size; unsigned int value_size; unsigned int max_entries; }; #define SEC(NAME) __attribute__((section(NAME), used)) static u64 (*ktime_get_ns)(void) = (void *)BPF_FUNC_ktime_get_ns; static int (*trace_printk)(const char *fmt, int fmt_size, ...) 
= (void *)BPF_FUNC_trace_printk; static int (*get_smp_processor_id)(void) = (void *)BPF_FUNC_get_smp_processor_id; static int (*perf_event_output)(void *, struct bpf_map_def *, int, void *, unsigned long) = (void *)BPF_FUNC_perf_event_output; struct bpf_map_def SEC("maps") channel = { .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, .key_size = sizeof(int), .value_size = sizeof(u32), .max_entries = __NR_CPUS__, }; SEC("func_write=sys_write") int func_write(void *ctx) { struct { u64 ktime; int cpuid; } __attribute__((packed)) output_data; char error_data[] = "Error: failed to output: %d\n"; output_data.cpuid = get_smp_processor_id(); output_data.ktime = ktime_get_ns(); int err = perf_event_output(ctx, &channel, get_smp_processor_id(), &output_data, sizeof(output_data)); if (err) trace_printk(error_data, sizeof(error_data), err); return 0; } char _license[] SEC("license") = "GPL"; int _version SEC("version") = LINUX_VERSION_CODE; /************************ END ***************************/ # perf record -a -e bpf-output/no-inherit,name=evt/ \ -e ./test_bpf_output.c/map:channel.event=evt/ ls / # perf script | grep ls ls 2242 [003] 347851.557563: evt: ffffffff811fd201 sys_write ... ls 2242 [003] 347851.557571: evt: ffffffff811fd201 sys_write ... Signed-off-by: Wang Nan Cc: Adrian Hunter Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Cody P Schafer Cc: He Kuang Cc: Jeremie Galarneau Cc: Jiri Olsa Cc: Kirill Smelkov Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456132275-98875-11-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c index 44824e3..0967ce6 100644 --- a/tools/perf/util/bpf-loader.c +++ b/tools/perf/util/bpf-loader.c @@ -1331,13 +1331,12 @@ apply_config_evsel_for_key(const char *name, int map_fd, void *pkey, return -BPF_LOADER_ERRNO__OBJCONF_MAP_EVTINH; } + if (perf_evsel__is_bpf_output(evsel)) + check_pass = true; if (attr->type == PERF_TYPE_RAW) check_pass = true; if (attr->type == PERF_TYPE_HARDWARE) check_pass = true; - if (attr->type == PERF_TYPE_SOFTWARE && - attr->config == PERF_COUNT_SW_BPF_OUTPUT) - check_pass = true; if (!check_pass) { pr_debug("ERROR: Event type is wrong for map %s\n", name); return -BPF_LOADER_ERRNO__OBJCONF_MAP_EVTTYPE; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 6ae20d0..0902fe4 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -225,6 +225,11 @@ struct perf_evsel *perf_evsel__new_idx(struct perf_event_attr *attr, int idx) if (evsel != NULL) perf_evsel__init(evsel, attr, idx); + if (perf_evsel__is_bpf_output(evsel)) { + evsel->attr.sample_type |= PERF_SAMPLE_RAW; + evsel->attr.sample_period = 1; + } + return evsel; } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 8e75434..efad78f 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -364,6 +364,14 @@ static inline bool perf_evsel__is_function_event(struct perf_evsel *evsel) #undef FUNCTION_EVENT } +static inline bool perf_evsel__is_bpf_output(struct perf_evsel *evsel) +{ + struct perf_event_attr *attr = &evsel->attr; + + return (attr->config == PERF_COUNT_SW_BPF_OUTPUT) && + (attr->type == PERF_TYPE_SOFTWARE); +} + struct perf_attr_details { bool freq; bool verbose; diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index fb85d03..1477fbc 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -248,6 
+248,7 @@ cpu-migrations|migrations { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COU alignment-faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_ALIGNMENT_FAULTS); } emulation-faults { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_EMULATION_FAULTS); } dummy { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_DUMMY); } +bpf-output { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_BPF_OUTPUT); } /* * We have to handle the kernel PMU event cycles-ct/cycles-t/mem-loads/mem-stores separately. -- cgit v0.10.2 From c637fa5294cefeda8be73cce20ba6693d22262dc Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Tue, 23 Feb 2016 01:29:59 +0000 Subject: x86/platform/intel/quark: Drop IMR lock bit support Isolated Memory Regions support a lock bit. The lock bit in an IMR prevents modification of the IMR until the core goes through a warm or cold reset. The lock bit feature is not useful in the context of the kernel API and is not really necessary since modification of IMRs is possible only from ring-zero anyway. This patch drops support for IMR locks bits, it simplifies the kernel API and removes an unnecessary and needlessly complex feature. Suggested-by: Ingo Molnar Signed-off-by: Bryan O'Donoghue Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andriy.shevchenko@linux.intel.com Cc: boon.leong.ong@intel.com Cc: paul.gortmaker@windriver.com Link: http://lkml.kernel.org/r/1456190999-12685-3-git-send-email-pure.logic@nexus-software.ie Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/imr.h b/arch/x86/include/asm/imr.h index cd2ce40..ebea2c9 100644 --- a/arch/x86/include/asm/imr.h +++ b/arch/x86/include/asm/imr.h @@ -53,7 +53,7 @@ #define IMR_MASK (IMR_ALIGN - 1) int imr_add_range(phys_addr_t base, size_t size, - unsigned int rmask, unsigned int wmask, bool lock); + unsigned int rmask, unsigned int wmask); int imr_remove_range(phys_addr_t base, size_t size); diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c index 740445a..17d6d22 100644 --- a/arch/x86/platform/intel-quark/imr.c +++ b/arch/x86/platform/intel-quark/imr.c @@ -134,11 +134,9 @@ static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr) * @idev: pointer to imr_device structure. * @imr_id: IMR entry to write. * @imr: IMR structure representing address and access masks. - * @lock: indicates if the IMR lock bit should be applied. * @return: 0 on success or error code passed from mbi_iosf on failure. */ -static int imr_write(struct imr_device *idev, u32 imr_id, - struct imr_regs *imr, bool lock) +static int imr_write(struct imr_device *idev, u32 imr_id, struct imr_regs *imr) { unsigned long flags; u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base; @@ -162,15 +160,6 @@ static int imr_write(struct imr_device *idev, u32 imr_id, if (ret) goto failed; - /* Lock bit must be set separately to addr_lo address bits. */ - if (lock) { - imr->addr_lo |= IMR_LOCK; - ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, - reg - IMR_NUM_REGS, imr->addr_lo); - if (ret) - goto failed; - } - local_irq_restore(flags); return 0; failed: @@ -322,11 +311,10 @@ static inline int imr_address_overlap(phys_addr_t addr, struct imr_regs *imr) * @size: physical size of region in bytes must be aligned to 1KiB. * @read_mask: read access mask. * @write_mask: write access mask. - * @lock: indicates whether or not to permanently lock this region. * @return: zero on success or negative value indicating error. 
*/ int imr_add_range(phys_addr_t base, size_t size, - unsigned int rmask, unsigned int wmask, bool lock) + unsigned int rmask, unsigned int wmask) { phys_addr_t end; unsigned int i; @@ -399,7 +387,7 @@ int imr_add_range(phys_addr_t base, size_t size, imr.rmask = rmask; imr.wmask = wmask; - ret = imr_write(idev, reg, &imr, lock); + ret = imr_write(idev, reg, &imr); if (ret < 0) { /* * In the highly unlikely event iosf_mbi_write failed @@ -410,7 +398,7 @@ int imr_add_range(phys_addr_t base, size_t size, imr.addr_hi = 0; imr.rmask = IMR_READ_ACCESS_ALL; imr.wmask = IMR_WRITE_ACCESS_ALL; - imr_write(idev, reg, &imr, false); + imr_write(idev, reg, &imr); } failed: mutex_unlock(&idev->lock); @@ -506,7 +494,7 @@ static int __imr_remove_range(int reg, phys_addr_t base, size_t size) imr.rmask = IMR_READ_ACCESS_ALL; imr.wmask = IMR_WRITE_ACCESS_ALL; - ret = imr_write(idev, reg, &imr, false); + ret = imr_write(idev, reg, &imr); failed: mutex_unlock(&idev->lock); @@ -587,7 +575,7 @@ static void __init imr_fixup_memmap(struct imr_device *idev) * We don't round up @size since it is already PAGE_SIZE aligned. * See vmlinux.lds.S for details. */ - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); if (ret < 0) { pr_err("unable to setup IMR for kernel: %zu KiB (%lx - %lx)\n", size / 1024, start, end); diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c index 0381343..f5bad40 100644 --- a/arch/x86/platform/intel-quark/imr_selftest.c +++ b/arch/x86/platform/intel-quark/imr_selftest.c @@ -60,30 +60,30 @@ static void __init imr_self_test(void) int ret; /* Test zero zero. */ - ret = imr_add_range(0, 0, 0, 0, false); + ret = imr_add_range(0, 0, 0, 0); imr_self_test_result(ret < 0, "zero sized IMR\n"); /* Test exact overlap. */ - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); /* Test overlap with base inside of existing. */ base += size - IMR_ALIGN; - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); /* Test overlap with end inside of existing. */ base -= size + IMR_ALIGN * 2; - ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(base, size, IMR_CPU, IMR_CPU); imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size)); /* Test that a 1 KiB IMR @ zero with read/write all will bomb out. */ ret = imr_add_range(0, IMR_ALIGN, IMR_READ_ACCESS_ALL, - IMR_WRITE_ACCESS_ALL, false); + IMR_WRITE_ACCESS_ALL); imr_self_test_result(ret < 0, "1KiB IMR @ 0x00000000 - access-all\n"); /* Test that a 1 KiB IMR @ zero with CPU only will work. */ - ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU, false); + ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU); imr_self_test_result(ret >= 0, "1KiB IMR @ 0x00000000 - cpu-access\n"); if (ret >= 0) { ret = imr_remove_range(0, IMR_ALIGN); @@ -92,8 +92,7 @@ static void __init imr_self_test(void) /* Test 2 KiB works. 
*/ size = IMR_ALIGN * 2; - ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL, - IMR_WRITE_ACCESS_ALL, false); + ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL, IMR_WRITE_ACCESS_ALL); imr_self_test_result(ret >= 0, "2KiB IMR @ 0x00000000\n"); if (ret >= 0) { ret = imr_remove_range(0, size); -- cgit v0.10.2 From 7a36b930e6ed4702c866dc74a5ad07318a57c688 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Thu, 18 Feb 2016 19:49:18 +0100 Subject: mac80211: minstrel_ht: set default tx aggregation timeout to 0 The value 5000 was put here with the addition of the timeout field to ieee80211_start_tx_ba_session. It was originally added in mac80211 to save resources for drivers like iwlwifi, which only supports a limited number of concurrent aggregation sessions. Since iwlwifi does not use minstrel_ht and other drivers don't need this, 0 is a better default - especially since there have been recent reports of aggregation setup related issues reproduced with ath9k. This should improve stability without causing any adverse effects. Cc: stable@vger.kernel.org Acked-by: Avery Pennarun Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 702328b..409b364 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -692,7 +692,7 @@ minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb) if (likely(sta->ampdu_mlme.tid_tx[tid])) return; - ieee80211_start_tx_ba_session(pubsta, tid, 5000); + ieee80211_start_tx_ba_session(pubsta, tid, 0); } static void -- cgit v0.10.2 From 5e950a78bf5b18ded277a27aed0bcdbe7c1b868a Mon Sep 17 00:00:00 2001 From: Ola Olsson Date: Thu, 11 Feb 2016 01:00:22 +0100 Subject: nl80211: Zero out the connection keys memory when freeing them. The connection keys are zeroed out in all other cases except this one. Let's fix the last one as well. Signed-off-by: Ola Olsson Reviewed-by: Julian Calaby Signed-off-by: Johannes Berg diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d4786f2..711cb7a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7547,7 +7547,7 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) if ((ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT) && no_ht) { - kfree(connkeys); + kzfree(connkeys); return -EINVAL; } } -- cgit v0.10.2 From b86071528f3261ab592fad5b9b1a02aea3dcabf3 Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Mon, 15 Feb 2016 14:35:53 +0100 Subject: cfg80211: stop critical protocol session upon disconnect event When user-space has started a critical protocol session and a disconnect event occurs, the rdev::crit_prot_nlportid remains set. This caused a subsequent NL80211_CMD_CRIT_PROTO_START to fail (-EBUSY). Fix this by clearing the rdev attribute and call .crit_proto_stop() callback upon disconnect event. Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Signed-off-by: Arend van Spriel Signed-off-by: Johannes Berg diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 8020b5b..d49ed76 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -917,6 +917,12 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap); + /* stop critical protocol if supported */ + if (rdev->ops->crit_proto_stop && rdev->crit_proto_nlportid) { + rdev->crit_proto_nlportid = 0; + rdev_crit_proto_stop(rdev, wdev); + } + /* * Delete all the keys ... 
pairwise keys can't really * exist any more anyway, but default keys might. -- cgit v0.10.2 From 0e47b38dcd24c78d0699b42f28d5986154d2aa11 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Mon, 22 Feb 2016 14:08:22 -0300 Subject: tools lib traceevent: Implement '%' operation The operation '%' is not implemented on event-parse.c, causing an error when parsing events with '%' the operation in its printk format. For example, # perf record -e sched:sched_deadline_yield ~/yield-test Warning: [sched:sched_deadline_yield] unknown op '%' .... # perf script Warning: [sched:sched_deadline_yield] unknown op '%' test 1641 [006] 3364.109319: sched:sched_deadline_yield: \ [FAILED TO PARSE] now=3364109314595 \ deadline=3364139295135 runtime=19975597 This patch implements the '%' operation. With this patch, we see the correct output: # perf record -e sched:sched_deadline_yield ~/yield-test No Warning # perf script yield-test 4005 [001] 4623.650978: sched:sched_deadline_yield: \ now=4623.650974050 \ deadline=4623.680957364 remaining_runtime=19979611 Signed-off-by: Daniel Bristot de Oliveira Reviewed-by: Steven Rostedt Cc: Juri Lelli Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-rt-users Link: http://lkml.kernel.org/r/5c96a395c56cea6d3d13d949051bdece86cc26e0.1456157869.git.bristot@redhat.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index c3bd294..575e751 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -1951,6 +1951,7 @@ process_op(struct event_format *event, struct print_arg *arg, char **tok) strcmp(token, "*") == 0 || strcmp(token, "^") == 0 || strcmp(token, "/") == 0 || + strcmp(token, "%") == 0 || strcmp(token, "<") == 0 || strcmp(token, ">") == 0 || strcmp(token, "<=") == 0 || @@ -3689,6 +3690,9 @@ eval_num_arg(void *data, int size, struct event_format *event, struct print_arg case '/': val = left / right; break; + case '%': + val = left % right; + break; case '*': val = left * right; break; -- cgit v0.10.2 From e95cf700b1b267d912ee779c3ab36e582111a52d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 15 Feb 2016 09:34:32 +0100 Subject: perf tools: Make cl_address global It'll be used in following patches. 
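As a quick, self-contained illustration of what the moved helper computes (assuming the usual 64-byte cache line; in perf, cacheline_size is detected at runtime):

#include <stdint.h>

static uint64_t cacheline_of(uint64_t address)
{
	const uint64_t cacheline_size = 64;	/* assumption for the example */

	return address & ~(cacheline_size - 1);
}
/* cacheline_of(0x1008) == 0x1000 and cacheline_of(0x103f) == 0x1000,
 * so samples touching the same cache line compare equal. */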
Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1455525293-8671-3-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 6d0f858..5f94ee7 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1015,12 +1015,6 @@ static int hist_entry__snoop_snprintf(struct hist_entry *he, char *bf, return repsep_snprintf(bf, size, "%-*s", width, out); } -static inline u64 cl_address(u64 address) -{ - /* return the cacheline of the address */ - return (address & ~(cacheline_size - 1)); -} - static int64_t sort__dcacheline_cmp(struct hist_entry *left, struct hist_entry *right) { diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 89a1273..46f159f 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -162,6 +162,11 @@ static inline float hist_entry__get_percent_limit(struct hist_entry *he) return period * 100.0 / total_period; } +static inline u64 cl_address(u64 address) +{ + /* return the cacheline of the address */ + return (address & ~(cacheline_size - 1)); +} enum sort_mode { SORT_MODE__NORMAL, -- cgit v0.10.2 From d392711095f12942a61e2963f5ab0076ac651e73 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 15 Feb 2016 09:34:33 +0100 Subject: perf tools: Introduce cl_offset function It'll be used in following patches. Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1455525293-8671-4-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 46f159f..5b9c624 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -168,6 +168,12 @@ static inline u64 cl_address(u64 address) return (address & ~(cacheline_size - 1)); } +static inline u64 cl_offset(u64 address) +{ + /* return the cacheline of the address */ + return (address & (cacheline_size - 1)); +} + enum sort_mode { SORT_MODE__NORMAL, SORT_MODE__BRANCH, -- cgit v0.10.2 From acbe613e0c03d59cab21aec3565cdb28c7df98c3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 15 Feb 2016 09:34:34 +0100 Subject: perf tools: Add monitored events array It will ease up configuration of memory events and addition of other memory events in following patches. 
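Reduced to a minimal sketch (the struct layout mirrors the diff below; the argv handling is simplified):

#include <stdbool.h>

struct perf_mem_event {
	bool record;		/* selected for this record session? */
	const char *name;	/* event string handed to the parser */
};

static struct perf_mem_event mem_events[] = {
	{ .name = "cpu/mem-loads,ldlat=30/P" },	/* loads  */
	{ .name = "cpu/mem-stores/P" },		/* stores */
};
/* The record path then just walks this table and emits "-e <name>"
 * for every entry whose .record flag is set. */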
Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1455525293-8671-5-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 3901700..36c56a4 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -6,6 +6,7 @@ #include "util/tool.h" #include "util/session.h" #include "util/data.h" +#include "util/mem-events.h" #define MEM_OPERATION_LOAD 0x1 #define MEM_OPERATION_STORE 0x2 @@ -34,20 +35,20 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) rec_argv[i++] = "record"; - if (mem->operation & MEM_OPERATION_LOAD) + if (mem->operation & MEM_OPERATION_LOAD) { + perf_mem_events[PERF_MEM_EVENTS__LOAD].record = true; rec_argv[i++] = "-W"; + } rec_argv[i++] = "-d"; - if (mem->operation & MEM_OPERATION_LOAD) { - rec_argv[i++] = "-e"; - rec_argv[i++] = "cpu/mem-loads/pp"; - } + for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) { + if (!perf_mem_events[j].record) + continue; - if (mem->operation & MEM_OPERATION_STORE) { rec_argv[i++] = "-e"; - rec_argv[i++] = "cpu/mem-stores/pp"; - } + rec_argv[i++] = perf_mem_events[j].name; + }; for (j = 1; j < argc; j++, i++) rec_argv[i] = argv[j]; diff --git a/tools/perf/util/Build b/tools/perf/util/Build index a34752d..df2b690 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -82,6 +82,7 @@ libperf-y += parse-branch-options.o libperf-y += parse-regs-options.o libperf-y += term.o libperf-y += help-unknown-cmd.o +libperf-y += mem-events.o libperf-$(CONFIG_LIBBPF) += bpf-loader.o libperf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c new file mode 100644 index 0000000..c6ba0a1 --- /dev/null +++ b/tools/perf/util/mem-events.c @@ -0,0 +1,10 @@ +#include "mem-events.h" + +#define E(n) { .name = n } + +struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = { + E("cpu/mem-loads,ldlat=30/P"), + E("cpu/mem-stores/P"), +}; + +#undef E diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h new file mode 100644 index 0000000..c97b214 --- /dev/null +++ b/tools/perf/util/mem-events.h @@ -0,0 +1,19 @@ +#ifndef __PERF_MEM_EVENTS_H +#define __PERF_MEM_EVENTS_H + +#include + +struct perf_mem_event { + bool record; + const char *name; +}; + +enum { + PERF_MEM_EVENTS__LOAD, + PERF_MEM_EVENTS__STORE, + PERF_MEM_EVENTS__MAX, +}; + +extern struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX]; + +#endif /* __PERF_MEM_EVENTS_H */ -- cgit v0.10.2 From ce1e22b08f0728e840614d3d0fc43fd1d6b7f7a2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 15 Feb 2016 09:34:35 +0100 Subject: perf mem: Add -e record option Adding -e option for perf mem record command, to be able to specify memory event directly. 
Get list of available events: $ perf mem record -e list ldlat-loads ldlat-stores Monitor ldlat-loads: $ perf mem record -e ldlat-loads true Committer notes: Further testing: # perf mem record -e ldlat-loads true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.020 MB perf.data (10 samples) ] # perf evlist cpu/mem-loads,ldlat=30/P # Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1455525293-8671-6-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 36c56a4..b3f8a89 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -7,6 +7,7 @@ #include "util/session.h" #include "util/data.h" #include "util/mem-events.h" +#include "util/debug.h" #define MEM_OPERATION_LOAD 0x1 #define MEM_OPERATION_STORE 0x2 @@ -22,11 +23,55 @@ struct perf_mem { DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); }; +static int parse_record_events(const struct option *opt, + const char *str, int unset __maybe_unused) +{ + struct perf_mem *mem = *(struct perf_mem **)opt->value; + int j; + + if (strcmp(str, "list")) { + if (!perf_mem_events__parse(str)) { + mem->operation = 0; + return 0; + } + exit(-1); + } + + for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) { + struct perf_mem_event *e = &perf_mem_events[j]; + + fprintf(stderr, "%-20s%s", + e->tag, verbose ? "" : "\n"); + if (verbose) + fprintf(stderr, " [%s]\n", e->name); + } + exit(0); +} + +static const char * const __usage[] = { + "perf mem record [] []", + "perf mem record [] -- []", + NULL +}; + +static const char * const *record_mem_usage = __usage; + static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) { int rec_argc, i = 0, j; const char **rec_argv; int ret; + struct option options[] = { + OPT_CALLBACK('e', "event", &mem, "event", + "event selector. 
use 'perf mem record -e list' to list available events", + parse_record_events), + OPT_INCR('v', "verbose", &verbose, + "be more verbose (show counter open errors, etc)"), + OPT_END() + }; + + argc = parse_options(argc, argv, options, record_mem_usage, + PARSE_OPT_STOP_AT_NON_OPTION); rec_argc = argc + 7; /* max number of arguments */ rec_argv = calloc(rec_argc + 1, sizeof(char *)); @@ -35,10 +80,11 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) rec_argv[i++] = "record"; - if (mem->operation & MEM_OPERATION_LOAD) { + if (mem->operation & MEM_OPERATION_LOAD) perf_mem_events[PERF_MEM_EVENTS__LOAD].record = true; + + if (perf_mem_events[PERF_MEM_EVENTS__LOAD].record) rec_argv[i++] = "-W"; - } rec_argv[i++] = "-d"; @@ -50,9 +96,19 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) rec_argv[i++] = perf_mem_events[j].name; }; - for (j = 1; j < argc; j++, i++) + for (j = 0; j < argc; j++, i++) rec_argv[i] = argv[j]; + if (verbose > 0) { + pr_debug("calling: record "); + + while (rec_argv[j]) { + pr_debug("%s ", rec_argv[j]); + j++; + } + pr_debug("\n"); + } + ret = cmd_record(i, rec_argv, NULL); free(rec_argv); return ret; @@ -299,7 +355,6 @@ int cmd_mem(int argc, const char **argv, const char *prefix __maybe_unused) NULL }; - argc = parse_options_subcommand(argc, argv, mem_options, mem_subcommands, mem_usage, PARSE_OPT_STOP_AT_NON_OPTION); diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index c6ba0a1..b1507c0 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -1,10 +1,51 @@ +#include +#include +#include +#include #include "mem-events.h" +#include "debug.h" -#define E(n) { .name = n } +#define E(t, n) { .tag = t, .name = n } struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = { - E("cpu/mem-loads,ldlat=30/P"), - E("cpu/mem-stores/P"), + E("ldlat-loads", "cpu/mem-loads,ldlat=30/P"), + E("ldlat-stores", "cpu/mem-stores/P"), }; #undef E + +int perf_mem_events__parse(const char *str) +{ + char *tok, *saveptr = NULL; + bool found = false; + char *buf; + int j; + + /* We need buffer that we know we can write to. */ + buf = malloc(strlen(str) + 1); + if (!buf) + return -ENOMEM; + + strcpy(buf, str); + + tok = strtok_r((char *)buf, ",", &saveptr); + + while (tok) { + for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) { + struct perf_mem_event *e = &perf_mem_events[j]; + + if (strstr(e->tag, tok)) + e->record = found = true; + } + + tok = strtok_r(NULL, ",", &saveptr); + } + + free(buf); + + if (found) + return 0; + + pr_err("failed: event '%s' not found, use '-e list' to get list of available events\n", str); + return -1; +} diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index c97b214..2995bae 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -5,6 +5,7 @@ struct perf_mem_event { bool record; + const char *tag; const char *name; }; @@ -16,4 +17,6 @@ enum { extern struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX]; +int perf_mem_events__parse(const char *str); + #endif /* __PERF_MEM_EVENTS_H */ -- cgit v0.10.2 From b19a1b6a233ede3ffc379b49e3653d6ce80dd743 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 15 Feb 2016 09:34:42 +0100 Subject: perf tools: Use ARRAY_SIZE in mem sort display functions There's no need to define extra macros for that. 
Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1455525293-8671-13-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 5f94ee7..5388f79 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -838,7 +838,6 @@ static const char * const tlb_access[] = { "Walker", "Fault", }; -#define NUM_TLB_ACCESS (sizeof(tlb_access)/sizeof(const char *)) static int hist_entry__tlb_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width) @@ -860,7 +859,7 @@ static int hist_entry__tlb_snprintf(struct hist_entry *he, char *bf, /* already taken care of */ m &= ~(PERF_MEM_TLB_HIT|PERF_MEM_TLB_MISS); - for (i = 0; m && i < NUM_TLB_ACCESS; i++, m >>= 1) { + for (i = 0; m && i < ARRAY_SIZE(tlb_access); i++, m >>= 1) { if (!(m & 0x1)) continue; if (l) { @@ -915,7 +914,6 @@ static const char * const mem_lvl[] = { "I/O", "Uncached", }; -#define NUM_MEM_LVL (sizeof(mem_lvl)/sizeof(const char *)) static int hist_entry__lvl_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width) @@ -937,7 +935,7 @@ static int hist_entry__lvl_snprintf(struct hist_entry *he, char *bf, /* already taken care of */ m &= ~(PERF_MEM_LVL_HIT|PERF_MEM_LVL_MISS); - for (i = 0; m && i < NUM_MEM_LVL; i++, m >>= 1) { + for (i = 0; m && i < ARRAY_SIZE(mem_lvl); i++, m >>= 1) { if (!(m & 0x1)) continue; if (l) { @@ -983,7 +981,6 @@ static const char * const snoop_access[] = { "Hit", "HitM", }; -#define NUM_SNOOP_ACCESS (sizeof(snoop_access)/sizeof(const char *)) static int hist_entry__snoop_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width) @@ -998,7 +995,7 @@ static int hist_entry__snoop_snprintf(struct hist_entry *he, char *bf, if (he->mem_info) m = he->mem_info->data_src.mem_snoop; - for (i = 0; m && i < NUM_SNOOP_ACCESS; i++, m >>= 1) { + for (i = 0; m && i < ARRAY_SIZE(snoop_access); i++, m >>= 1) { if (!(m & 0x1)) continue; if (l) { -- cgit v0.10.2 From 94ddddfab521423d94d6066879b514b9431e5cae Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 15 Feb 2016 09:34:51 +0100 Subject: perf script: Add data_src and weight column definitions Adding data_src and weight column definitions, so it's displayed for related sample types. 
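For readers unfamiliar with the helper: ARRAY_SIZE() derives the element count from the array itself, so the hand-maintained NUM_* macros become redundant. A simplified form is shown here (the tools/ version additionally rejects pointers):

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

static const char * const tlb_access_example[] = {
	"N/A", "None", "Hit", "Miss",
};
/* ARRAY_SIZE(tlb_access_example) == 4, and it tracks the initializer
 * automatically when entries are added or removed. */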
Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1455525293-8671-22-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index c691214..b7f1e8e 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -58,6 +58,8 @@ enum perf_output_field { PERF_OUTPUT_IREGS = 1U << 14, PERF_OUTPUT_BRSTACK = 1U << 15, PERF_OUTPUT_BRSTACKSYM = 1U << 16, + PERF_OUTPUT_DATA_SRC = 1U << 17, + PERF_OUTPUT_WEIGHT = 1U << 18, }; struct output_option { @@ -81,6 +83,8 @@ struct output_option { {.str = "iregs", .field = PERF_OUTPUT_IREGS}, {.str = "brstack", .field = PERF_OUTPUT_BRSTACK}, {.str = "brstacksym", .field = PERF_OUTPUT_BRSTACKSYM}, + {.str = "data_src", .field = PERF_OUTPUT_DATA_SRC}, + {.str = "weight", .field = PERF_OUTPUT_WEIGHT}, }; /* default set to maintain compatibility with current format */ @@ -242,6 +246,16 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel, PERF_OUTPUT_ADDR, allow_user_set)) return -EINVAL; + if (PRINT_FIELD(DATA_SRC) && + perf_evsel__check_stype(evsel, PERF_SAMPLE_DATA_SRC, "DATA_SRC", + PERF_OUTPUT_DATA_SRC)) + return -EINVAL; + + if (PRINT_FIELD(WEIGHT) && + perf_evsel__check_stype(evsel, PERF_SAMPLE_WEIGHT, "WEIGHT", + PERF_OUTPUT_WEIGHT)) + return -EINVAL; + if (PRINT_FIELD(SYM) && !PRINT_FIELD(IP) && !PRINT_FIELD(ADDR)) { pr_err("Display of symbols requested but neither sample IP nor " "sample address\nis selected. Hence, no addresses to convert " @@ -673,6 +687,12 @@ static void process_event(struct perf_script *script, union perf_event *event, if (PRINT_FIELD(ADDR)) print_sample_addr(event, sample, thread, attr); + if (PRINT_FIELD(DATA_SRC)) + printf("%16" PRIx64, sample->data_src); + + if (PRINT_FIELD(WEIGHT)) + printf("%16" PRIu64, sample->weight); + if (PRINT_FIELD(IP)) { if (!symbol_conf.use_callchain) printf(" "); -- cgit v0.10.2 From ff7b191583c368612fde88bf3cff6e3f3b0d73d5 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 15 Feb 2016 09:34:52 +0100 Subject: perf script: Display addr/data_src/weight columns for raw events Adding addr/data_src/weight columns for raw events. Example: $ perf script ... true 11883 322960.489590: ... ffff8801aa0b8400 68501042 246 ffffffff813b2cd true 11883 322960.489600: ... ffff8800b90b38d8 68501042 251 ffffffff811d0b7 true 11883 322960.489612: ... ffff880196893130 6a100142 94 ffffffff8177fb8 true 11883 322960.489637: ... ffff880164277b40 68100842 101 ffffffff813b2cd true 11883 322960.489683: ... ffff880035d3d818 68501042 201 ffffffff811d0b7 true 11883 322960.489733: ... 7fb9616efcf0 68100242 199 7fb961aaba9 true 11883 322960.489818: ... 
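The column selection itself is just a bitmask test; a minimal sketch with invented flag names (the real PERF_OUTPUT_* values appear in the diff below):

#include <stdio.h>
#include <inttypes.h>

enum {
	OUT_DATA_SRC = 1u << 0,
	OUT_WEIGHT   = 1u << 1,
};

static void print_mem_columns(unsigned int fields, uint64_t data_src, uint64_t weight)
{
	if (fields & OUT_DATA_SRC)
		printf("%16" PRIx64, data_src);
	if (fields & OUT_WEIGHT)
		printf("%16" PRIu64, weight);
}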
ffffea000481c39c 6a100142 122 ffffffff811b634 ^^^^^^^^^^^^^^^^ ^^^^^^^^ ^^^ addr data_src weight Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1455525293-8671-23-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index b7f1e8e..f4caf48 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -135,7 +135,8 @@ static struct { PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP | PERF_OUTPUT_SYM | PERF_OUTPUT_DSO | - PERF_OUTPUT_PERIOD, + PERF_OUTPUT_PERIOD | PERF_OUTPUT_ADDR | + PERF_OUTPUT_DATA_SRC | PERF_OUTPUT_WEIGHT, .invalid_fields = PERF_OUTPUT_TRACE, }, -- cgit v0.10.2 From 940db6dcd3f4659303fdf6befe7416adc4d24118 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 17 Feb 2016 14:44:55 -0800 Subject: perf tools: Dont stop PMU parsing on alias parse error When an error happens during alias parsing currently the complete parsing of all attributes of the PMU is stopped. This is breaks old perf on a newer kernel that may have not-yet-know alias attributes (such as .scale or .per-pkg). Continue when some attribute is unparseable. This is IMHO a stable candidate and should be backported to older versions to avoid problems with newer kernels. v2: Print warnings when something goes wrong. v3: Change warning to debug output Signed-off-by: Andi Kleen Cc: Jiri Olsa Cc: stable@vger.kernel.org # v3.6+ Link: http://lkml.kernel.org/r/1455749095-18358-1-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index cf59fba..ce61f79 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -284,13 +284,12 @@ static int pmu_aliases_parse(char *dir, struct list_head *head) { struct dirent *evt_ent; DIR *event_dir; - int ret = 0; event_dir = opendir(dir); if (!event_dir) return -EINVAL; - while (!ret && (evt_ent = readdir(event_dir))) { + while ((evt_ent = readdir(event_dir))) { char path[PATH_MAX]; char *name = evt_ent->d_name; FILE *file; @@ -306,17 +305,19 @@ static int pmu_aliases_parse(char *dir, struct list_head *head) snprintf(path, PATH_MAX, "%s/%s", dir, name); - ret = -EINVAL; file = fopen(path, "r"); - if (!file) - break; + if (!file) { + pr_debug("Cannot open %s\n", path); + continue; + } - ret = perf_pmu__new_alias(head, dir, name, file); + if (perf_pmu__new_alias(head, dir, name, file) < 0) + pr_debug("Cannot set up %s\n", name); fclose(file); } closedir(event_dir); - return ret; + return 0; } /* -- cgit v0.10.2 From d1130686f463e6feff19196475c3c15b1923c525 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Feb 2016 16:18:37 -0300 Subject: perf help: No need to use strbuf_remove() It is the only user of this function, just use the strlen() to skip the prefix. 
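The change below boils down to plain pointer arithmetic past a known prefix. A self-contained sketch of the same idea (the buffer contents here are made up, not the real emacsclient output):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
            /* stepping past the fixed prefix replaces the in-place
             * strbuf_remove(); atoi() stops at the first non-digit after
             * the number, so "24.5" yields 24 */
            const char buffer[] = "emacsclient 24.5";
            int version = atoi(buffer + strlen("emacsclient"));

            printf("version = %d\n", version);      /* prints 24 */
            return 0;
    }
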
Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-blao710l5cd5hmwrhy51ftgq@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c index 96c1a4c..f4dd2b4 100644 --- a/tools/perf/builtin-help.c +++ b/tools/perf/builtin-help.c @@ -86,8 +86,7 @@ static int check_emacsclient_version(void) return -1; } - strbuf_remove(&buffer, 0, strlen("emacsclient")); - version = atoi(buffer.buf); + version = atoi(buffer.buf + strlen("emacsclient")); if (version < 22) { fprintf(stderr, -- cgit v0.10.2 From bea2400621836b028d82c3d6a74053921d70dbd7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Feb 2016 16:21:04 -0300 Subject: perf tools: Remove strbuf_{remove,splice}() No users, nuke them. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-kfv2wo8xann8t97wdalttcx7@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/strbuf.c b/tools/perf/util/strbuf.c index 25671fa..d3d2792 100644 --- a/tools/perf/util/strbuf.c +++ b/tools/perf/util/strbuf.c @@ -51,30 +51,6 @@ void strbuf_grow(struct strbuf *sb, size_t extra) ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc); } -static void strbuf_splice(struct strbuf *sb, size_t pos, size_t len, - const void *data, size_t dlen) -{ - if (pos + len < pos) - die("you want to use way too much memory"); - if (pos > sb->len) - die("`pos' is too far after the end of the buffer"); - if (pos + len > sb->len) - die("`pos + len' is too far after the end of the buffer"); - - if (dlen >= len) - strbuf_grow(sb, dlen - len); - memmove(sb->buf + pos + dlen, - sb->buf + pos + len, - sb->len - pos - len); - memcpy(sb->buf + pos, data, dlen); - strbuf_setlen(sb, sb->len + dlen - len); -} - -void strbuf_remove(struct strbuf *sb, size_t pos, size_t len) -{ - strbuf_splice(sb, pos, len, NULL, 0); -} - void strbuf_add(struct strbuf *sb, const void *data, size_t len) { strbuf_grow(sb, len); diff --git a/tools/perf/util/strbuf.h b/tools/perf/util/strbuf.h index 529f2f0..7a32c83 100644 --- a/tools/perf/util/strbuf.h +++ b/tools/perf/util/strbuf.h @@ -77,8 +77,6 @@ static inline void strbuf_addch(struct strbuf *sb, int c) { sb->buf[sb->len] = '\0'; } -extern void strbuf_remove(struct strbuf *, size_t pos, size_t len); - extern void strbuf_add(struct strbuf *, const void *, size_t); static inline void strbuf_addstr(struct strbuf *sb, const char *s) { strbuf_add(sb, s, strlen(s)); -- cgit v0.10.2 From 905e36ae172c83a30894a3adefab7d4f850fcf54 Mon Sep 17 00:00:00 2001 From: Matti Gottlieb Date: Sun, 14 Feb 2016 17:05:39 +0200 Subject: iwlwifi: mvm: Fix paging memory leak If the opmode is stopped and started again we did not free the paging buffers. Fix that. In addition when freeing the firmware's paging download buffer, set the pointer to NULL. 
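The second half of the fix is the common free-and-clear pattern: once the pointer is set to NULL after freeing, a second pass through the teardown path becomes a harmless no-op instead of a double free. A minimal sketch outside the driver, with plain free() standing in for kfree() and hypothetical names:

    #include <stdlib.h>

    struct fw_ctx {
            void *paging_download_buf;
    };

    static void fw_free_paging(struct fw_ctx *ctx)
    {
            free(ctx->paging_download_buf); /* free(NULL) is defined as a no-op */
            ctx->paging_download_buf = NULL; /* so running teardown twice is safe */
    }

    int main(void)
    {
            struct fw_ctx ctx = { .paging_download_buf = malloc(4096) };

            fw_free_paging(&ctx);
            fw_free_paging(&ctx);   /* second call: nothing left to free, no crash */
            return 0;
    }
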
Signed-off-by: Matti Gottlieb Signed-off-by: Emmanuel Grumbach diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c index 4ed5180..0ccc697 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/fw.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/fw.c @@ -107,7 +107,7 @@ static int iwl_send_tx_ant_cfg(struct iwl_mvm *mvm, u8 valid_tx_ant) sizeof(tx_ant_cmd), &tx_ant_cmd); } -static void iwl_free_fw_paging(struct iwl_mvm *mvm) +void iwl_free_fw_paging(struct iwl_mvm *mvm) { int i; @@ -127,6 +127,8 @@ static void iwl_free_fw_paging(struct iwl_mvm *mvm) get_order(mvm->fw_paging_db[i].fw_paging_size)); } kfree(mvm->trans->paging_download_buf); + mvm->trans->paging_download_buf = NULL; + memset(mvm->fw_paging_db, 0, sizeof(mvm->fw_paging_db)); } diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h index 5f3ac8c..ff7c6df 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h @@ -1225,6 +1225,9 @@ void iwl_mvm_rx_umac_scan_complete_notif(struct iwl_mvm *mvm, void iwl_mvm_rx_umac_scan_iter_complete_notif(struct iwl_mvm *mvm, struct iwl_rx_cmd_buffer *rxb); +/* Paging */ +void iwl_free_fw_paging(struct iwl_mvm *mvm); + /* MVM debugfs */ #ifdef CONFIG_IWLWIFI_DEBUGFS int iwl_mvm_dbgfs_register(struct iwl_mvm *mvm, struct dentry *dbgfs_dir); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c index 89ea70d..e80be9a 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c @@ -684,6 +684,8 @@ static void iwl_op_mode_mvm_stop(struct iwl_op_mode *op_mode) for (i = 0; i < NVM_MAX_NUM_SECTIONS; i++) kfree(mvm->nvm_sections[i].data); + iwl_free_fw_paging(mvm); + iwl_mvm_tof_clean(mvm); ieee80211_free_hw(mvm->hw); -- cgit v0.10.2 From fb5eb24cdd5cdb83be77d3e4b2f16e92e06bd9e9 Mon Sep 17 00:00:00 2001 From: Patrik Halfar Date: Sat, 20 Feb 2016 18:49:40 +0100 Subject: Add Dell Wireless 5809e Gobi 4G HSPA+ Mobile Broadband Card (rev3) to qmi_wwan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New revison of Dell Wireless 5809e Gobi 4G HSPA+ Mobile Broadband Card has new idProduct Bus 002 Device 006: ID 413c:81b3 Dell Computer Corp. Device Descriptor: bLength 18 bDescriptorType 1 bcdUSB 2.00 bDeviceClass 0 bDeviceSubClass 0 bDeviceProtocol 0 bMaxPacketSize0 64 idVendor 0x413c Dell Computer Corp. idProduct 0x81b3 bcdDevice 0.06 iManufacturer 1 Sierra Wireless, Incorporated iProduct 2 Dell Wireless 5809e Gobi™ 4G HSPA+ Mobile Broadband Card iSerial 3 bNumConfigurations 2 Signed-off-by: Patrik Halfar Signed-off-by: David S. 
Miller diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 570deef..4ce4ec1 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -885,6 +885,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x413c, 0x81a8, 8)}, /* Dell Wireless 5808 Gobi(TM) 4G LTE Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81a9, 8)}, /* Dell Wireless 5808e Gobi(TM) 4G LTE Mobile Broadband Card */ {QMI_FIXED_INTF(0x413c, 0x81b1, 8)}, /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card */ + {QMI_FIXED_INTF(0x413c, 0x81b3, 8)}, /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card (rev3) */ {QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)}, /* HP lt4111 LTE/EV-DO/HSPA+ Gobi 4G Module */ {QMI_FIXED_INTF(0x22de, 0x9061, 3)}, /* WeTelecom WPD-600N */ {QMI_FIXED_INTF(0x1e0e, 0x9001, 5)}, /* SIMCom 7230E */ -- cgit v0.10.2 From 9bdfb3b79e61c60e1a3e2dc05ad164528afa6b8a Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sun, 21 Feb 2016 10:12:39 +0300 Subject: tcp: convert cached rtt from usec to jiffies when feeding initial rto Currently it's converted into msecs, thus HZ=1000 intact. Signed-off-by: Konstantin Khlebnikov Fixes: 740b0f1841f6 ("tcp: switch rtt estimations to usec resolution") Signed-off-by: David S. Miller diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index c8cbc2b..a726d78 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -550,7 +550,7 @@ reset: */ if (crtt > tp->srtt_us) { /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ - crtt /= 8 * USEC_PER_MSEC; + crtt /= 8 * USEC_PER_SEC / HZ; inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); } else if (tp->srtt_us == 0) { /* RFC6298: 5.7 We've failed to get a valid RTT sample from -- cgit v0.10.2 From 5146d1f151122e868e594c7b45115d64825aee5f Mon Sep 17 00:00:00 2001 From: Bernie Harris Date: Mon, 22 Feb 2016 12:58:05 +1300 Subject: tunnel: Clear IPCB(skb)->opt before dst_link_failure called IPCB may contain data from previous layers (in the observed case the qdisc layer). In the observed scenario, the data was misinterpreted as ip header options, which later caused the ihl to be set to an invalid value (<5). This resulted in an infinite loop in the mips implementation of ip_fast_csum. This patch clears IPCB(skb)->opt before dst_link_failure can be called for various types of tunnels. This change only applies to encapsulated ipv4 packets. The code introduced in 11c21a30 which clears all of IPCB has been removed to be consistent with these changes, and instead the opt field is cleared unconditionally in ip_tunnel_xmit. The change in ip_tunnel_xmit applies to SIT, GRE, and IPIP tunnels. The relevant vti, l2tp, and pptp functions already contain similar code for clearing the IPCB. Signed-off-by: Bernie Harris Signed-off-by: David S. 
Miller diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 89e8861..336e689 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -661,6 +661,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, inner_iph = (const struct iphdr *)skb_inner_network_header(skb); connected = (tunnel->parms.iph.daddr != 0); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + dst = tnl_params->daddr; if (dst == 0) { /* NBMA tunnel */ @@ -758,7 +760,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); dst_link_failure(skb); } else tunnel->err_count = 0; diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 0ec0881..96599d1 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -89,6 +89,8 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb uh->source = src_port; uh->len = htons(skb->len); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + udp_set_csum(nocheck, skb, src, dst, skb->len); iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index a69aad1..c0d4dc1 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -777,6 +777,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) __u32 mtu; int err; + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 137fca4..6c5dfec 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1180,6 +1180,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + tproto = ACCESS_ONCE(t->parms.proto); if (tproto != IPPROTO_IPIP && tproto != 0) return -1; -- cgit v0.10.2 From fbb0fa8b48892a3db8f5b89fb591c741fbd2fe7a Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 22 Feb 2016 02:10:26 -0500 Subject: bnxt_en: Fix zero padding of tx push data. The arithmetic to zero pad the last 64-bit word in the push buffer is not correct. 1. It should be pdata + length to get to the end. 2. 'pdata' is void pointer and passing it to PTR_ALIGN() will cast the aligned pointer to void. Pass 'end' which is u64 pointer to PTR_ALIGN() instead so that the aligned pointer - 1 is the last 64-bit pointer to data. Signed-off-by: Michael Chan Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 8ab000d..82f1913 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -248,7 +248,8 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) tx_push1->tx_bd_cfa_meta = cpu_to_le32(vlan_tag_flags); tx_push1->tx_bd_cfa_action = cpu_to_le32(cfa_action); - end = PTR_ALIGN(pdata + length + 1, 8) - 1; + end = pdata + length; + end = PTR_ALIGN(end, 8) - 1; *end = 0; skb_copy_from_linear_data(skb, pdata, len); -- cgit v0.10.2 From da9b9303ed8d1673a89a4bdd85464e33614775e3 Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Mon, 22 Feb 2016 23:35:44 +0100 Subject: ASoC: wm9713: fix regmap free path In the conversion to regmap, I assumed that the devm_() variant could be used in the soc probe function. 
As a mater of fact with the current code the regmap is freed twice because of the devm_() call: (mutex_lock) from [] (debugfs_remove_recursive+0x50/0x1d0) (debugfs_remove_recursive) from [] (regmap_debugfs_exit+0x1c/0xd4) (regmap_debugfs_exit) from [] (regmap_exit+0x28/0xc8) (regmap_exit) from [] (release_nodes+0x18c/0x204) (release_nodes) from [] (device_release+0x18/0x90) (device_release) from [] (kobject_release+0x90/0x1bc) (kobject_release) from [] (wm9713_soc_remove+0x1c/0x24) (wm9713_soc_remove) from [] (soc_remove_component+0x50/0x7c) (soc_remove_component) from [] (soc_remove_dai_links+0x118/0x228) (soc_remove_dai_links) from [] (snd_soc_register_card+0x4e4/0xdd4) (snd_soc_register_card) from [] (devm_snd_soc_register_card+0x34/0x70) Fix this by replacing the devm_regmap initialization code with the non devm_() variant. Fixes: 700dadfefc3d ASoC: wm9713: convert to regmap Signed-off-by: Robert Jarzmik Acked-by: Charles Keepax Signed-off-by: Mark Brown diff --git a/sound/soc/codecs/wm9713.c b/sound/soc/codecs/wm9713.c index 79e1436..9849643 100644 --- a/sound/soc/codecs/wm9713.c +++ b/sound/soc/codecs/wm9713.c @@ -1212,7 +1212,7 @@ static int wm9713_soc_probe(struct snd_soc_codec *codec) if (IS_ERR(wm9713->ac97)) return PTR_ERR(wm9713->ac97); - regmap = devm_regmap_init_ac97(wm9713->ac97, &wm9713_regmap_config); + regmap = regmap_init_ac97(wm9713->ac97, &wm9713_regmap_config); if (IS_ERR(regmap)) { snd_soc_free_ac97_codec(wm9713->ac97); return PTR_ERR(regmap); -- cgit v0.10.2 From bb53e416e02cd5406d5e1dbdd89432ff5875e982 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Dec 2015 09:30:12 -0800 Subject: rcu: Assign false instead of 0 for ->core_needs_qs A zero seems to have escaped earlier true/false substitution efforts, so this commit changes 0 to false for the ->core_needs_qs boolean field. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e41dd41..b8bbf3b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2390,7 +2390,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { - rdp->core_needs_qs = 0; + rdp->core_needs_qs = false; /* * This GP can't end until cpu checks in, so all of our -- cgit v0.10.2 From 8994515cf0030e5020d67bb837a9d9a92441d0f7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Dec 2015 09:59:43 -0800 Subject: rcu: Update rcu_report_qs_rsp() comment The header comment for rcu_report_qs_rsp() was obsolete, dating well before the advent of RCU grace-period kthreads. This commit therefore brings this comment back into alignment with current reality. Reported-by: Lihao Liang Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b8bbf3b..a918368 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2234,11 +2234,13 @@ static bool rcu_start_gp(struct rcu_state *rsp) } /* - * Report a full set of quiescent states to the specified rcu_state - * data structure. This involves cleaning up after the prior grace - * period and letting rcu_start_gp() start up the next grace period - * if one is needed. Note that the caller must hold rnp->lock, which - * is released before return. + * Report a full set of quiescent states to the specified rcu_state data + * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period + * kthread if another grace period is required. 
Whether we wake + * the grace-period kthread or it awakens itself for the next round + * of quiescent-state forcing, that kthread will clean up after the + * just-completed grace period. Note that the caller must hold rnp->lock, + * which is released before return. */ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) -- cgit v0.10.2 From 4914950aaa12de8a2004b8031b45480526caf42b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Dec 2015 13:48:43 -0800 Subject: rcu: Stop treating in-kernel CPU-bound workloads as errors Commit 4a81e8328d379 ("Reduce overhead of cond_resched() checks for RCU") handles the error case where a nohz_full loops indefinitely in the kernel with the scheduling-clock interrupt disabled. However, this handling includes IPIing the CPU running the offending loop, which is not what we want for real-time workloads. And there are starting to be real-time CPU-bound in-kernel workloads, and these must be handled without IPIing the CPU, at least not in the common case. Therefore, this situation can no longer be dismissed as an error case. This commit therefore splits the handling out, so that the setting of bits in the per-CPU rcu_sched_qs_mask variable is done relatively early, but if the problem persists, resched_cpu() is eventually used to IPI the CPU containing the offending loop. Assuming that in-kernel CPU-bound loops used by real-time tasks contain frequent calls cond_resched_rcu_qs() (as in more than once per few tens of milliseconds), the real-time tasks will never be IPIed. Signed-off-by: Paul E. McKenney Cc: Steven Rostedt diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a918368..68f4bee 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1173,15 +1173,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, smp_mb(); /* ->cond_resched_completed before *rcrmp. */ WRITE_ONCE(*rcrmp, READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); - resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ - rdp->rsp->jiffies_resched += 5; /* Enable beating. */ - } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { - /* Time to beat on that CPU again! */ - resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ - rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } + rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } + /* And if it has been a really long time, kick the CPU as well. */ + if (ULONG_CMP_GE(jiffies, + rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) || + ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs)) + resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ + return 0; } -- cgit v0.10.2 From 23a9bacd357ab8388c7e07101b186a4282874992 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 13 Dec 2015 08:57:10 -0800 Subject: rcu: Set rdp->gpwrap when CPU is idle Commit #e3663b1024d1 ("rcu: Handle gpnum/completed wrap while dyntick idle") sets rdp->gpwrap on the wrong side of the "if" statement in dyntick_save_progress_counter(), that is, it sets it when the CPU is not idle instead of when it is idle. Of course, if the CPU is not idle, its rdp->gpnum won't be lagging beind the global rsp->gpnum, which means that rdp->gpwrap will never be set. This commit therefore moves this code to the proper leg of that "if" statement. This change means that the "else" cause is just "return 0" and the "then" clause ends with "return 1", so also move the "return 0" to follow the "if", dropping the "else" clause. Signed-off-by: Paul E. 
McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 68f4bee..976a166 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1083,13 +1083,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, rcu_sysidle_check_cpu(rdp, isidle, maxj); if ((rdp->dynticks_snap & 0x1) == 0) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); - return 1; - } else { if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rdp->mynode->gpnum)) WRITE_ONCE(rdp->gpwrap, true); - return 0; + return 1; } + return 0; } /* -- cgit v0.10.2 From aa5a8988760142fe33e38790a578bd84ddf3e339 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 31 Dec 2015 16:27:06 -0800 Subject: rcutorture: Correct no-expedite console messages The "Disabled dynamic grace-period expediting" console message is currently printed unconditionally. This commit causes it to be output only when it is impossible to switch between normal and expedited grace periods, which was the original intent. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d2988d0..65ae0e5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -932,12 +932,14 @@ rcu_torture_writer(void *arg) int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); - pr_alert("%s" TORTURE_FLAG - " Grace periods expedited from boot/sysfs for %s,\n", - torture_type, cur_ops->name); - pr_alert("%s" TORTURE_FLAG - " Testing of dynamic grace-period expediting diabled.\n", - torture_type); + if (!can_expedite) { + pr_alert("%s" TORTURE_FLAG + " Grace periods expedited from boot/sysfs for %s,\n", + torture_type, cur_ops->name); + pr_alert("%s" TORTURE_FLAG + " Disabled dynamic grace-period expediting.\n", + torture_type); + } /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) -- cgit v0.10.2 From 1914aab54319ed70608078df4f18ac4767753977 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Sat, 26 Dec 2015 21:41:44 +0800 Subject: rcu: Remove useless rcu_data_p when !PREEMPT_RCU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The related warning from gcc 6.0: In file included from kernel/rcu/tree.c:4630:0: kernel/rcu/tree_plugin.h:810:40: warning: ‘rcu_data_p’ defined but not used [-Wunused-const-variable] static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data; ^~~~~~~~~~ Also remove always redundant rcu_data_p in tree.c. Signed-off-by: Chen Gang Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 976a166..1e44fce 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -108,7 +108,6 @@ RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); static struct rcu_state *const rcu_state_p; -static struct rcu_data __percpu *const rcu_data_p; LIST_HEAD(rcu_struct_flavors); /* Dump rcu_node combining tree at boot to verify correct setup. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 9467a8b..e6b88ad 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -807,7 +807,6 @@ void exit_rcu(void) #else /* #ifdef CONFIG_PREEMPT_RCU */ static struct rcu_state *const rcu_state_p = &rcu_sched_state; -static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data; /* * Tell them what RCU they are running. 
-- cgit v0.10.2 From ad315455d396a1cbcb2f9fdd687b7e1b26b789e7 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 29 Dec 2015 12:18:46 +0800 Subject: sparse: Add __private to privatize members of structs In C programming language, we don't have a easy way to privatize a member of a structure. However in kernel, sometimes there is a need to privatize a member in case of potential bugs or misuses. Fortunately, the noderef attribute of sparse is a way to privatize a member, as by defining a member as noderef, the address-of operator on the member will produce a noderef pointer to that member, and if anyone wants to dereference that kind of pointers to read or modify the member, sparse will yell. Based on this, __private modifier and related operation ACCESS_PRIVATE() are introduced, which could help detect undesigned public uses of private members of structs. Here is an example of sparse's output if it detect an undersigned public use: | kernel/rcu/tree.c:4453:25: warning: incorrect type in argument 1 (different modifiers) | kernel/rcu/tree.c:4453:25: expected struct raw_spinlock [usertype] *lock | kernel/rcu/tree.c:4453:25: got struct raw_spinlock [noderef] * Also, this patch improves compiler.h a little bit by adding comments for "#else" and "#endif". Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 00b042c..c845356 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -20,12 +20,14 @@ # define __pmem __attribute__((noderef, address_space(5))) #ifdef CONFIG_SPARSE_RCU_POINTER # define __rcu __attribute__((noderef, address_space(4))) -#else +#else /* CONFIG_SPARSE_RCU_POINTER */ # define __rcu -#endif +#endif /* CONFIG_SPARSE_RCU_POINTER */ +# define __private __attribute__((noderef)) extern void __chk_user_ptr(const volatile void __user *); extern void __chk_io_ptr(const volatile void __iomem *); -#else +# define ACCESS_PRIVATE(p, member) (*((typeof((p)->member) __force *) &(p)->member)) +#else /* __CHECKER__ */ # define __user # define __kernel # define __safe @@ -44,7 +46,9 @@ extern void __chk_io_ptr(const volatile void __iomem *); # define __percpu # define __rcu # define __pmem -#endif +# define __private +# define ACCESS_PRIVATE(p, member) ((p)->member) +#endif /* __CHECKER__ */ /* Indirect macros required for expanded argument pasting, eg. __LINE__. */ #define ___PASTE(a,b) a##b diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 0147c91..874132b 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -269,7 +269,8 @@ our $Sparse = qr{ __init_refok| __kprobes| __ref| - __rcu + __rcu| + __private }x; our $InitAttributePrefix = qr{__(?:mem|cpu|dev|net_|)}; our $InitAttributeData = qr{$InitAttributePrefix(?:initdata\b)}; -- cgit v0.10.2 From 67c583a7de3433a971983490b37ad2bff3c55463 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 29 Dec 2015 12:18:47 +0800 Subject: RCU: Privatize rcu_node::lock In patch: "rcu: Add transitivity to remaining rcu_node ->lock acquisitions" All locking operations on rcu_node::lock are replaced with the wrappers because of the need of transitivity, which indicates we should never write code using LOCK primitives alone(i.e. without a proper barrier following) on rcu_node::lock outside those wrappers. We could detect this kind of misuses on rcu_node::lock in the future by adding __private modifier on rcu_node::lock. To privatize rcu_node::lock, unlock wrappers are also needed. 
Replacing spinlock unlocks with these wrappers not only privatizes rcu_node::lock but also makes it easier to figure out critical sections of rcu_node. This patch adds __private modifier to rcu_node::lock and makes every access to it wrapped by ACCESS_PRIVATE(). Besides, unlock wrappers are added and raw_spin_unlock(&rnp->lock) and its friends are replaced with those wrappers. Signed-off-by: Boqun Feng Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1e44fce..d5a6d2f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1245,7 +1245,7 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) if (rnp->qsmask & (1UL << cpu)) dump_cpu_task(rnp->grplo + cpu); } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1265,12 +1265,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) raw_spin_lock_irqsave_rcu_node(rnp, flags); delta = jiffies - READ_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* * OK, time to rat on our buddy... @@ -1291,7 +1291,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) ndetected++; } } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } print_cpu_stall_info_end(); @@ -1356,7 +1356,7 @@ static void print_cpu_stall(struct rcu_state *rsp) if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* * Attempt to revive the RCU machinery by forcing a context switch. @@ -1594,7 +1594,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, } unlock_out: if (rnp != rnp_root) - raw_spin_unlock(&rnp_root->lock); + raw_spin_unlock_rcu_node(rnp_root); out: if (c_out != NULL) *c_out = c; @@ -1814,7 +1814,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) return; } needwake = __note_gp_changes(rsp, rnp, rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rsp); } @@ -1839,7 +1839,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) raw_spin_lock_irq_rcu_node(rnp); if (!READ_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); return false; } WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ @@ -1849,7 +1849,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) * Grace period already in progress, don't start another. * Not supposed to be able to happen. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); return false; } @@ -1858,7 +1858,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) /* Record GP times before starting GP, hence smp_store_release(). 
*/ smp_store_release(&rsp->gpnum, rsp->gpnum + 1); trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); /* * Apply per-leaf buffered online and offline operations to the @@ -1872,7 +1872,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); continue; } @@ -1906,7 +1906,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) rcu_cleanup_dead_rnp(rnp); } - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } /* @@ -1937,7 +1937,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) trace_rcu_grace_period_init(rsp->name, rnp->gpnum, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); } @@ -1995,7 +1995,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) raw_spin_lock_irq_rcu_node(rnp); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } } @@ -2024,7 +2024,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); /* * Propagate new ->completed value to rcu_node structures so @@ -2045,7 +2045,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); rcu_gp_slow(rsp, gp_cleanup_delay); @@ -2067,7 +2067,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) READ_ONCE(rsp->gpnum), TPS("newreq")); } - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } /* @@ -2246,7 +2246,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); - raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); rcu_gp_kthread_wake(rsp); } @@ -2276,7 +2276,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * Our bit has already been cleared, or the * relevant grace period is already over, so done. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ @@ -2288,7 +2288,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { /* Other bits still set at this level, so done. 
*/ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } mask = rnp->grpmask; @@ -2298,7 +2298,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, break; } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rnp_c = rnp; rnp = rnp->parent; raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -2330,7 +2330,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; /* Still need more quiescent states! */ } @@ -2347,7 +2347,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, /* Report up the rest of the hierarchy, tracking current ->gpnum. */ gps = rnp->gpnum; mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); } @@ -2384,12 +2384,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) */ rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } mask = rdp->grpmask; if ((rnp->qsmask & mask) == 0) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { rdp->core_needs_qs = false; @@ -2600,10 +2600,11 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) rnp->qsmaskinit &= ~mask; rnp->qsmask &= ~mask; if (rnp->qsmaskinit) { - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); + /* irqs remain disabled. */ return; } - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ } } @@ -2626,7 +2627,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rnp->qsmaskinitnext &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -2860,7 +2861,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); } else { /* Nothing to do here, so just drop the lock. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } } @@ -2896,11 +2897,11 @@ static void force_quiescent_state(struct rcu_state *rsp) raw_spin_unlock(&rnp_old->fqslock); if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; - raw_spin_unlock_irqrestore(&rnp_old->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); return; /* Someone beat us to it. */ } WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); - raw_spin_unlock_irqrestore(&rnp_old->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); rcu_gp_kthread_wake(rsp); } @@ -2926,7 +2927,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) if (cpu_needs_another_gp(rsp, rdp)) { raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. 
*/ needwake = rcu_start_gp(rsp); - raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); if (needwake) rcu_gp_kthread_wake(rsp); } else { @@ -3017,7 +3018,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, raw_spin_lock_rcu_node(rnp_root); needwake = rcu_start_gp(rsp); - raw_spin_unlock(&rnp_root->lock); + raw_spin_unlock_rcu_node(rnp_root); if (needwake) rcu_gp_kthread_wake(rsp); } else { @@ -3437,14 +3438,14 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmaskinit == rnp->expmaskinitnext) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); continue; /* No new CPUs, nothing to do. */ } /* Update this node's mask, track old value for propagation. */ oldmask = rnp->expmaskinit; rnp->expmaskinit = rnp->expmaskinitnext; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* If was already nonzero, nothing to propagate. */ if (oldmask) @@ -3459,7 +3460,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) if (rnp_up->expmaskinit) done = true; rnp_up->expmaskinit |= mask; - raw_spin_unlock_irqrestore(&rnp_up->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); if (done) break; mask = rnp_up->grpmask; @@ -3482,7 +3483,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); rnp->expmask = rnp->expmaskinit; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -3523,11 +3524,11 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, if (!rnp->expmask) rcu_initiate_boost(rnp, flags); else - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); break; } if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ wake_up(&rsp->expedited_wq); @@ -3535,7 +3536,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, break; } mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ rnp = rnp->parent; raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ WARN_ON_ONCE(!(rnp->expmask & mask)); @@ -3570,7 +3571,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!(rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } rnp->expmask &= ~mask; @@ -3731,7 +3732,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, */ if (rcu_preempt_has_tasks(rnp)) rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. 
*/ mask = 1; @@ -3748,7 +3749,7 @@ retry_ipi: raw_spin_lock_irqsave_rcu_node(rnp, flags); if (cpu_online(cpu) && (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); schedule_timeout_uninterruptible(1); if (cpu_online(cpu) && (rnp->expmask & mask)) @@ -3757,7 +3758,7 @@ retry_ipi: } if (!(rnp->expmask & mask)) mask_ofl_ipi &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Report quiescent states for those that went offline. */ mask_ofl_test |= mask_ofl_ipi; @@ -4164,7 +4165,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) return; raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ rnp->qsmaskinit |= mask; - raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */ } } @@ -4188,7 +4189,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->rsp = rsp; mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -4216,7 +4217,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rcu_sysidle_init_percpu_data(rdp->dynticks); atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* * Add CPU to leaf rcu_node pending-online bitmask. Any needed @@ -4237,7 +4238,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } static void rcu_prepare_cpu(int cpu) @@ -4359,7 +4360,7 @@ static int __init rcu_spawn_gp_kthread(void) sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); } rcu_spawn_nocb_kthreads(); @@ -4450,8 +4451,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) cpustride *= levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < levelcnt[i]; j++, rnp++) { - raw_spin_lock_init(&rnp->lock); - lockdep_set_class_and_name(&rnp->lock, + raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); + lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), &rcu_node_class[i], buf[i]); raw_spin_lock_init(&rnp->fqslock); lockdep_set_class_and_name(&rnp->fqslock, diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 83360b4..4886d6a 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -149,8 +149,9 @@ struct rcu_dynticks { * Definition for node within the RCU grace-period-detection hierarchy. */ struct rcu_node { - raw_spinlock_t lock; /* Root rcu_node's lock protects some */ - /* rcu_state fields as well as following. */ + raw_spinlock_t __private lock; /* Root rcu_node's lock protects */ + /* some rcu_state fields as well as */ + /* following. */ unsigned long gpnum; /* Current grace period for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ @@ -680,7 +681,7 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #endif /* #else #ifdef CONFIG_PPC */ /* - * Wrappers for the rcu_node::lock acquire. 
+ * Wrappers for the rcu_node::lock acquire and release. * * Because the rcu_nodes form a tree, the tree traversal locking will observe * different lock values, this in turn means that an UNLOCK of one level @@ -689,29 +690,48 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) * * In order to restore full ordering between tree levels, augment the regular * lock acquire functions with smp_mb__after_unlock_lock(). + * + * As ->lock of struct rcu_node is a __private field, therefore one should use + * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. */ static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) { - raw_spin_lock(&rnp->lock); + raw_spin_lock(&ACCESS_PRIVATE(rnp, lock)); smp_mb__after_unlock_lock(); } +static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp) +{ + raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock)); +} + static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) { - raw_spin_lock_irq(&rnp->lock); + raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock)); smp_mb__after_unlock_lock(); } -#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ -do { \ - typecheck(unsigned long, flags); \ - raw_spin_lock_irqsave(&(rnp)->lock, flags); \ - smp_mb__after_unlock_lock(); \ +static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp) +{ + raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock)); +} + +#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \ } while (0) static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) { - bool locked = raw_spin_trylock(&rnp->lock); + bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock)); if (locked) smp_mb__after_unlock_lock(); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e6b88ad..43e9b4a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -235,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; - raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */ /* * Report the quiescent state for the expedited GP. This expedited @@ -489,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t) !!rnp->gp_tasks); rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags); } else { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Unboost if we were boosted. 
*/ @@ -518,14 +518,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) sched_show_task(t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -990,7 +990,7 @@ static int rcu_boost(struct rcu_node *rnp) * might exit their RCU read-side critical sections on their own. */ if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return 0; } @@ -1027,7 +1027,7 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ @@ -1087,7 +1087,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { rnp->n_balk_exp_gp_tasks++; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } if (rnp->exp_tasks != NULL || @@ -1097,13 +1097,13 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) ULONG_CMP_GE(jiffies, rnp->boost_time))) { if (rnp->exp_tasks == NULL) rnp->boost_tasks = rnp->gp_tasks; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); t = rnp->boost_kthread_task; if (t) rcu_wake_cond(t, rnp->boost_kthread_status); } else { rcu_initiate_boost_trace(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1171,7 +1171,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, return PTR_ERR(t); raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ @@ -1307,7 +1307,7 @@ static void rcu_prepare_kthreads(int cpu) static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } static void invoke_rcu_callbacks_kthread(void) @@ -1558,7 +1558,7 @@ static void rcu_prepare_for_idle(void) rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. 
*/ if (needwake) rcu_gp_kthread_wake(rsp); } @@ -2058,7 +2058,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) raw_spin_lock_irqsave_rcu_node(rnp, flags); needwake = rcu_start_future_gp(rnp, rdp, &c); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rdp->rsp); -- cgit v0.10.2 From b354286effa52da6cb1b1f16604d41ff81b8c445 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 29 Dec 2015 12:18:48 +0800 Subject: irq: Privatize irq_common_data::state_use_accessors irq_common_data::state_use_accessors is not designed for public use. Therefore make it private so that people who write code accessing it directly will get blamed by sparse. Also #undef the macro __irqd_to_state after used in header files, so that the macro can't be misused. Signed-off-by: Boqun Feng Reviewed-by: Thomas Gleixner Signed-off-by: Paul E. McKenney diff --git a/include/linux/irq.h b/include/linux/irq.h index 3c1c967..cd14cd4 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -137,7 +137,7 @@ struct irq_domain; * @msi_desc: MSI descriptor */ struct irq_common_data { - unsigned int state_use_accessors; + unsigned int __private state_use_accessors; #ifdef CONFIG_NUMA unsigned int node; #endif @@ -208,7 +208,7 @@ enum { IRQD_FORWARDED_TO_VCPU = (1 << 20), }; -#define __irqd_to_state(d) ((d)->common->state_use_accessors) +#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) static inline bool irqd_is_setaffinity_pending(struct irq_data *d) { @@ -299,6 +299,8 @@ static inline void irqd_clr_forwarded_to_vcpu(struct irq_data *d) __irqd_to_state(d) &= ~IRQD_FORWARDED_TO_VCPU; } +#undef __irqd_to_state + static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) { return d->hwirq; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fcab63c..3d18293 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -160,6 +160,8 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) __irq_put_desc_unlock(desc, flags, false); } +#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) + /* * Manipulation functions for irq_data.state */ @@ -188,6 +190,8 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) return __irqd_to_state(d) & mask; } +#undef __irqd_to_state + static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) { __this_cpu_inc(*desc->kstat_irqs); -- cgit v0.10.2 From 9fc9204ef9fdbe550fd81a554abce2b210e63e49 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 7 Jan 2016 19:05:19 -0500 Subject: rcu: Make rcu/tiny_plugin.h explicitly non-modular The Kconfig currently controlling compilation of this code is: init/Kconfig:config TINY_RCU init/Kconfig: bool ...meaning that it currently is not being built as a module by anyone. Lets remove the modular code that is essentially orphaned, so that when reading the code there is no doubt it is builtin-only. Since module_init translates to device_initcall in the non-modular case, the init ordering remains unchanged with this commit. We could consider moving this to an earlier initcall (subsys?) if desired. We also delete the MODULE_LICENSE tag etc. since all that information is already contained at the top of the file in the comments. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. 
McKenney diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index e492a52..196f030 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -23,7 +23,7 @@ */ #include -#include +#include #include #include @@ -122,18 +122,7 @@ free_out: debugfs_remove_recursive(rcudir); return 1; } - -static void __exit rcutiny_trace_cleanup(void) -{ - debugfs_remove_recursive(rcudir); -} - -module_init(rcutiny_trace_init); -module_exit(rcutiny_trace_cleanup); - -MODULE_AUTHOR("Paul E. McKenney"); -MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); -MODULE_LICENSE("GPL"); +device_initcall(rcutiny_trace_init); static void check_cpu_stall(struct rcu_ctrlblk *rcp) { -- cgit v0.10.2 From 9de630c4f264dec48e61edd871cb98b8e1b58250 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 8 Jan 2016 07:43:50 -0800 Subject: rcu: Document unique-name limitation for DEFINE_STATIC_SRCU() SRCU uses per-CPU variables, and DEFINE_STATIC_SRCU() uses a static per-CPU variable. However, per-CPU variables have significant restrictions, for example, names of per-CPU variables must be globally unique, even if declared static. These restrictions carry over to DEFINE_STATIC_SRCU(), and this commit therefore documents these restrictions. Reported-by: Stephen Rothwell Reported-by: kbuild test robot Suggested-by: Boqun Feng Signed-off-by: Paul E. McKenney Reviewed-by: Tejun Heo diff --git a/include/linux/srcu.h b/include/linux/srcu.h index f5f80c5..dc8eb63 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -99,8 +99,23 @@ void process_srcu(struct work_struct *work); } /* - * define and init a srcu struct at build time. - * dont't call init_srcu_struct() nor cleanup_srcu_struct() on it. + * Define and initialize a srcu struct at build time. + * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. + * + * Note that although DEFINE_STATIC_SRCU() hides the name from other + * files, the per-CPU variable rules nevertheless require that the + * chosen name be globally unique. These rules also prohibit use of + * DEFINE_STATIC_SRCU() within a function. If these rules are too + * restrictive, declare the srcu_struct manually. For example, in + * each file: + * + * static struct srcu_struct my_srcu; + * + * Then, before the first use of each my_srcu, manually initialize it: + * + * init_srcu_struct(&my_srcu); + * + * See include/linux/percpu-defs.h for the rules on per-CPU variables. */ #define __DEFINE_SRCU(name, is_static) \ static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ -- cgit v0.10.2 From 4b455dc3e13064795ef2cd3415132df747e64063 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Jan 2016 22:44:45 -0800 Subject: rcu: Catch up rcu_report_qs_rdp() comment with reality Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d5a6d2f..39f9c73 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2354,12 +2354,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, /* * Record a quiescent state for the specified CPU to that CPU's rcu_data - * structure. This must be either called from the specified CPU, or - * called when the specified CPU is known to be offline (and when it is - * also known that no other CPU is concurrently trying to help the offline - * CPU). The lastcomp argument is used to make sure we are still in the - * grace period of interest. We don't want to end the current grace period - * based on quiescent states detected in an earlier grace period! + * structure. 
This must be called from the specified CPU. */ static void rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) -- cgit v0.10.2 From 3500efae4410454522697c94c23fc40323c0cee9 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Mon, 19 Oct 2015 14:45:02 -0700 Subject: rcu: Remove rcu_user_hooks_switch Because there are neither uses nor intended uses for the rcu_user_hooks_switch() function that was orginally intended for nohz use, this commit removes it. Signed-off-by: Yang Shi Acked-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 14e6f47..b5d48bd 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -360,8 +360,6 @@ void rcu_user_exit(void); #else static inline void rcu_user_enter(void) { } static inline void rcu_user_exit(void) { } -static inline void rcu_user_hooks_switch(struct task_struct *prev, - struct task_struct *next) { } #endif /* CONFIG_NO_HZ_FULL */ #ifdef CONFIG_RCU_NOCB_CPU -- cgit v0.10.2 From 4f2a848c567c72f778352d65cc7c155d1a0977fd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 1 Jan 2016 13:38:12 -0800 Subject: rcu: Export rcu_gp_is_normal() This commit exports rcu_gp_is_normal() in order to allow it to be used by rcutorture and rcuperf. Signed-off-by: Paul E. McKenney diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 76b94e1..ca828b4 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -128,6 +128,7 @@ bool rcu_gp_is_normal(void) { return READ_ONCE(rcu_normal); } +EXPORT_SYMBOL_GPL(rcu_gp_is_normal); static atomic_t rcu_expedited_nesting = ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); -- cgit v0.10.2 From 7c139db2e579669c3313f92d2dd2256b255fcc07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Irest=C3=A5l?= Date: Tue, 16 Feb 2016 13:56:41 +0100 Subject: ASoC: adau17x1: Fix incorrect BCLK ratio definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Andreas Irestål Signed-off-by: Mark Brown diff --git a/sound/soc/codecs/adau17x1.h b/sound/soc/codecs/adau17x1.h index e13583e..5ae87a0 100644 --- a/sound/soc/codecs/adau17x1.h +++ b/sound/soc/codecs/adau17x1.h @@ -103,9 +103,9 @@ bool adau17x1_has_dsp(struct adau *adau); #define ADAU17X1_CLOCK_CONTROL_CORECLK_SRC_PLL BIT(3) #define ADAU17X1_CLOCK_CONTROL_SYSCLK_EN BIT(0) -#define ADAU17X1_SERIAL_PORT1_BCLK32 (0x0 << 5) -#define ADAU17X1_SERIAL_PORT1_BCLK48 (0x1 << 5) -#define ADAU17X1_SERIAL_PORT1_BCLK64 (0x2 << 5) +#define ADAU17X1_SERIAL_PORT1_BCLK64 (0x0 << 5) +#define ADAU17X1_SERIAL_PORT1_BCLK32 (0x1 << 5) +#define ADAU17X1_SERIAL_PORT1_BCLK48 (0x2 << 5) #define ADAU17X1_SERIAL_PORT1_BCLK128 (0x3 << 5) #define ADAU17X1_SERIAL_PORT1_BCLK256 (0x4 << 5) #define ADAU17X1_SERIAL_PORT1_BCLK_MASK (0x7 << 5) -- cgit v0.10.2 From f4bcfa1da6fdcbc7a0854a28603bffc3c5656332 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Tue, 23 Feb 2016 22:39:28 +0900 Subject: sched/wait: Fix wait_event_freezable() documentation I noticed the comment label 'wait_event' was wrong. 
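Illustrative sketch (not part of the patch): a minimal caller of wait_event_freezable(), the macro whose kernel-doc heading is corrected below. The wait queue, flag and thread names are invented for the example.

#include <linux/wait.h>
#include <linux/freezer.h>
#include <linux/kthread.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static bool demo_pending;

static int demo_thread(void *arg)
{
	while (!kthread_should_stop()) {
		/* Sleep (or freeze) until there is work or we are asked to stop. */
		if (wait_event_freezable(demo_wq,
					 demo_pending || kthread_should_stop()))
			continue;	/* -ERESTARTSYS: interrupted by a signal, re-check */

		demo_pending = false;
		/* ... handle the work; wake_up(&demo_wq) happens elsewhere ... */
	}
	return 0;
}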
Signed-off-by: Stafford Horne Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1456234768-24933-1-git-send-email-shorne@gmail.com Signed-off-by: Ingo Molnar diff --git a/include/linux/wait.h b/include/linux/wait.h index ae71a76..27d7a0a 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -338,7 +338,7 @@ do { \ schedule(); try_to_freeze()) /** - * wait_event - sleep (or freeze) until a condition gets true + * wait_event_freezable - sleep (or freeze) until a condition gets true * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * -- cgit v0.10.2 From 54fbad54ebcde9db9c7459e9e379f2350c25e1f1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 24 Feb 2016 09:46:42 +0100 Subject: perf mem record: Check for memory events support Check if current kernel support available memory events and display the status within -e list option: $ perf mem record -e list ldlat-loads : available ldlat-stores : available Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1456303616-26926-2-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index b3f8a89..f1fa7b8 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -40,10 +40,11 @@ static int parse_record_events(const struct option *opt, for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) { struct perf_mem_event *e = &perf_mem_events[j]; - fprintf(stderr, "%-20s%s", - e->tag, verbose ? "" : "\n"); - if (verbose) - fprintf(stderr, " [%s]\n", e->name); + fprintf(stderr, "%-13s%-*s%s\n", + e->tag, + verbose ? 25 : 0, + verbose ? e->name : "", + e->supported ? 
": available" : ""); } exit(0); } @@ -92,6 +93,12 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) if (!perf_mem_events[j].record) continue; + if (!perf_mem_events[j].supported) { + pr_err("failed: event '%s' not supported\n", + perf_mem_events[j].name); + return -1; + } + rec_argv[i++] = "-e"; rec_argv[i++] = perf_mem_events[j].name; }; @@ -355,6 +362,11 @@ int cmd_mem(int argc, const char **argv, const char *prefix __maybe_unused) NULL }; + if (perf_mem_events__init()) { + pr_err("failed: memory events not supported\n"); + return -1; + } + argc = parse_options_subcommand(argc, argv, mem_options, mem_subcommands, mem_usage, PARSE_OPT_STOP_AT_NON_OPTION); diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index b1507c0..e21853f 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -2,15 +2,20 @@ #include #include #include +#include +#include +#include +#include #include "mem-events.h" #include "debug.h" -#define E(t, n) { .tag = t, .name = n } +#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s } struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = { - E("ldlat-loads", "cpu/mem-loads,ldlat=30/P"), - E("ldlat-stores", "cpu/mem-stores/P"), + E("ldlat-loads", "cpu/mem-loads,ldlat=30/P", "mem-loads"), + E("ldlat-stores", "cpu/mem-stores/P", "mem-stores"), }; +#undef E #undef E @@ -49,3 +54,27 @@ int perf_mem_events__parse(const char *str) pr_err("failed: event '%s' not found, use '-e list' to get list of available events\n", str); return -1; } + +int perf_mem_events__init(void) +{ + const char *mnt = sysfs__mount(); + bool found = false; + int j; + + if (!mnt) + return -ENOENT; + + for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) { + char path[PATH_MAX]; + struct perf_mem_event *e = &perf_mem_events[j]; + struct stat st; + + scnprintf(path, PATH_MAX, "%s/devices/cpu/events/%s", + mnt, e->sysfs_name); + + if (!stat(path, &st)) + e->supported = found = true; + } + + return found ? 0 : -ENOENT; +} diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index 2995bae..75c1660 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -5,8 +5,10 @@ struct perf_mem_event { bool record; + bool supported; const char *tag; const char *name; + const char *sysfs_name; }; enum { @@ -18,5 +20,6 @@ enum { extern struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX]; int perf_mem_events__parse(const char *str); +int perf_mem_events__init(void); #endif /* __PERF_MEM_EVENTS_H */ -- cgit v0.10.2 From 2ba7ac5814a6952aad647ce31696b893772cbe83 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 24 Feb 2016 09:46:43 +0100 Subject: perf mem: Introduce perf_mem_events__name function Wrap perf_mem_events[].name into perf_mem_events__name() so we could alter the events name if needed. This will be handy when changing latency settings for loads event in following patch. Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1456303616-26926-3-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index f1fa7b8..88aeac9 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -43,7 +43,7 @@ static int parse_record_events(const struct option *opt, fprintf(stderr, "%-13s%-*s%s\n", e->tag, verbose ? 25 : 0, - verbose ? e->name : "", + verbose ? perf_mem_events__name(j) : "", e->supported ? 
": available" : ""); } exit(0); @@ -95,12 +95,12 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) if (!perf_mem_events[j].supported) { pr_err("failed: event '%s' not supported\n", - perf_mem_events[j].name); + perf_mem_events__name(j)); return -1; } rec_argv[i++] = "-e"; - rec_argv[i++] = perf_mem_events[j].name; + rec_argv[i++] = perf_mem_events__name(j); }; for (j = 0; j < argc; j++, i++) diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index e21853f..2330db5 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -19,6 +19,11 @@ struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = { #undef E +char *perf_mem_events__name(int i) +{ + return (char *)perf_mem_events[i].name; +} + int perf_mem_events__parse(const char *str) { char *tok, *saveptr = NULL; diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index 75c1660..2a91b95 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -22,4 +22,5 @@ extern struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX]; int perf_mem_events__parse(const char *str); int perf_mem_events__init(void); +char *perf_mem_events__name(int i); #endif /* __PERF_MEM_EVENTS_H */ -- cgit v0.10.2 From 0c877d759d3a62a01d75dc6de4a923a686bb285a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 24 Feb 2016 09:46:46 +0100 Subject: perf tools: Introduce perf_mem__tlb_scnprintf function Move meminfo's tlb display function into mem-events.c object, so it could be reused later from script code. Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1456303616-26926-6-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index 2330db5..4be3eb7 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -8,6 +8,7 @@ #include #include "mem-events.h" #include "debug.h" +#include "symbol.h" #define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s } @@ -83,3 +84,49 @@ int perf_mem_events__init(void) return found ? 
0 : -ENOENT; } + +static const char * const tlb_access[] = { + "N/A", + "HIT", + "MISS", + "L1", + "L2", + "Walker", + "Fault", +}; + +void perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +{ + size_t l = 0, i; + u64 m = PERF_MEM_TLB_NA; + u64 hit, miss; + + sz -= 1; /* -1 for null termination */ + out[0] = '\0'; + + if (mem_info) + m = mem_info->data_src.mem_dtlb; + + hit = m & PERF_MEM_TLB_HIT; + miss = m & PERF_MEM_TLB_MISS; + + /* already taken care of */ + m &= ~(PERF_MEM_TLB_HIT|PERF_MEM_TLB_MISS); + + for (i = 0; m && i < ARRAY_SIZE(tlb_access); i++, m >>= 1) { + if (!(m & 0x1)) + continue; + if (l) { + strcat(out, " or "); + l += 4; + } + strncat(out, tlb_access[i], sz - l); + l += strlen(tlb_access[i]); + } + if (*out == '\0') + strcpy(out, "N/A"); + if (hit) + strncat(out, " hit", sz - l); + if (miss) + strncat(out, " miss", sz - l); +} diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index 2a91b95..d8fb8e1 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -23,4 +23,7 @@ int perf_mem_events__parse(const char *str); int perf_mem_events__init(void); char *perf_mem_events__name(int i); + +struct mem_info; +void perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info); #endif /* __PERF_MEM_EVENTS_H */ diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 5388f79..160df20 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -6,6 +6,7 @@ #include "evsel.h" #include "evlist.h" #include +#include "mem-events.h" regex_t parent_regex; const char default_parent_pattern[] = "^sys_|^do_page_fault"; @@ -829,53 +830,12 @@ sort__tlb_cmp(struct hist_entry *left, struct hist_entry *right) return (int64_t)(data_src_r.mem_dtlb - data_src_l.mem_dtlb); } -static const char * const tlb_access[] = { - "N/A", - "HIT", - "MISS", - "L1", - "L2", - "Walker", - "Fault", -}; - static int hist_entry__tlb_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width) { char out[64]; - size_t sz = sizeof(out) - 1; /* -1 for null termination */ - size_t l = 0, i; - u64 m = PERF_MEM_TLB_NA; - u64 hit, miss; - - out[0] = '\0'; - - if (he->mem_info) - m = he->mem_info->data_src.mem_dtlb; - - hit = m & PERF_MEM_TLB_HIT; - miss = m & PERF_MEM_TLB_MISS; - - /* already taken care of */ - m &= ~(PERF_MEM_TLB_HIT|PERF_MEM_TLB_MISS); - - for (i = 0; m && i < ARRAY_SIZE(tlb_access); i++, m >>= 1) { - if (!(m & 0x1)) - continue; - if (l) { - strcat(out, " or "); - l += 4; - } - strncat(out, tlb_access[i], sz - l); - l += strlen(tlb_access[i]); - } - if (*out == '\0') - strcpy(out, "N/A"); - if (hit) - strncat(out, " hit", sz - l); - if (miss) - strncat(out, " miss", sz - l); + perf_mem__tlb_scnprintf(out, sizeof(out), he->mem_info); return repsep_snprintf(bf, size, "%-*s", width, out); } -- cgit v0.10.2 From 071e9a1e12dceaec6f9d3ffe6e77ee68364166d6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 24 Feb 2016 09:46:47 +0100 Subject: perf tools: Introduce perf_mem__lvl_scnprintf function Move meminfo's lvl display function into mem-events.c object, so it could be reused later from script code. 
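Illustrative usage sketch (not part of the patch): decoding the memory-level bits of one sample with the relocated helper. The 'sample' pointer and buffer size are assumed for the example; a later patch in this series builds a struct mem_info the same way from a raw data_src value.

char out[64];
struct mem_info mi = { .data_src.val = sample->data_src };	/* 'sample' assumed */

perf_mem__lvl_scnprintf(out, sizeof(out), &mi);
printf("memory level: %s\n", out);	/* e.g. "L1 hit" or "Local RAM hit" */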
Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1456303616-26926-7-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index 4be3eb7..bddb121 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -130,3 +130,56 @@ void perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info) if (miss) strncat(out, " miss", sz - l); } + +static const char * const mem_lvl[] = { + "N/A", + "HIT", + "MISS", + "L1", + "LFB", + "L2", + "L3", + "Local RAM", + "Remote RAM (1 hop)", + "Remote RAM (2 hops)", + "Remote Cache (1 hop)", + "Remote Cache (2 hops)", + "I/O", + "Uncached", +}; + +void perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +{ + size_t i, l = 0; + u64 m = PERF_MEM_LVL_NA; + u64 hit, miss; + + if (mem_info) + m = mem_info->data_src.mem_lvl; + + sz -= 1; /* -1 for null termination */ + out[0] = '\0'; + + hit = m & PERF_MEM_LVL_HIT; + miss = m & PERF_MEM_LVL_MISS; + + /* already taken care of */ + m &= ~(PERF_MEM_LVL_HIT|PERF_MEM_LVL_MISS); + + for (i = 0; m && i < ARRAY_SIZE(mem_lvl); i++, m >>= 1) { + if (!(m & 0x1)) + continue; + if (l) { + strcat(out, " or "); + l += 4; + } + strncat(out, mem_lvl[i], sz - l); + l += strlen(mem_lvl[i]); + } + if (*out == '\0') + strcpy(out, "N/A"); + if (hit) + strncat(out, " hit", sz - l); + if (miss) + strncat(out, " miss", sz - l); +} diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index d8fb8e1..bd0d7f7 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -26,4 +26,5 @@ char *perf_mem_events__name(int i); struct mem_info; void perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info); +void perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info); #endif /* __PERF_MEM_EVENTS_H */ diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 160df20..d894759 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -858,60 +858,12 @@ sort__lvl_cmp(struct hist_entry *left, struct hist_entry *right) return (int64_t)(data_src_r.mem_lvl - data_src_l.mem_lvl); } -static const char * const mem_lvl[] = { - "N/A", - "HIT", - "MISS", - "L1", - "LFB", - "L2", - "L3", - "Local RAM", - "Remote RAM (1 hop)", - "Remote RAM (2 hops)", - "Remote Cache (1 hop)", - "Remote Cache (2 hops)", - "I/O", - "Uncached", -}; - static int hist_entry__lvl_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width) { char out[64]; - size_t sz = sizeof(out) - 1; /* -1 for null termination */ - size_t i, l = 0; - u64 m = PERF_MEM_LVL_NA; - u64 hit, miss; - - if (he->mem_info) - m = he->mem_info->data_src.mem_lvl; - - out[0] = '\0'; - - hit = m & PERF_MEM_LVL_HIT; - miss = m & PERF_MEM_LVL_MISS; - - /* already taken care of */ - m &= ~(PERF_MEM_LVL_HIT|PERF_MEM_LVL_MISS); - - for (i = 0; m && i < ARRAY_SIZE(mem_lvl); i++, m >>= 1) { - if (!(m & 0x1)) - continue; - if (l) { - strcat(out, " or "); - l += 4; - } - strncat(out, mem_lvl[i], sz - l); - l += strlen(mem_lvl[i]); - } - if (*out == '\0') - strcpy(out, "N/A"); - if (hit) - strncat(out, " hit", sz - l); - if (miss) - strncat(out, " miss", sz - l); + perf_mem__lvl_scnprintf(out, sizeof(out), he->mem_info); return repsep_snprintf(bf, size, "%-*s", width, out); } -- cgit v0.10.2 From 2c07af13dcd4d971578041b50598f1269b33e68a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 24 
Feb 2016 09:46:48 +0100 Subject: perf tools: Introduce perf_mem__snp_scnprintf function Move meminfo's snp display function into mem-events.c object, so it could be reused later from script code. Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1456303616-26926-8-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index bddb121..d03edc2 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -183,3 +183,37 @@ void perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info) if (miss) strncat(out, " miss", sz - l); } + +static const char * const snoop_access[] = { + "N/A", + "None", + "Miss", + "Hit", + "HitM", +}; + +void perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +{ + size_t i, l = 0; + u64 m = PERF_MEM_SNOOP_NA; + + sz -= 1; /* -1 for null termination */ + out[0] = '\0'; + + if (mem_info) + m = mem_info->data_src.mem_snoop; + + for (i = 0; m && i < ARRAY_SIZE(snoop_access); i++, m >>= 1) { + if (!(m & 0x1)) + continue; + if (l) { + strcat(out, " or "); + l += 4; + } + strncat(out, snoop_access[i], sz - l); + l += strlen(snoop_access[i]); + } + + if (*out == '\0') + strcpy(out, "N/A"); +} diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index bd0d7f7..6efdd6f 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -27,4 +27,5 @@ char *perf_mem_events__name(int i); struct mem_info; void perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info); void perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info); +void perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info); #endif /* __PERF_MEM_EVENTS_H */ diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index d894759..2007c3b 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -886,41 +886,12 @@ sort__snoop_cmp(struct hist_entry *left, struct hist_entry *right) return (int64_t)(data_src_r.mem_snoop - data_src_l.mem_snoop); } -static const char * const snoop_access[] = { - "N/A", - "None", - "Miss", - "Hit", - "HitM", -}; - static int hist_entry__snoop_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width) { char out[64]; - size_t sz = sizeof(out) - 1; /* -1 for null termination */ - size_t i, l = 0; - u64 m = PERF_MEM_SNOOP_NA; - - out[0] = '\0'; - - if (he->mem_info) - m = he->mem_info->data_src.mem_snoop; - - for (i = 0; m && i < ARRAY_SIZE(snoop_access); i++, m >>= 1) { - if (!(m & 0x1)) - continue; - if (l) { - strcat(out, " or "); - l += 4; - } - strncat(out, snoop_access[i], sz - l); - l += strlen(snoop_access[i]); - } - - if (*out == '\0') - strcpy(out, "N/A"); + perf_mem__snp_scnprintf(out, sizeof(out), he->mem_info); return repsep_snprintf(bf, size, "%-*s", width, out); } -- cgit v0.10.2 From 69a77275926ccd0c08fde103de52b59f18370f5a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 24 Feb 2016 09:46:49 +0100 Subject: perf tools: Introduce perf_mem__lck_scnprintf function Move meminfo's lck display function into mem-events.c object, so it could be reused later from script code. 
Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1456303616-26926-9-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index d03edc2..9844e3e 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -217,3 +217,19 @@ void perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info) if (*out == '\0') strcpy(out, "N/A"); } + +void perf_mem__lck_scnprintf(char *out, size_t sz __maybe_unused, + struct mem_info *mem_info) +{ + u64 mask = PERF_MEM_LOCK_NA; + + if (mem_info) + mask = mem_info->data_src.mem_lock; + + if (mask & PERF_MEM_LOCK_NA) + strncat(out, "N/A", 3); + else if (mask & PERF_MEM_LOCK_LOCKED) + strncat(out, "Yes", 3); + else + strncat(out, "No", 2); +} diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index 6efdd6f..99678b5 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -28,4 +28,6 @@ struct mem_info; void perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info); void perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info); void perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info); +void perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info); + #endif /* __PERF_MEM_EVENTS_H */ diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 2007c3b..4175b29 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -795,19 +795,9 @@ sort__locked_cmp(struct hist_entry *left, struct hist_entry *right) static int hist_entry__locked_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width) { - const char *out; - u64 mask = PERF_MEM_LOCK_NA; - - if (he->mem_info) - mask = he->mem_info->data_src.mem_lock; - - if (mask & PERF_MEM_LOCK_NA) - out = "N/A"; - else if (mask & PERF_MEM_LOCK_LOCKED) - out = "Yes"; - else - out = "No"; + char out[10]; + perf_mem__lck_scnprintf(out, sizeof(out), he->mem_info); return repsep_snprintf(bf, size, "%.*s", width, out); } -- cgit v0.10.2 From b1a5fbea3d69511e445b8d9efe6dc605edb508c8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 24 Feb 2016 09:46:50 +0100 Subject: perf tools: Change perf_mem__tlb_scnprintf to return nb of displayed bytes Moving strncat/strcpy calls into scnprintf to easily track number of displayed bytes. It will be used in following patch. 
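Illustrative sketch (not part of the patch) of the conversion pattern, using made-up strings: the running length is advanced by scnprintf()'s return value instead of being recomputed for strncat().

char out[64];
size_t sz = sizeof(out), l = 0;

/* old style: strncat(out, "L1", sz - l); l += strlen("L1"); */

/* new style: scnprintf() reports how many bytes it actually wrote */
l += scnprintf(out + l, sz - l, "%s", "L1");
l += scnprintf(out + l, sz - l, " or L2");
l += scnprintf(out + l, sz - l, " hit");	/* l == strlen(out), never exceeds sz - 1 */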
Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1456303616-26926-10-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index 9844e3e..b58d32e 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -95,7 +95,7 @@ static const char * const tlb_access[] = { "Fault", }; -void perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info) { size_t l = 0, i; u64 m = PERF_MEM_TLB_NA; @@ -120,15 +120,16 @@ void perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info) strcat(out, " or "); l += 4; } - strncat(out, tlb_access[i], sz - l); - l += strlen(tlb_access[i]); + l += scnprintf(out + l, sz - l, tlb_access[i]); } if (*out == '\0') - strcpy(out, "N/A"); + l += scnprintf(out, sz - l, "N/A"); if (hit) - strncat(out, " hit", sz - l); + l += scnprintf(out + l, sz - l, " hit"); if (miss) - strncat(out, " miss", sz - l); + l += scnprintf(out + l, sz - l, " miss"); + + return l; } static const char * const mem_lvl[] = { diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index 99678b5..4141df6 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -25,7 +25,7 @@ int perf_mem_events__init(void); char *perf_mem_events__name(int i); struct mem_info; -void perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info); +int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info); void perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info); void perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info); void perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info); -- cgit v0.10.2 From 969075630e3abd1c740ac4f3183949cbf54b410d Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 24 Feb 2016 09:46:51 +0100 Subject: perf tools: Change perf_mem__lvl_scnprintf to return nb of displayed bytes Moving strncat/strcpy calls into scnprintf to easily track number of displayed bytes. It will be used in following patch. 
Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1456303616-26926-11-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index b58d32e..249250f 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -149,7 +149,7 @@ static const char * const mem_lvl[] = { "Uncached", }; -void perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info) { size_t i, l = 0; u64 m = PERF_MEM_LVL_NA; @@ -174,15 +174,16 @@ void perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info) strcat(out, " or "); l += 4; } - strncat(out, mem_lvl[i], sz - l); - l += strlen(mem_lvl[i]); + l += scnprintf(out + l, sz - l, mem_lvl[i]); } if (*out == '\0') - strcpy(out, "N/A"); + l += scnprintf(out, sz - l, "N/A"); if (hit) - strncat(out, " hit", sz - l); + l += scnprintf(out + l, sz - l, " hit"); if (miss) - strncat(out, " miss", sz - l); + l += scnprintf(out + l, sz - l, " miss"); + + return l; } static const char * const snoop_access[] = { diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index 4141df6..0467f67 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -26,7 +26,7 @@ char *perf_mem_events__name(int i); struct mem_info; int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info); -void perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info); +int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info); void perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info); void perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info); -- cgit v0.10.2 From 149d75076778d3b14e13b79d683b4f4fdd4fdb01 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 24 Feb 2016 09:46:52 +0100 Subject: perf tools: Change perf_mem__snp_scnprintf to return nb of displayed bytes Moving strncat/strcpy calls into scnprintf to easily track number of displayed bytes. It will be used in following patch. 
Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1456303616-26926-12-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index 249250f..de981dd 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -194,7 +194,7 @@ static const char * const snoop_access[] = { "HitM", }; -void perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info) { size_t i, l = 0; u64 m = PERF_MEM_SNOOP_NA; @@ -212,12 +212,13 @@ void perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info) strcat(out, " or "); l += 4; } - strncat(out, snoop_access[i], sz - l); - l += strlen(snoop_access[i]); + l += scnprintf(out + l, sz - l, snoop_access[i]); } if (*out == '\0') - strcpy(out, "N/A"); + l += scnprintf(out, sz - l, "N/A"); + + return l; } void perf_mem__lck_scnprintf(char *out, size_t sz __maybe_unused, diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index 0467f67..84c79a4 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -27,7 +27,7 @@ char *perf_mem_events__name(int i); struct mem_info; int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info); int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info); -void perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info); +int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info); void perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info); #endif /* __PERF_MEM_EVENTS_H */ -- cgit v0.10.2 From 8b0819c8a3c97279b815581b606407c0387cc26f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 24 Feb 2016 09:46:53 +0100 Subject: perf tools: Change perf_mem__lck_scnprintf to return nb of displayed bytes Moving strncat call into scnprintf to easily track number of displayed bytes. It will be used in following patch. 
Signed-off-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1456303616-26926-13-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index de981dd..eadb83d 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -221,18 +221,20 @@ int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info) return l; } -void perf_mem__lck_scnprintf(char *out, size_t sz __maybe_unused, - struct mem_info *mem_info) +int perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info) { u64 mask = PERF_MEM_LOCK_NA; + int l; if (mem_info) mask = mem_info->data_src.mem_lock; if (mask & PERF_MEM_LOCK_NA) - strncat(out, "N/A", 3); + l = scnprintf(out, sz, "N/A"); else if (mask & PERF_MEM_LOCK_LOCKED) - strncat(out, "Yes", 3); + l = scnprintf(out, sz, "Yes"); else - strncat(out, "No", 2); + l = scnprintf(out, sz, "No"); + + return l; } diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index 84c79a4..87c44ff 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -28,6 +28,6 @@ struct mem_info; int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info); int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info); int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info); -void perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info); +int perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info); #endif /* __PERF_MEM_EVENTS_H */ -- cgit v0.10.2 From c19ac91245a2f8d26aafd7f23256f3b76314d5d4 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 24 Feb 2016 09:46:54 +0100 Subject: perf script: Display data_src values Adding support to display data_src values, for events with data_src data in sample. Example: $ perf script ... rcuos/3 32 [002] ... 68501042 Local RAM hit|SNP None or Hit|TLB L1 or L2 hit|LCK No ... rcuos/3 32 [002] ... 68100142 L1 hit|SNP None|TLB L1 or L2 hit|LCK No ... swapper 0 [002] ... 68100242 LFB hit|SNP None|TLB L1 or L2 hit|LCK No ... swapper 0 [000] ... 68100142 L1 hit|SNP None|TLB L1 or L2 hit|LCK No ... swapper 0 [000] ... 50100142 L1 hit|SNP None|TLB L2 miss|LCK No ... rcuos/3 32 [002] ... 68100142 L1 hit|SNP None|TLB L1 or L2 hit|LCK No ... plugin-containe 16538 [000] ... 6a100142 L1 hit|SNP None|TLB L1 or L2 hit|LCK Yes ... gkrellm 1736 [000] ... 68100242 LFB hit|SNP None|TLB L1 or L2 hit|LCK No ... gkrellm 1736 [000] ... 6a100142 L1 hit|SNP None|TLB L1 or L2 hit|LCK Yes ... 
^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ data_src value data_src translation Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1456303616-26926-14-git-send-email-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index f4caf48..8ff5ff0 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -23,6 +23,7 @@ #include "util/stat.h" #include #include "asm/bug.h" +#include "util/mem-events.h" static char const *script_name; static char const *generate_script_lang; @@ -649,6 +650,23 @@ static int perf_evlist__max_name_len(struct perf_evlist *evlist) return max; } +static size_t data_src__printf(u64 data_src) +{ + struct mem_info mi = { .data_src.val = data_src }; + char decode[100]; + char out[100]; + static int maxlen; + int len; + + perf_script__meminfo_scnprintf(decode, 100, &mi); + + len = scnprintf(out, 100, "%16" PRIx64 " %s", data_src, decode); + if (maxlen < len) + maxlen = len; + + return printf("%-*s", maxlen, out); +} + static void process_event(struct perf_script *script, union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel, struct addr_location *al) @@ -689,7 +707,7 @@ static void process_event(struct perf_script *script, union perf_event *event, print_sample_addr(event, sample, thread, attr); if (PRINT_FIELD(DATA_SRC)) - printf("%16" PRIx64, sample->data_src); + data_src__printf(sample->data_src); if (PRINT_FIELD(WEIGHT)) printf("%16" PRIu64, sample->weight); diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index eadb83d..75465f8 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -238,3 +238,18 @@ int perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info) return l; } + +int perf_script__meminfo_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +{ + int i = 0; + + i += perf_mem__lvl_scnprintf(out, sz, mem_info); + i += scnprintf(out + i, sz - i, "|SNP "); + i += perf_mem__snp_scnprintf(out + i, sz - i, mem_info); + i += scnprintf(out + i, sz - i, "|TLB "); + i += perf_mem__tlb_scnprintf(out + i, sz - i, mem_info); + i += scnprintf(out + i, sz - i, "|LCK "); + i += perf_mem__lck_scnprintf(out + i, sz - i, mem_info); + + return i; +} diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index 87c44ff..5d6d930 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -30,4 +30,6 @@ int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info); int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info); int perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info); +int perf_script__meminfo_scnprintf(char *bf, size_t size, struct mem_info *mem_info); + #endif /* __PERF_MEM_EVENTS_H */ -- cgit v0.10.2 From c339b1a90e6cd638a1d99cbbf49d870ce233198e Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 24 Feb 2016 11:20:44 +0000 Subject: perf tools: Make binary data printer code in trace_event public available Move code printing binray data from trace_event() to utils.c and allows passing different printer. Further commits will use this logic to print bpf output event. 
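Illustrative sketch (not part of the patch): a minimal custom printer for the print_binary() interface introduced below. The function name is invented; the ops and the print_binary() signature are the ones added by the diff.

static void hex_only_printer(enum binary_printer_ops op, unsigned int val,
			     void *extra __maybe_unused)
{
	switch (op) {
	case BINARY_PRINT_ADDR:
		printf(" %04x:", val);	/* offset at the start of each line */
		break;
	case BINARY_PRINT_NUM_DATA:
		printf(" %02x", val);	/* one byte of payload */
		break;
	case BINARY_PRINT_LINE_END:
		printf("\n");
		break;
	default:
		break;			/* ignore begin/end, padding and char ops */
	}
}

/* usage: print_binary(event_data, size, 16, hex_only_printer, NULL); */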
Signed-off-by: Wang Nan Cc: Brendan Gregg Cc: Jiri Olsa Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456312845-111583-2-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/debug.c b/tools/perf/util/debug.c index ff7e86a..8c4212a 100644 --- a/tools/perf/util/debug.c +++ b/tools/perf/util/debug.c @@ -106,40 +106,61 @@ int dump_printf(const char *fmt, ...) return ret; } +static void trace_event_printer(enum binary_printer_ops op, + unsigned int val, void *extra) +{ + const char *color = PERF_COLOR_BLUE; + union perf_event *event = (union perf_event *)extra; + unsigned char ch = (unsigned char)val; + + switch (op) { + case BINARY_PRINT_DATA_BEGIN: + printf("."); + color_fprintf(stdout, color, "\n. ... raw event: size %d bytes\n", + event->header.size); + break; + case BINARY_PRINT_LINE_BEGIN: + printf("."); + break; + case BINARY_PRINT_ADDR: + color_fprintf(stdout, color, " %04x: ", val); + break; + case BINARY_PRINT_NUM_DATA: + color_fprintf(stdout, color, " %02x", val); + break; + case BINARY_PRINT_NUM_PAD: + color_fprintf(stdout, color, " "); + break; + case BINARY_PRINT_SEP: + color_fprintf(stdout, color, " "); + break; + case BINARY_PRINT_CHAR_DATA: + color_fprintf(stdout, color, "%c", + isprint(ch) ? ch : '.'); + break; + case BINARY_PRINT_CHAR_PAD: + color_fprintf(stdout, color, " "); + break; + case BINARY_PRINT_LINE_END: + color_fprintf(stdout, color, "\n"); + break; + case BINARY_PRINT_DATA_END: + printf("\n"); + break; + default: + break; + } +} + void trace_event(union perf_event *event) { unsigned char *raw_event = (void *)event; - const char *color = PERF_COLOR_BLUE; - int i, j; if (!dump_trace) return; - printf("."); - color_fprintf(stdout, color, "\n. ... raw event: size %d bytes\n", - event->header.size); - - for (i = 0; i < event->header.size; i++) { - if ((i & 15) == 0) { - printf("."); - color_fprintf(stdout, color, " %04x: ", i); - } - - color_fprintf(stdout, color, " %02x", raw_event[i]); - - if (((i & 15) == 15) || i == event->header.size-1) { - color_fprintf(stdout, color, " "); - for (j = 0; j < 15-(i & 15); j++) - color_fprintf(stdout, color, " "); - for (j = i & ~15; j <= i; j++) { - color_fprintf(stdout, color, "%c", - isprint(raw_event[j]) ? 
- raw_event[j] : '.'); - } - color_fprintf(stdout, color, "\n"); - } - } - printf(".\n"); + print_binary(raw_event, event->header.size, 16, + trace_event_printer, event); } static struct debug_variable { diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 35b20dd..b7766c5 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "callchain.h" #include "strlist.h" @@ -670,3 +671,39 @@ int fetch_current_timestamp(char *buf, size_t sz) return 0; } + +void print_binary(unsigned char *data, size_t len, + size_t bytes_per_line, print_binary_t printer, + void *extra) +{ + size_t i, j, mask; + + if (!printer) + return; + + bytes_per_line = roundup_pow_of_two(bytes_per_line); + mask = bytes_per_line - 1; + + printer(BINARY_PRINT_DATA_BEGIN, 0, extra); + for (i = 0; i < len; i++) { + if ((i & mask) == 0) { + printer(BINARY_PRINT_LINE_BEGIN, -1, extra); + printer(BINARY_PRINT_ADDR, i, extra); + } + + printer(BINARY_PRINT_NUM_DATA, data[i], extra); + + if (((i & mask) == mask) || i == len - 1) { + for (j = 0; j < mask-(i & mask); j++) + printer(BINARY_PRINT_NUM_PAD, -1, extra); + + printer(BINARY_PRINT_SEP, i, extra); + for (j = i & ~mask; j <= i; j++) + printer(BINARY_PRINT_CHAR_DATA, data[j], extra); + for (j = 0; j < mask-(i & mask); j++) + printer(BINARY_PRINT_CHAR_PAD, i, extra); + printer(BINARY_PRINT_LINE_END, -1, extra); + } + } + printer(BINARY_PRINT_DATA_END, -1, extra); +} diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 3dd0408..7015019 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -345,4 +345,24 @@ const char *perf_tip(const char *dirpath); bool is_regular_file(const char *file); int fetch_current_timestamp(char *buf, size_t sz); +enum binary_printer_ops { + BINARY_PRINT_DATA_BEGIN, + BINARY_PRINT_LINE_BEGIN, + BINARY_PRINT_ADDR, + BINARY_PRINT_NUM_DATA, + BINARY_PRINT_NUM_PAD, + BINARY_PRINT_SEP, + BINARY_PRINT_CHAR_DATA, + BINARY_PRINT_CHAR_PAD, + BINARY_PRINT_LINE_END, + BINARY_PRINT_DATA_END, +}; + +typedef void (*print_binary_t)(enum binary_printer_ops, + unsigned int val, + void *extra); + +void print_binary(unsigned char *data, size_t len, + size_t bytes_per_line, print_binary_t printer, + void *extra); #endif /* GIT_COMPAT_UTIL_H */ -- cgit v0.10.2 From 30372f04c9dc159f99f1f09c61e5e0dbe4c91251 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 24 Feb 2016 11:20:45 +0000 Subject: perf script: Print bpf-output events in 'perf script' This patch allows 'perf script' output messages from BPF program. For example, use test_bpf_output_3.c at the end of this commit message, # ./perf record -e bpf-output/no-inherit,name=evt/ \ -e ./test_bpf_output_3.c/map:channel.event=evt/ \ usleep 100000 # ./perf script usleep 4882 21384.532523: evt: ffffffff810e97d1 sys_nanosleep ([kernel.kallsyms]) BPF output: 0000: 52 61 69 73 65 20 61 20 Raise a 0008: 42 50 46 20 65 76 65 6e BPF even 0010: 74 21 00 00 t!.. BPF string: "Raise a BPF event!" usleep 4882 21384.632606: evt: ffffffff8105c609 kretprobe_trampoline_holder ([kernel.kallsyms BPF output: 0000: 52 61 69 73 65 20 61 20 Raise a 0008: 42 50 46 20 65 76 65 6e BPF even 0010: 74 21 00 00 t!.. BPF string: "Raise a BPF event!" Two samples from BPF output are printed by both binary and string format. If BPF program output something unprintable, string format is suppressed. 
/************************ BEGIN **************************/ #include struct bpf_map_def { unsigned int type; unsigned int key_size; unsigned int value_size; unsigned int max_entries; }; #define SEC(NAME) __attribute__((section(NAME), used)) static u64 (*ktime_get_ns)(void) = (void *)BPF_FUNC_ktime_get_ns; static int (*trace_printk)(const char *fmt, int fmt_size, ...) = (void *)BPF_FUNC_trace_printk; static int (*get_smp_processor_id)(void) = (void *)BPF_FUNC_get_smp_processor_id; static int (*perf_event_output)(void *, struct bpf_map_def *, int, void *, unsigned long) = (void *)BPF_FUNC_perf_event_output; struct bpf_map_def SEC("maps") channel = { .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, .key_size = sizeof(int), .value_size = sizeof(u32), .max_entries = __NR_CPUS__, }; static inline int __attribute__((always_inline)) func(void *ctx, int type) { char output_str[] = "Raise a BPF event!"; perf_event_output(ctx, &channel, get_smp_processor_id(), &output_str, sizeof(output_str)); return 0; } SEC("func_begin=sys_nanosleep") int func_begin(void *ctx) {return func(ctx, 1);} SEC("func_end=sys_nanosleep%return") int func_end(void *ctx) { return func(ctx, 2);} char _license[] SEC("license") = "GPL"; int _version SEC("version") = LINUX_VERSION_CODE; /************************* END ***************************/ Signed-off-by: Wang Nan Tested-by: Arnaldo Carvalho de Melo Cc: Brendan Gregg Cc: Jiri Olsa Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456312845-111583-3-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 8ff5ff0..ec4fbd4 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -61,6 +61,7 @@ enum perf_output_field { PERF_OUTPUT_BRSTACKSYM = 1U << 16, PERF_OUTPUT_DATA_SRC = 1U << 17, PERF_OUTPUT_WEIGHT = 1U << 18, + PERF_OUTPUT_BPF_OUTPUT = 1U << 19, }; struct output_option { @@ -86,6 +87,7 @@ struct output_option { {.str = "brstacksym", .field = PERF_OUTPUT_BRSTACKSYM}, {.str = "data_src", .field = PERF_OUTPUT_DATA_SRC}, {.str = "weight", .field = PERF_OUTPUT_WEIGHT}, + {.str = "bpf-output", .field = PERF_OUTPUT_BPF_OUTPUT}, }; /* default set to maintain compatibility with current format */ @@ -106,7 +108,7 @@ static struct { PERF_OUTPUT_SYM | PERF_OUTPUT_DSO | PERF_OUTPUT_PERIOD, - .invalid_fields = PERF_OUTPUT_TRACE, + .invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT, }, [PERF_TYPE_SOFTWARE] = { @@ -116,7 +118,7 @@ static struct { PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP | PERF_OUTPUT_SYM | PERF_OUTPUT_DSO | - PERF_OUTPUT_PERIOD, + PERF_OUTPUT_PERIOD | PERF_OUTPUT_BPF_OUTPUT, .invalid_fields = PERF_OUTPUT_TRACE, }, @@ -126,7 +128,7 @@ static struct { .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | - PERF_OUTPUT_EVNAME | PERF_OUTPUT_TRACE, + PERF_OUTPUT_EVNAME | PERF_OUTPUT_TRACE }, [PERF_TYPE_RAW] = { @@ -139,7 +141,7 @@ static struct { PERF_OUTPUT_PERIOD | PERF_OUTPUT_ADDR | PERF_OUTPUT_DATA_SRC | PERF_OUTPUT_WEIGHT, - .invalid_fields = PERF_OUTPUT_TRACE, + .invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT, }, [PERF_TYPE_BREAKPOINT] = { @@ -151,7 +153,7 @@ static struct { PERF_OUTPUT_SYM | PERF_OUTPUT_DSO | PERF_OUTPUT_PERIOD, - .invalid_fields = PERF_OUTPUT_TRACE, + .invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT, }, }; @@ -624,6 +626,84 @@ static void print_sample_flags(u32 flags) printf(" %-4s ", str); } 
+struct printer_data { + int line_no; + bool hit_nul; + bool is_printable; +}; + +static void +print_sample_bpf_output_printer(enum binary_printer_ops op, + unsigned int val, + void *extra) +{ + unsigned char ch = (unsigned char)val; + struct printer_data *printer_data = extra; + + switch (op) { + case BINARY_PRINT_DATA_BEGIN: + printf("\n"); + break; + case BINARY_PRINT_LINE_BEGIN: + printf("%17s", !printer_data->line_no ? "BPF output:" : + " "); + break; + case BINARY_PRINT_ADDR: + printf(" %04x:", val); + break; + case BINARY_PRINT_NUM_DATA: + printf(" %02x", val); + break; + case BINARY_PRINT_NUM_PAD: + printf(" "); + break; + case BINARY_PRINT_SEP: + printf(" "); + break; + case BINARY_PRINT_CHAR_DATA: + if (printer_data->hit_nul && ch) + printer_data->is_printable = false; + + if (!isprint(ch)) { + printf("%c", '.'); + + if (!printer_data->is_printable) + break; + + if (ch == '\0') + printer_data->hit_nul = true; + else + printer_data->is_printable = false; + } else { + printf("%c", ch); + } + break; + case BINARY_PRINT_CHAR_PAD: + printf(" "); + break; + case BINARY_PRINT_LINE_END: + printf("\n"); + printer_data->line_no++; + break; + case BINARY_PRINT_DATA_END: + default: + break; + } +} + +static void print_sample_bpf_output(struct perf_sample *sample) +{ + unsigned int nr_bytes = sample->raw_size; + struct printer_data printer_data = {0, false, true}; + + print_binary(sample->raw_data, nr_bytes, 8, + print_sample_bpf_output_printer, &printer_data); + + if (printer_data.is_printable && printer_data.hit_nul) + printf("%17s \"%s\"\n", "BPF string:", + (char *)(sample->raw_data)); +} + struct perf_script { struct perf_tool tool; struct perf_session *session; @@ -731,6 +811,9 @@ static void process_event(struct perf_script *script, union perf_event *event, else if (PRINT_FIELD(BRSTACKSYM)) print_sample_brstacksym(event, sample, thread, attr); + if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT)) + print_sample_bpf_output(sample); + printf("\n"); } -- cgit v0.10.2 From a9c6e46c04ba38925e94c4c2fa9217460338db43 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 25 Feb 2016 00:13:33 +0900 Subject: perf tools: Add helper functions for some sort keys The 'trace', 'srcline' and 'srcfile' sort keys updates hist entry's field later. With the hierarchy mode, those fields are passed to a matching entry so it needs to identify the sort keys. 
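Illustrative sketch (not part of the patch) of how these predicates are consumed: it mirrors, in abbreviated form, the hierarchy-insert code added later in this series, where only the per-key copy of an entry that owns the 'trace' sort key keeps the lazily-computed field.

if (perf_hpp__is_trace_entry(fmt))
	he->trace_output = NULL;	/* the new per-key copy takes ownership */
else
	new->trace_output = NULL;	/* this key does not use the trace output */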
Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456326830-30456-2-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 97baa1d..044419b3 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -301,6 +301,9 @@ void perf_hpp__append_sort_keys(struct perf_hpp_list *list); bool perf_hpp__is_sort_entry(struct perf_hpp_fmt *format); bool perf_hpp__is_dynamic_entry(struct perf_hpp_fmt *format); bool perf_hpp__defined_dynamic_entry(struct perf_hpp_fmt *fmt, struct hists *hists); +bool perf_hpp__is_trace_entry(struct perf_hpp_fmt *fmt); +bool perf_hpp__is_srcline_entry(struct perf_hpp_fmt *fmt); +bool perf_hpp__is_srcfile_entry(struct perf_hpp_fmt *fmt); static inline bool perf_hpp__should_skip(struct perf_hpp_fmt *format, struct hists *hists) diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 4175b29..358035c 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1391,6 +1391,39 @@ bool perf_hpp__is_sort_entry(struct perf_hpp_fmt *format) return format->header == __sort__hpp_header; } +bool perf_hpp__is_trace_entry(struct perf_hpp_fmt *fmt) +{ + struct hpp_sort_entry *hse; + + if (!perf_hpp__is_sort_entry(fmt)) + return false; + + hse = container_of(fmt, struct hpp_sort_entry, hpp); + return hse->se == &sort_trace; +} + +bool perf_hpp__is_srcline_entry(struct perf_hpp_fmt *fmt) +{ + struct hpp_sort_entry *hse; + + if (!perf_hpp__is_sort_entry(fmt)) + return false; + + hse = container_of(fmt, struct hpp_sort_entry, hpp); + return hse->se == &sort_srcline; +} + +bool perf_hpp__is_srcfile_entry(struct perf_hpp_fmt *fmt) +{ + struct hpp_sort_entry *hse; + + if (!perf_hpp__is_sort_entry(fmt)) + return false; + + hse = container_of(fmt, struct hpp_sort_entry, hpp); + return hse->se == &sort_srcfile; +} + static bool __sort__hpp_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b) { struct hpp_sort_entry *hse_a; -- cgit v0.10.2 From c7dfe3abf40ec9a298884a4c0775062eac27afc9 Mon Sep 17 00:00:00 2001 From: Sebastian Frias Date: Mon, 22 Feb 2016 13:33:14 +0100 Subject: net: ethernet: nb8800: support fixed-link DT node Under some circumstances, e.g. when connecting to a switch, the ethernet port will not be connected to a PHY. In that case a "fixed-link" DT node can be used to replace it. https://stackoverflow.com/questions/31046172/device-tree-for-phy-less-connection-to-a-dsa-switch This patch adds support for the "fixed-link" node to the nb8800 driver. Signed-off-by: Sebastian Frias Acked-by: Mans Rullgard Cc: Mason Signed-off-by: David S. 
Miller diff --git a/drivers/net/ethernet/aurora/nb8800.c b/drivers/net/ethernet/aurora/nb8800.c index f71ab26..08a23e6 100644 --- a/drivers/net/ethernet/aurora/nb8800.c +++ b/drivers/net/ethernet/aurora/nb8800.c @@ -1460,7 +1460,19 @@ static int nb8800_probe(struct platform_device *pdev) goto err_disable_clk; } - priv->phy_node = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0); + if (of_phy_is_fixed_link(pdev->dev.of_node)) { + ret = of_phy_register_fixed_link(pdev->dev.of_node); + if (ret < 0) { + dev_err(&pdev->dev, "bad fixed-link spec\n"); + goto err_free_bus; + } + priv->phy_node = of_node_get(pdev->dev.of_node); + } + + if (!priv->phy_node) + priv->phy_node = of_parse_phandle(pdev->dev.of_node, + "phy-handle", 0); + if (!priv->phy_node) { dev_err(&pdev->dev, "no PHY specified\n"); ret = -ENODEV; -- cgit v0.10.2 From aef810ec4e6b638facb6c81803c019906f34f014 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 25 Feb 2016 00:13:34 +0900 Subject: perf hists: Basic support of hierarchical report view In the hierarchical view, entries will be grouped and sorted on the first key, and then on the second key, and so on. Add the he->hroot_{in,out} fields to keep the lower level entries. Actually this can share space, in a union, with callchain's 'sorted_root' since the hroots are only used by non-leaf entries and callchain is only used by leaf entries. It also adds the 'parent_he' and 'depth' fields which can be used by browsers. This patch only implements collapsing part which creates internal entries for each sort key. These need to be sorted by output_sort stage and to be displayed properly in the later patch(es). Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456326830-30456-3-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 017eb5c..8814524 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -396,6 +396,9 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template, } INIT_LIST_HEAD(&he->pairs.node); thread__get(he->thread); + + if (!symbol_conf.report_hierarchy) + he->leaf = true; } return he; @@ -1049,6 +1052,114 @@ int hist_entry__snprintf_alignment(struct hist_entry *he, struct perf_hpp *hpp, * collapse the histogram */ +static void hists__apply_filters(struct hists *hists, struct hist_entry *he); + +static struct hist_entry *hierarchy_insert_entry(struct hists *hists, + struct rb_root *root, + struct hist_entry *he, + struct perf_hpp_fmt *fmt) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct hist_entry *iter, *new; + int64_t cmp; + + while (*p != NULL) { + parent = *p; + iter = rb_entry(parent, struct hist_entry, rb_node_in); + + cmp = fmt->collapse(fmt, iter, he); + if (!cmp) { + he_stat__add_stat(&iter->stat, &he->stat); + return iter; + } + + if (cmp < 0) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + + new = hist_entry__new(he, true); + if (new == NULL) + return NULL; + + hists__apply_filters(hists, new); + hists->nr_entries++; + + /* save related format for output */ + new->fmt = fmt; + + /* some fields are now passed to 'new' */ + if (perf_hpp__is_trace_entry(fmt)) + he->trace_output = NULL; + else + new->trace_output = NULL; + + if (perf_hpp__is_srcline_entry(fmt)) + he->srcline = NULL; + else + new->srcline = NULL; + + if (perf_hpp__is_srcfile_entry(fmt)) + he->srcfile 
= NULL; + else + new->srcfile = NULL; + + rb_link_node(&new->rb_node_in, parent, p); + rb_insert_color(&new->rb_node_in, root); + return new; +} + +static int hists__hierarchy_insert_entry(struct hists *hists, + struct rb_root *root, + struct hist_entry *he) +{ + struct perf_hpp_fmt *fmt; + struct hist_entry *new_he = NULL; + struct hist_entry *parent = NULL; + int depth = 0; + int ret = 0; + + hists__for_each_sort_list(hists, fmt) { + if (!perf_hpp__is_sort_entry(fmt) && + !perf_hpp__is_dynamic_entry(fmt)) + continue; + if (perf_hpp__should_skip(fmt, hists)) + continue; + + /* insert copy of 'he' for each fmt into the hierarchy */ + new_he = hierarchy_insert_entry(hists, root, he, fmt); + if (new_he == NULL) { + ret = -1; + break; + } + + root = &new_he->hroot_in; + new_he->parent_he = parent; + new_he->depth = depth++; + parent = new_he; + } + + if (new_he) { + new_he->leaf = true; + + if (symbol_conf.use_callchain) { + callchain_cursor_reset(&callchain_cursor); + if (callchain_merge(&callchain_cursor, + new_he->callchain, + he->callchain) < 0) + ret = -1; + } + } + + /* 'he' is no longer used */ + hist_entry__delete(he); + + /* return 0 (or -1) since it already applied filters */ + return ret; +} + int hists__collapse_insert_entry(struct hists *hists, struct rb_root *root, struct hist_entry *he) { @@ -1057,6 +1168,9 @@ int hists__collapse_insert_entry(struct hists *hists, struct rb_root *root, struct hist_entry *iter; int64_t cmp; + if (symbol_conf.report_hierarchy) + return hists__hierarchy_insert_entry(hists, root, he); + while (*p != NULL) { parent = *p; iter = rb_entry(parent, struct hist_entry, rb_node_in); diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 5b9c624..10315e0 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -96,9 +96,11 @@ struct hist_entry { s32 socket; s32 cpu; u8 cpumode; + u8 depth; /* We are added by hists__add_dummy_entry. */ bool dummy; + bool leaf; char level; u8 filtered; @@ -120,13 +122,22 @@ struct hist_entry { char *srcline; char *srcfile; struct symbol *parent; - struct rb_root sorted_chain; struct branch_info *branch_info; struct hists *hists; struct mem_info *mem_info; void *raw_data; u32 raw_size; void *trace_output; + struct perf_hpp_fmt *fmt; + struct hist_entry *parent_he; + union { + /* this is for hierarchical entry structure */ + struct { + struct rb_root hroot_in; + struct rb_root hroot_out; + }; /* non-leaf entries */ + struct rb_root sorted_chain; /* leaf entry has callchains */ + }; struct callchain_root callchain[0]; /* must be last member */ }; diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index ccd1caa..a937053 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -110,7 +110,8 @@ struct symbol_conf { has_filter, show_ref_callgraph, hide_unresolved, - raw_trace; + raw_trace, + report_hierarchy; const char *vmlinux_name, *kallsyms_name, *source_prefix, -- cgit v0.10.2 From a8c4a2522a0808c5c2143612909717d1115c40cf Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Mon, 22 Feb 2016 18:43:25 +0100 Subject: ipv4: only create late gso-skb if skb is already set up with CHECKSUM_PARTIAL Otherwise we break the contract with GSO to only pass CHECKSUM_PARTIAL skbs down. This can easily happen with UDP+IPv4 sockets with the first MSG_MORE write smaller than the MTU, second write is a sendfile. Returning -EOPNOTSUPP lets the callers fall back into normal sendmsg path, were we calculate the checksum manually during copying. 
Commit d749c9cbffd6 ("ipv4: no CHECKSUM_PARTIAL on MSG_MORE corked sockets") started to exposes this bug. Fixes: d749c9cbffd6 ("ipv4: no CHECKSUM_PARTIAL on MSG_MORE corked sockets") Reported-by: Jiri Benc Cc: Jiri Benc Reported-by: Wakko Warner Cc: Wakko Warner Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 64878ef..565bf64 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1236,13 +1236,16 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, if (!skb) return -EINVAL; - cork->length += size; if ((size + skb->len > mtu) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO)) { + if (skb->ip_summed != CHECKSUM_PARTIAL) + return -EOPNOTSUPP; + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; } + cork->length += size; while (size > 0) { if (skb_is_gso(skb)) { -- cgit v0.10.2 From 1a3906a7e6b9cbfaf2a3d00c310aed8af8e10d92 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 25 Feb 2016 00:13:35 +0900 Subject: perf hists: Resort hist entries with hierarchy For hierarchical output, each entry must be sorted in their rbtree (hroot) properly. Add hists__hierarchy_output_resort() to do the job. Note that those hierarchy entries share the period counts, it'd be important to update the hists->stats only once (for leaves). Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456326830-30456-4-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 8814524..6ddac2f 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1318,6 +1318,86 @@ void hists__inc_stats(struct hists *hists, struct hist_entry *h) hists->stats.total_period += h->stat.period; } +static void hierarchy_insert_output_entry(struct rb_root *root, + struct hist_entry *he) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct hist_entry *iter; + + while (*p != NULL) { + parent = *p; + iter = rb_entry(parent, struct hist_entry, rb_node); + + if (hist_entry__sort(he, iter) > 0) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + + rb_link_node(&he->rb_node, parent, p); + rb_insert_color(&he->rb_node, root); +} + +static void hists__hierarchy_output_resort(struct hists *hists, + struct ui_progress *prog, + struct rb_root *root_in, + struct rb_root *root_out, + u64 min_callchain_hits, + bool use_callchain) +{ + struct rb_node *node; + struct hist_entry *he; + + *root_out = RB_ROOT; + node = rb_first(root_in); + + while (node) { + he = rb_entry(node, struct hist_entry, rb_node_in); + node = rb_next(node); + + hierarchy_insert_output_entry(root_out, he); + + if (prog) + ui_progress__update(prog, 1); + + if (!he->leaf) { + hists__hierarchy_output_resort(hists, prog, + &he->hroot_in, + &he->hroot_out, + min_callchain_hits, + use_callchain); + hists->nr_entries++; + if (!he->filtered) { + hists->nr_non_filtered_entries++; + hists__calc_col_len(hists, he); + } + + continue; + } + + /* only update stat for leaf entries to avoid duplication */ + hists__inc_stats(hists, he); + if (!he->filtered) + hists__calc_col_len(hists, he); + + if (!use_callchain) + continue; + + if (callchain_param.mode == CHAIN_GRAPH_REL) { + u64 total = he->stat.period; + + if (symbol_conf.cumulate_callchain) + total = 
he->stat_acc->period; + + min_callchain_hits = total * (callchain_param.min_percent / 100); + } + + callchain_param.sort(&he->sorted_chain, he->callchain, + min_callchain_hits, &callchain_param); + } +} + static void __hists__insert_output_entry(struct rb_root *entries, struct hist_entry *he, u64 min_callchain_hits, @@ -1369,6 +1449,17 @@ static void output_resort(struct hists *hists, struct ui_progress *prog, min_callchain_hits = callchain_total * (callchain_param.min_percent / 100); + hists__reset_stats(hists); + hists__reset_col_len(hists); + + if (symbol_conf.report_hierarchy) { + return hists__hierarchy_output_resort(hists, prog, + &hists->entries_collapsed, + &hists->entries, + min_callchain_hits, + use_callchain); + } + if (sort__need_collapse) root = &hists->entries_collapsed; else @@ -1377,9 +1468,6 @@ static void output_resort(struct hists *hists, struct ui_progress *prog, next = rb_first(root); hists->entries = RB_ROOT; - hists__reset_stats(hists); - hists__reset_col_len(hists); - while (next) { n = rb_entry(next, struct hist_entry, rb_node_in); next = rb_next(&n->rb_node_in); -- cgit v0.10.2 From 8c01872fe3c17fde1ce74eecf523d6d7fce5ffec Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 25 Feb 2016 00:13:36 +0900 Subject: perf hists: Add helper functions for hierarchy mode The rb_hierarchy_{next,prev,last} functions are to traverse all hist entries in a hierarchy. They will be used by various function which supports hierarchy output. As the rb_hierarchy_next() is used to traverse the whole hierarchy, it sometime needs to visit entries regardless of current folding state. So add enum hierarchy_move_dir and pass it to __rb_hierarchy_next() for those cases. Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456326830-30456-5-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 6ddac2f..358af7e 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1500,6 +1500,62 @@ void hists__output_resort(struct hists *hists, struct ui_progress *prog) output_resort(hists, prog, symbol_conf.use_callchain); } +static bool can_goto_child(struct hist_entry *he, enum hierarchy_move_dir hmd) +{ + if (he->leaf || hmd == HMD_FORCE_SIBLING) + return false; + + if (he->unfolded || hmd == HMD_FORCE_CHILD) + return true; + + return false; +} + +struct rb_node *rb_hierarchy_last(struct rb_node *node) +{ + struct hist_entry *he = rb_entry(node, struct hist_entry, rb_node); + + while (can_goto_child(he, HMD_NORMAL)) { + node = rb_last(&he->hroot_out); + he = rb_entry(node, struct hist_entry, rb_node); + } + return node; +} + +struct rb_node *__rb_hierarchy_next(struct rb_node *node, enum hierarchy_move_dir hmd) +{ + struct hist_entry *he = rb_entry(node, struct hist_entry, rb_node); + + if (can_goto_child(he, hmd)) + node = rb_first(&he->hroot_out); + else + node = rb_next(node); + + while (node == NULL) { + he = he->parent_he; + if (he == NULL) + break; + + node = rb_next(&he->rb_node); + } + return node; +} + +struct rb_node *rb_hierarchy_prev(struct rb_node *node) +{ + struct hist_entry *he = rb_entry(node, struct hist_entry, rb_node); + + node = rb_prev(node); + if (node) + return rb_hierarchy_last(node); + + he = he->parent_he; + if (he == NULL) + return NULL; + + return &he->rb_node; +} + static void hists__remove_entry_filter(struct hists *hists, struct hist_entry *h, 
enum hist_filter filter) { diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 044419b3..5690906 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -418,4 +418,20 @@ int perf_hist_config(const char *var, const char *value); void perf_hpp_list__init(struct perf_hpp_list *list); +enum hierarchy_move_dir { + HMD_NORMAL, + HMD_FORCE_SIBLING, + HMD_FORCE_CHILD, +}; + +struct rb_node *rb_hierarchy_last(struct rb_node *node); +struct rb_node *__rb_hierarchy_next(struct rb_node *node, + enum hierarchy_move_dir hmd); +struct rb_node *rb_hierarchy_prev(struct rb_node *node); + +static inline struct rb_node *rb_hierarchy_next(struct rb_node *node) +{ + return __rb_hierarchy_next(node, HMD_NORMAL); +} + #endif /* __PERF_HIST_H */ -- cgit v0.10.2 From 2da897e51d7f23f872224218b9a4314503b0d360 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 23 Feb 2016 02:05:26 +0100 Subject: bpf: fix csum setting for bpf_set_tunnel_key The fix in 35e2d1152b22 ("tunnels: Allow IPv6 UDP checksums to be correctly controlled.") changed behavior for bpf_set_tunnel_key() when in use with IPv6 and thus uncovered a bug that TUNNEL_CSUM needed to be set but wasn't. As a result, the stack dropped ingress vxlan IPv6 packets, that have been sent via eBPF through collect meta data mode due to checksum now being zero. Since after LCO, we enable IPv4 checksum by default, so make that analogous and only provide a flag BPF_F_ZERO_CSUM_TX for the user to turn it off in IPv4 case. Fixes: 35e2d1152b22 ("tunnels: Allow IPv6 UDP checksums to be correctly controlled.") Fixes: c6c33454072f ("bpf: support ipv6 for bpf_skb_{set,get}_tunnel_key") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index aa6f857..5df4881 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -292,6 +292,9 @@ enum bpf_func_id { /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ #define BPF_F_TUNINFO_IPV6 (1ULL << 0) +/* BPF_FUNC_skb_set_tunnel_key flags. */ +#define BPF_F_ZERO_CSUM_TX (1ULL << 1) + /* user accessible mirror of in-kernel sk_buff. 
* new fields can only be added to the end of this structure */ diff --git a/net/core/filter.c b/net/core/filter.c index 94d2620..bba502f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1752,7 +1752,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; - if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6))) + if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { @@ -1776,7 +1776,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info = &md->u.tun_info; info->mode = IP_TUNNEL_INFO_TX; - info->key.tun_flags = TUNNEL_KEY; + info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM; info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; info->key.ttl = from->tunnel_ttl; @@ -1787,6 +1787,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) sizeof(from->remote_ipv6)); } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); + if (flags & BPF_F_ZERO_CSUM_TX) + info->key.tun_flags &= ~TUNNEL_CSUM; } return 0; -- cgit v0.10.2 From 54430101d2af260dba2d129cc9d9b7c7e60087b0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 25 Feb 2016 00:13:37 +0900 Subject: perf hists: Introduce hist_entry__filter() The hist_entry__filter() function is to filter hist entries using sort key related info. This is needed to support hierarchy mode since each hist entry will be associated with a hpp fmt which has a sort key. So each entry should compare to only matching type of filters. To do that, add the ->se_filter callback field to struct sort_entry. This callback takes 'type' argument which determines whether it's matching sort key or not. It returns -1 for non-matching type, 0 for filtered entry and 1 for not filtered entries. 
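As an illustration (not part of the patch), this is roughly how a caller is expected to consume that return value; it is condensed from the hists__filter_hierarchy() cases added later in this series, with he/hists/type/arg standing for the usual hist entry, hists, filter type and filter argument:

	int ret = hist_entry__filter(he, type, arg);

	if (ret < 0) {
		/* non-matching type: this entry's sort key cannot decide the filter */
	} else if (ret == 1) {
		/* matched type: mark the entry as filtered out */
		he->filtered |= (1 << type);
	} else {
		/* not filtered: clear the marker for this filter type */
		hists__remove_entry_filter(hists, he, type);
	}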
Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456326830-30456-6-git-send-email-namhyung@kernel.org [ 'socket' is reserved in sys/socket.h, so replace it with 'sk' ] Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 5690906..480d2eb 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -305,6 +305,8 @@ bool perf_hpp__is_trace_entry(struct perf_hpp_fmt *fmt); bool perf_hpp__is_srcline_entry(struct perf_hpp_fmt *fmt); bool perf_hpp__is_srcfile_entry(struct perf_hpp_fmt *fmt); +int hist_entry__filter(struct hist_entry *he, int type, const void *arg); + static inline bool perf_hpp__should_skip(struct perf_hpp_fmt *format, struct hists *hists) { diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 358035c..6bee8bd 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -90,10 +90,21 @@ static int hist_entry__thread_snprintf(struct hist_entry *he, char *bf, width, width, comm ?: ""); } +static int hist_entry__thread_filter(struct hist_entry *he, int type, const void *arg) +{ + const struct thread *th = arg; + + if (type != HIST_FILTER__THREAD) + return -1; + + return th && he->thread != th; +} + struct sort_entry sort_thread = { .se_header = " Pid:Command", .se_cmp = sort__thread_cmp, .se_snprintf = hist_entry__thread_snprintf, + .se_filter = hist_entry__thread_filter, .se_width_idx = HISTC_THREAD, }; @@ -131,6 +142,7 @@ struct sort_entry sort_comm = { .se_collapse = sort__comm_collapse, .se_sort = sort__comm_sort, .se_snprintf = hist_entry__comm_snprintf, + .se_filter = hist_entry__thread_filter, .se_width_idx = HISTC_COMM, }; @@ -180,10 +192,21 @@ static int hist_entry__dso_snprintf(struct hist_entry *he, char *bf, return _hist_entry__dso_snprintf(he->ms.map, bf, size, width); } +static int hist_entry__dso_filter(struct hist_entry *he, int type, const void *arg) +{ + const struct dso *dso = arg; + + if (type != HIST_FILTER__DSO) + return -1; + + return dso && (!he->ms.map || he->ms.map->dso != dso); +} + struct sort_entry sort_dso = { .se_header = "Shared Object", .se_cmp = sort__dso_cmp, .se_snprintf = hist_entry__dso_snprintf, + .se_filter = hist_entry__dso_filter, .se_width_idx = HISTC_DSO, }; @@ -277,11 +300,22 @@ static int hist_entry__sym_snprintf(struct hist_entry *he, char *bf, he->level, bf, size, width); } +static int hist_entry__sym_filter(struct hist_entry *he, int type, const void *arg) +{ + const char *sym = arg; + + if (type != HIST_FILTER__SYMBOL) + return -1; + + return sym && (!he->ms.sym || !strstr(he->ms.sym->name, sym)); +} + struct sort_entry sort_sym = { .se_header = "Symbol", .se_cmp = sort__sym_cmp, .se_sort = sort__sym_sort, .se_snprintf = hist_entry__sym_snprintf, + .se_filter = hist_entry__sym_filter, .se_width_idx = HISTC_SYMBOL, }; @@ -440,10 +474,21 @@ static int hist_entry__socket_snprintf(struct hist_entry *he, char *bf, return repsep_snprintf(bf, size, "%*.*d", width, width-3, he->socket); } +static int hist_entry__socket_filter(struct hist_entry *he, int type, const void *arg) +{ + int sk = *(const int *)arg; + + if (type != HIST_FILTER__SOCKET) + return -1; + + return sk >= 0 && he->socket != sk; +} + struct sort_entry sort_socket = { .se_header = "Socket", .se_cmp = sort__socket_cmp, .se_snprintf = hist_entry__socket_snprintf, + .se_filter = hist_entry__socket_filter, .se_width_idx = HISTC_SOCKET, }; @@ -530,6 +575,18 @@ static 
int hist_entry__dso_from_snprintf(struct hist_entry *he, char *bf, return repsep_snprintf(bf, size, "%-*.*s", width, width, "N/A"); } +static int hist_entry__dso_from_filter(struct hist_entry *he, int type, + const void *arg) +{ + const struct dso *dso = arg; + + if (type != HIST_FILTER__DSO) + return -1; + + return dso && (!he->branch_info || !he->branch_info->from.map || + he->branch_info->from.map->dso != dso); +} + static int64_t sort__dso_to_cmp(struct hist_entry *left, struct hist_entry *right) { @@ -550,6 +607,18 @@ static int hist_entry__dso_to_snprintf(struct hist_entry *he, char *bf, return repsep_snprintf(bf, size, "%-*.*s", width, width, "N/A"); } +static int hist_entry__dso_to_filter(struct hist_entry *he, int type, + const void *arg) +{ + const struct dso *dso = arg; + + if (type != HIST_FILTER__DSO) + return -1; + + return dso && (!he->branch_info || !he->branch_info->to.map || + he->branch_info->to.map->dso != dso); +} + static int64_t sort__sym_from_cmp(struct hist_entry *left, struct hist_entry *right) { @@ -611,10 +680,35 @@ static int hist_entry__sym_to_snprintf(struct hist_entry *he, char *bf, return repsep_snprintf(bf, size, "%-*.*s", width, width, "N/A"); } +static int hist_entry__sym_from_filter(struct hist_entry *he, int type, + const void *arg) +{ + const char *sym = arg; + + if (type != HIST_FILTER__SYMBOL) + return -1; + + return sym && !(he->branch_info && he->branch_info->from.sym && + strstr(he->branch_info->from.sym->name, sym)); +} + +static int hist_entry__sym_to_filter(struct hist_entry *he, int type, + const void *arg) +{ + const char *sym = arg; + + if (type != HIST_FILTER__SYMBOL) + return -1; + + return sym && !(he->branch_info && he->branch_info->to.sym && + strstr(he->branch_info->to.sym->name, sym)); +} + struct sort_entry sort_dso_from = { .se_header = "Source Shared Object", .se_cmp = sort__dso_from_cmp, .se_snprintf = hist_entry__dso_from_snprintf, + .se_filter = hist_entry__dso_from_filter, .se_width_idx = HISTC_DSO_FROM, }; @@ -622,6 +716,7 @@ struct sort_entry sort_dso_to = { .se_header = "Target Shared Object", .se_cmp = sort__dso_to_cmp, .se_snprintf = hist_entry__dso_to_snprintf, + .se_filter = hist_entry__dso_to_filter, .se_width_idx = HISTC_DSO_TO, }; @@ -629,6 +724,7 @@ struct sort_entry sort_sym_from = { .se_header = "Source Symbol", .se_cmp = sort__sym_from_cmp, .se_snprintf = hist_entry__sym_from_snprintf, + .se_filter = hist_entry__sym_from_filter, .se_width_idx = HISTC_SYMBOL_FROM, }; @@ -636,6 +732,7 @@ struct sort_entry sort_sym_to = { .se_header = "Target Symbol", .se_cmp = sort__sym_to_cmp, .se_snprintf = hist_entry__sym_to_snprintf, + .se_filter = hist_entry__sym_to_filter, .se_width_idx = HISTC_SYMBOL_TO, }; @@ -1498,6 +1595,22 @@ static struct perf_hpp_fmt *__hpp_dimension__alloc_hpp(struct hpp_dimension *hd) return fmt; } +int hist_entry__filter(struct hist_entry *he, int type, const void *arg) +{ + struct perf_hpp_fmt *fmt; + struct hpp_sort_entry *hse; + + fmt = he->fmt; + if (fmt == NULL || !perf_hpp__is_sort_entry(fmt)) + return -1; + + hse = container_of(fmt, struct hpp_sort_entry, hpp); + if (hse->se->se_filter == NULL) + return -1; + + return hse->se->se_filter(he, type, arg); +} + static int __sort_dimension__add_hpp_sort(struct sort_dimension *sd) { struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd); diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 10315e0..a8d53ff 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -245,6 +245,7 @@ struct sort_entry { int64_t 
(*se_sort)(struct hist_entry *, struct hist_entry *); int (*se_snprintf)(struct hist_entry *he, char *bf, size_t size, unsigned int width); + int (*se_filter)(struct hist_entry *he, int type, const void *arg); u8 se_width_idx; }; -- cgit v0.10.2 From 155e9afff77916931f615a394cef187b342530dc Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 25 Feb 2016 00:13:38 +0900 Subject: perf hists: Support filtering in hierarchy mode The hists__filter_hierarchy() function implements filtering in hierarchy mode. Now we have hist_entry__filter() so use it for entries in the hierarchy. It returns 3 kind of values. A negative value means that it's not filtered by this type. It marks current entry as filtered tentatively so if a lower level entry removes the filter it also removes the all parent so that we can find the entry in the output. Zero means it's filtered out by this type. A positive value means it's not filtered so it removes the filter and shows in the output. In these cases, it moves to next entry since lower level entry won't match by this type of filter anymore. Thus all children will be filtered or not together. Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456326830-30456-7-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 358af7e..dbab977 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1560,6 +1560,27 @@ static void hists__remove_entry_filter(struct hists *hists, struct hist_entry *h enum hist_filter filter) { h->filtered &= ~(1 << filter); + + if (symbol_conf.report_hierarchy) { + struct hist_entry *parent = h->parent_he; + + while (parent) { + he_stat__add_stat(&parent->stat, &h->stat); + + parent->filtered &= ~(1 << filter); + + if (parent->filtered) + goto next; + + /* force fold unfiltered entry for simplicity */ + parent->unfolded = false; + parent->row_offset = 0; + parent->nr_rows = 0; +next: + parent = parent->parent_he; + } + } + if (h->filtered) return; @@ -1645,28 +1666,92 @@ static void hists__filter_by_type(struct hists *hists, int type, filter_fn_t fil } } +static void hists__filter_hierarchy(struct hists *hists, int type, const void *arg) +{ + struct rb_node *nd; + + hists->stats.nr_non_filtered_samples = 0; + + hists__reset_filter_stats(hists); + hists__reset_col_len(hists); + + nd = rb_first(&hists->entries); + while (nd) { + struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); + int ret; + + ret = hist_entry__filter(h, type, arg); + + /* + * case 1. non-matching type + * zero out the period, set filter marker and move to child + */ + if (ret < 0) { + memset(&h->stat, 0, sizeof(h->stat)); + h->filtered |= (1 << type); + + nd = __rb_hierarchy_next(&h->rb_node, HMD_FORCE_CHILD); + } + /* + * case 2. matched type (filter out) + * set filter marker and move to next + */ + else if (ret == 1) { + h->filtered |= (1 << type); + + nd = __rb_hierarchy_next(&h->rb_node, HMD_FORCE_SIBLING); + } + /* + * case 3. 
ok (not filtered) + * add period to hists and parents, erase the filter marker + * and move to next sibling + */ + else { + hists__remove_entry_filter(hists, h, type); + + nd = __rb_hierarchy_next(&h->rb_node, HMD_FORCE_SIBLING); + } + } +} + void hists__filter_by_thread(struct hists *hists) { - hists__filter_by_type(hists, HIST_FILTER__THREAD, - hists__filter_entry_by_thread); + if (symbol_conf.report_hierarchy) + hists__filter_hierarchy(hists, HIST_FILTER__THREAD, + hists->thread_filter); + else + hists__filter_by_type(hists, HIST_FILTER__THREAD, + hists__filter_entry_by_thread); } void hists__filter_by_dso(struct hists *hists) { - hists__filter_by_type(hists, HIST_FILTER__DSO, - hists__filter_entry_by_dso); + if (symbol_conf.report_hierarchy) + hists__filter_hierarchy(hists, HIST_FILTER__DSO, + hists->dso_filter); + else + hists__filter_by_type(hists, HIST_FILTER__DSO, + hists__filter_entry_by_dso); } void hists__filter_by_symbol(struct hists *hists) { - hists__filter_by_type(hists, HIST_FILTER__SYMBOL, - hists__filter_entry_by_symbol); + if (symbol_conf.report_hierarchy) + hists__filter_hierarchy(hists, HIST_FILTER__SYMBOL, + hists->symbol_filter_str); + else + hists__filter_by_type(hists, HIST_FILTER__SYMBOL, + hists__filter_entry_by_symbol); } void hists__filter_by_socket(struct hists *hists) { - hists__filter_by_type(hists, HIST_FILTER__SOCKET, - hists__filter_entry_by_socket); + if (symbol_conf.report_hierarchy) + hists__filter_hierarchy(hists, HIST_FILTER__SOCKET, + &hists->socket_filter); + else + hists__filter_by_type(hists, HIST_FILTER__SOCKET, + hists__filter_entry_by_socket); } void events_stats__inc(struct events_stats *stats, u32 type) -- cgit v0.10.2 From 70642850fa581df219d7bc03cd7aca6e1956968c Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 25 Feb 2016 00:13:39 +0900 Subject: perf hists: Resort after filtering hierarchy In hierarchy mode, a filter can affect periods of entries in upper hierarchy. So it needs to resort the hists after filter. 
For example, let's look at following example: Overhead Command / Shared Object / Symbol ------------ -------------------------------- 30.00% perf 20.00% perf 10.00% main 5.00% pr_debug 5.00% memcpy 10.00% [kernel.vmlinux] 8.00% memset 2.00% cpu_idle If we apply simbol filter for 'mem' it should look like this 13.00% perf 8.00% [kernel.vmlinux] 8.00% memset 5.00% perf 5.00% memcpy Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456326830-30456-8-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index dbab977..a44bf5a 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1666,9 +1666,47 @@ static void hists__filter_by_type(struct hists *hists, int type, filter_fn_t fil } } +static void resort_filtered_entry(struct rb_root *root, struct hist_entry *he) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct hist_entry *iter; + struct rb_root new_root = RB_ROOT; + struct rb_node *nd; + + while (*p != NULL) { + parent = *p; + iter = rb_entry(parent, struct hist_entry, rb_node); + + if (hist_entry__sort(he, iter) > 0) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&he->rb_node, parent, p); + rb_insert_color(&he->rb_node, root); + + if (he->leaf || he->filtered) + return; + + nd = rb_first(&he->hroot_out); + while (nd) { + struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); + + nd = rb_next(nd); + rb_erase(&h->rb_node, &he->hroot_out); + + resort_filtered_entry(&new_root, h); + } + + he->hroot_out = new_root; +} + static void hists__filter_hierarchy(struct hists *hists, int type, const void *arg) { struct rb_node *nd; + struct rb_root new_root = RB_ROOT; hists->stats.nr_non_filtered_samples = 0; @@ -1712,6 +1750,22 @@ static void hists__filter_hierarchy(struct hists *hists, int type, const void *a nd = __rb_hierarchy_next(&h->rb_node, HMD_FORCE_SIBLING); } } + + /* + * resort output after applying a new filter since filter in a lower + * hierarchy can change periods in a upper hierarchy. + */ + nd = rb_first(&hists->entries); + while (nd) { + struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); + + nd = rb_next(nd); + rb_erase(&h->rb_node, &hists->entries); + + resort_filtered_entry(&new_root, h); + } + + hists->entries = new_root; } void hists__filter_by_thread(struct hists *hists) -- cgit v0.10.2 From 1f2d72cf3258eacd667cd1920e64c9b64b9984d5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 25 Feb 2016 00:13:40 +0900 Subject: perf hists: Count number of sort keys It'll be used for hierarchy output mode to indent entries properly. 
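A minimal sketch of how the counter gets consumed (lifted from the stdio and TUI output patches later in this series, where HIERARCHY_INDENT is defined as 3):

	int nr_sort = hists->hpp_list->nr_sort_keys;
	/* horizontal space reserved for the deepest hierarchy level */
	int hierarchy_indent = (nr_sort - 1) * HIERARCHY_INDENT;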
Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456326830-30456-9-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 12223d7..edbf854 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -514,6 +514,9 @@ void perf_hpp_list__column_register(struct perf_hpp_list *list, void perf_hpp_list__register_sort_field(struct perf_hpp_list *list, struct perf_hpp_fmt *format) { + if (perf_hpp__is_sort_entry(format) || perf_hpp__is_dynamic_entry(format)) + list->nr_sort_keys++; + list_add_tail(&format->sort_list, &list->sorts); } diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 480d2eb..d08e4f3 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -237,6 +237,7 @@ struct perf_hpp_fmt { struct perf_hpp_list { struct list_head fields; struct list_head sorts; + int nr_sort_keys; }; extern struct perf_hpp_list perf_hpp_list; -- cgit v0.10.2 From ef86d68a088c324e4bd85f82387d1f9a571affd0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 25 Feb 2016 00:13:41 +0900 Subject: perf ui/stdio: Implement hierarchy output mode The hierarchy output mode is to group entries for each level so that user can see higher level picture more easily. It also helps to find out which component is most costly. The output will look like below: 15.11% swapper 14.97% [kernel.vmlinux] 0.09% [libahci] 0.05% [iwlwifi] 10.29% irq/33-iwlwifi 6.45% [kernel.vmlinux] 1.41% [mac80211] 1.15% [iwldvm] 1.14% [iwlwifi] 0.14% [cfg80211] 4.81% firefox 3.92% libxul.so 0.34% [kernel.vmlinux] Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456326830-30456-10-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 87b022f..90b8677 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -410,6 +410,76 @@ static int hist_entry__snprintf(struct hist_entry *he, struct perf_hpp *hpp) return hpp->buf - start; } +static int hist_entry__hierarchy_fprintf(struct hist_entry *he, + struct perf_hpp *hpp, + int nr_sort_key, struct hists *hists, + FILE *fp) +{ + const char *sep = symbol_conf.field_sep; + struct perf_hpp_fmt *fmt; + char *buf = hpp->buf; + int ret, printed = 0; + bool first = true; + + if (symbol_conf.exclude_other && !he->parent) + return 0; + + ret = scnprintf(hpp->buf, hpp->size, "%*s", he->depth * HIERARCHY_INDENT, ""); + advance_hpp(hpp, ret); + + hists__for_each_format(he->hists, fmt) { + if (perf_hpp__is_sort_entry(fmt) || perf_hpp__is_dynamic_entry(fmt)) + break; + + /* + * If there's no field_sep, we still need + * to display initial ' '. 
+ */ + if (!sep || !first) { + ret = scnprintf(hpp->buf, hpp->size, "%s", sep ?: " "); + advance_hpp(hpp, ret); + } else + first = false; + + if (perf_hpp__use_color() && fmt->color) + ret = fmt->color(fmt, hpp, he); + else + ret = fmt->entry(fmt, hpp, he); + + ret = hist_entry__snprintf_alignment(he, hpp, fmt, ret); + advance_hpp(hpp, ret); + } + + if (sep) + ret = scnprintf(hpp->buf, hpp->size, "%s", sep); + else + ret = scnprintf(hpp->buf, hpp->size, "%*s", + (nr_sort_key - 1) * HIERARCHY_INDENT + 2, ""); + advance_hpp(hpp, ret); + + /* + * No need to call hist_entry__snprintf_alignment() since this + * fmt is always the last column in the hierarchy mode. + */ + fmt = he->fmt; + if (perf_hpp__use_color() && fmt->color) + fmt->color(fmt, hpp, he); + else + fmt->entry(fmt, hpp, he); + + printed += fprintf(fp, "%s\n", buf); + + if (symbol_conf.use_callchain && he->leaf) { + u64 total = hists__total_period(hists); + + printed += hist_entry_callchain__fprintf(he, total, 0, fp); + goto out; + } + +out: + return printed; +} + static int hist_entry__fprintf(struct hist_entry *he, size_t size, struct hists *hists, char *bf, size_t bfsz, FILE *fp) @@ -424,6 +494,13 @@ static int hist_entry__fprintf(struct hist_entry *he, size_t size, if (size == 0 || size > bfsz) size = hpp.size = bfsz; + if (symbol_conf.report_hierarchy) { + int nr_sort = hists->hpp_list->nr_sort_keys; + + return hist_entry__hierarchy_fprintf(he, &hpp, nr_sort, + hists, fp); + } + hist_entry__snprintf(he, &hpp); ret = fprintf(fp, "%s\n", bf); @@ -522,7 +599,7 @@ print_entries: goto out; } - for (nd = rb_first(&hists->entries); nd; nd = rb_next(nd)) { + for (nd = rb_first(&hists->entries); nd; nd = __rb_hierarchy_next(nd, HMD_FORCE_CHILD)) { struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); float percent; diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index d08e4f3..722aa44 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -437,4 +437,6 @@ static inline struct rb_node *rb_hierarchy_next(struct rb_node *node) return __rb_hierarchy_next(node, HMD_NORMAL); } +#define HIERARCHY_INDENT 3 + #endif /* __PERF_HIST_H */ -- cgit v0.10.2 From 8e2fc44f46ba4d3d1ee8b6ba0839a282a9f3fdd7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 25 Feb 2016 00:13:42 +0900 Subject: perf ui/stdio: Align column header for hierarchy output The hierarchy output mode is to group entries so the existing columns won't fit to the new output. Treat all sort keys as a single column and separate headers by "/". # Overhead Command / Shared Object # ........... ................................ # 15.11% swapper 14.97% [kernel.vmlinux] 0.09% [libahci] 0.05% [iwlwifi] ... 
Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456326830-30456-11-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 90b8677..435eaaa 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -511,6 +511,106 @@ static int hist_entry__fprintf(struct hist_entry *he, size_t size, return ret; } +static int print_hierarchy_indent(const char *sep, int nr_sort, + const char *line, FILE *fp) +{ + if (sep != NULL || nr_sort < 1) + return 0; + + return fprintf(fp, "%-.*s", (nr_sort - 1) * HIERARCHY_INDENT, line); +} + +static int print_hierarchy_header(struct hists *hists, struct perf_hpp *hpp, + const char *sep, FILE *fp) +{ + bool first = true; + int nr_sort; + unsigned width = 0; + unsigned header_width = 0; + struct perf_hpp_fmt *fmt; + + nr_sort = hists->hpp_list->nr_sort_keys; + + /* preserve max indent depth for column headers */ + print_hierarchy_indent(sep, nr_sort, spaces, fp); + + hists__for_each_format(hists, fmt) { + if (perf_hpp__is_sort_entry(fmt) || perf_hpp__is_dynamic_entry(fmt)) + break; + + if (!first) + fprintf(fp, "%s", sep ?: " "); + else + first = false; + + fmt->header(fmt, hpp, hists_to_evsel(hists)); + fprintf(fp, "%s", hpp->buf); + } + + /* combine sort headers with ' / ' */ + first = true; + hists__for_each_format(hists, fmt) { + if (!perf_hpp__is_sort_entry(fmt) && !perf_hpp__is_dynamic_entry(fmt)) + continue; + if (perf_hpp__should_skip(fmt, hists)) + continue; + + if (!first) + header_width += fprintf(fp, " / "); + else { + header_width += fprintf(fp, "%s", sep ?: " "); + first = false; + } + + fmt->header(fmt, hpp, hists_to_evsel(hists)); + rtrim(hpp->buf); + + header_width += fprintf(fp, "%s", hpp->buf); + } + + /* preserve max indent depth for combined sort headers */ + print_hierarchy_indent(sep, nr_sort, spaces, fp); + + fprintf(fp, "\n# "); + + /* preserve max indent depth for initial dots */ + print_hierarchy_indent(sep, nr_sort, dots, fp); + + first = true; + hists__for_each_format(hists, fmt) { + if (perf_hpp__is_sort_entry(fmt) || perf_hpp__is_dynamic_entry(fmt)) + break; + + if (!first) + fprintf(fp, "%s", sep ?: " "); + else + first = false; + + width = fmt->width(fmt, hpp, hists_to_evsel(hists)); + fprintf(fp, "%.*s", width, dots); + } + + hists__for_each_format(hists, fmt) { + if (!perf_hpp__is_sort_entry(fmt) && !perf_hpp__is_dynamic_entry(fmt)) + continue; + if (perf_hpp__should_skip(fmt, hists)) + continue; + + width = fmt->width(fmt, hpp, hists_to_evsel(hists)); + if (width > header_width) + header_width = width; + } + + fprintf(fp, "%s%-.*s", sep ?: " ", header_width, dots); + + /* preserve max indent depth for dots under sort headers */ + print_hierarchy_indent(sep, nr_sort, dots, fp); + + fprintf(fp, "\n#\n"); + + return 2; +} + size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, int max_cols, float min_pcnt, FILE *fp) { @@ -542,6 +642,11 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, fprintf(fp, "# "); + if (symbol_conf.report_hierarchy) { + nr_rows += print_hierarchy_header(hists, &dummy_hpp, sep, fp); + goto print_entries; + } + hists__for_each_format(hists, fmt) { if (perf_hpp__should_skip(fmt, hists)) continue; diff --git a/tools/perf/util/ctype.c b/tools/perf/util/ctype.c index aada3ac..d4a5a21 100644 --- a/tools/perf/util/ctype.c +++ 
b/tools/perf/util/ctype.c @@ -32,8 +32,17 @@ unsigned char sane_ctype[256] = { const char *graph_line = "_____________________________________________________________________" + "_____________________________________________________________________" "_____________________________________________________________________"; const char *graph_dotted_line = "---------------------------------------------------------------------" "---------------------------------------------------------------------" "---------------------------------------------------------------------"; +const char *spaces = + " " + " " + " "; +const char *dots = + "....................................................................." + "....................................................................." + "....................................................................."; diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 7015019..d0d50ce 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -82,6 +82,8 @@ extern const char *graph_line; extern const char *graph_dotted_line; +extern const char *spaces; +extern const char *dots; extern char buildid_dir[]; /* On most systems would have given us this, but -- cgit v0.10.2 From f5b763feebe9770c3e6b01f1e19860e95f24b623 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 25 Feb 2016 00:13:43 +0900 Subject: perf hists browser: Count number of hierarchy entries Add nr_hierarchy_entries field to keep current number of (unfolded) hist entries. And the hist_entry->nr_rows carries number of direct children. But in the hierarchy mode, entry can have grand children and callchains. So update the number properly using hierarchy_count_rows() when toggling the folded state (by pressing ENTER key). Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456326830-30456-12-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 1819771..de1d6f0 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -32,6 +32,7 @@ struct hist_browser { bool show_headers; float min_pcnt; u64 nr_non_filtered_entries; + u64 nr_hierarchy_entries; u64 nr_callchain_rows; }; @@ -58,11 +59,11 @@ static int hist_browser__get_folding(struct hist_browser *browser) for (nd = rb_first(&hists->entries); (nd = hists__filter_entries(nd, browser->min_pcnt)) != NULL; - nd = rb_next(nd)) { + nd = rb_hierarchy_next(nd)) { struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node); - if (he->unfolded) + if (he->leaf && he->unfolded) unfolded_rows += he->nr_rows; } return unfolded_rows; @@ -72,7 +73,9 @@ static u32 hist_browser__nr_entries(struct hist_browser *hb) { u32 nr_entries; - if (hist_browser__has_filter(hb)) + if (symbol_conf.report_hierarchy) + nr_entries = hb->nr_hierarchy_entries; + else if (hist_browser__has_filter(hb)) nr_entries = hb->nr_non_filtered_entries; else nr_entries = hb->hists->nr_entries; @@ -247,6 +250,35 @@ static int callchain__count_rows(struct rb_root *chain) return n; } +static int hierarchy_count_rows(struct hist_browser *hb, struct hist_entry *he, + bool include_children) +{ + int count = 0; + struct rb_node *node; + struct hist_entry *child; + + if (he->leaf) + return callchain__count_rows(&he->sorted_chain); + + node = rb_first(&he->hroot_out); + while (node) { + float percent; + + child = rb_entry(node, 
struct hist_entry, rb_node); + percent = hist_entry__get_percent_limit(child); + + if (!child->filtered && percent >= hb->min_pcnt) { + count++; + + if (include_children && child->unfolded) + count += hierarchy_count_rows(hb, child, true); + } + + node = rb_next(node); + } + return count; +} + static bool hist_entry__toggle_fold(struct hist_entry *he) { if (!he) @@ -326,11 +358,17 @@ static void callchain__init_have_children(struct rb_root *root) static void hist_entry__init_have_children(struct hist_entry *he) { - if (!he->init_have_children) { + if (he->init_have_children) + return; + + if (he->leaf) { he->has_children = !RB_EMPTY_ROOT(&he->sorted_chain); callchain__init_have_children(&he->sorted_chain); - he->init_have_children = true; + } else { + he->has_children = !RB_EMPTY_ROOT(&he->hroot_out); } + + he->init_have_children = true; } static bool hist_browser__toggle_fold(struct hist_browser *browser) @@ -349,17 +387,41 @@ static bool hist_browser__toggle_fold(struct hist_browser *browser) has_children = callchain_list__toggle_fold(cl); if (has_children) { + int child_rows = 0; + hist_entry__init_have_children(he); browser->b.nr_entries -= he->nr_rows; - browser->nr_callchain_rows -= he->nr_rows; - if (he->unfolded) - he->nr_rows = callchain__count_rows(&he->sorted_chain); + if (he->leaf) + browser->nr_callchain_rows -= he->nr_rows; else + browser->nr_hierarchy_entries -= he->nr_rows; + + if (symbol_conf.report_hierarchy) + child_rows = hierarchy_count_rows(browser, he, true); + + if (he->unfolded) { + if (he->leaf) + he->nr_rows = callchain__count_rows(&he->sorted_chain); + else + he->nr_rows = hierarchy_count_rows(browser, he, false); + + /* account grand children */ + if (symbol_conf.report_hierarchy) + browser->b.nr_entries += child_rows - he->nr_rows; + } else { + if (symbol_conf.report_hierarchy) + browser->b.nr_entries -= child_rows - he->nr_rows; + he->nr_rows = 0; + } browser->b.nr_entries += he->nr_rows; - browser->nr_callchain_rows += he->nr_rows; + + if (he->leaf) + browser->nr_callchain_rows += he->nr_rows; + else + browser->nr_hierarchy_entries += he->nr_rows; return true; } @@ -2025,17 +2087,18 @@ static void hist_browser__update_nr_entries(struct hist_browser *hb) u64 nr_entries = 0; struct rb_node *nd = rb_first(&hb->hists->entries); - if (hb->min_pcnt == 0) { + if (hb->min_pcnt == 0 && !symbol_conf.report_hierarchy) { hb->nr_non_filtered_entries = hb->hists->nr_non_filtered_entries; return; } while ((nd = hists__filter_entries(nd, hb->min_pcnt)) != NULL) { nr_entries++; - nd = rb_next(nd); + nd = rb_hierarchy_next(nd); } hb->nr_non_filtered_entries = nr_entries; + hb->nr_hierarchy_entries = nr_entries; } static void hist_browser__update_percent_limit(struct hist_browser *hb, -- cgit v0.10.2 From 492b1010606e9222690992ad8e4898a88a696856 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 25 Feb 2016 00:13:44 +0900 Subject: perf hists browser: Support collapsing/expanding whole entries in hierarchy The 'C' and 'E' keys are to collapse/expand all hist entries. Update nr_hierarchy_entries properly in this case. 
Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456326830-30456-13-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index de1d6f0..857b9be 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -484,13 +484,38 @@ static int callchain__set_folding(struct rb_root *chain, bool unfold) return n; } -static void hist_entry__set_folding(struct hist_entry *he, bool unfold) +static int hierarchy_set_folding(struct hist_browser *hb, struct hist_entry *he, + bool unfold __maybe_unused) +{ + float percent; + struct rb_node *nd; + struct hist_entry *child; + int n = 0; + + for (nd = rb_first(&he->hroot_out); nd; nd = rb_next(nd)) { + child = rb_entry(nd, struct hist_entry, rb_node); + percent = hist_entry__get_percent_limit(child); + if (!child->filtered && percent >= hb->min_pcnt) + n++; + } + + return n; +} + +static void hist_entry__set_folding(struct hist_entry *he, + struct hist_browser *hb, bool unfold) { hist_entry__init_have_children(he); he->unfolded = unfold ? he->has_children : false; if (he->has_children) { - int n = callchain__set_folding(&he->sorted_chain, unfold); + int n; + + if (he->leaf) + n = callchain__set_folding(&he->sorted_chain, unfold); + else + n = hierarchy_set_folding(hb, he, unfold); + he->nr_rows = unfold ? n : 0; } else he->nr_rows = 0; @@ -500,19 +525,32 @@ static void __hist_browser__set_folding(struct hist_browser *browser, bool unfold) { struct rb_node *nd; - struct hists *hists = browser->hists; + struct hist_entry *he; + double percent; - for (nd = rb_first(&hists->entries); - (nd = hists__filter_entries(nd, browser->min_pcnt)) != NULL; - nd = rb_next(nd)) { - struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node); - hist_entry__set_folding(he, unfold); - browser->nr_callchain_rows += he->nr_rows; + nd = rb_first(&browser->hists->entries); + while (nd) { + he = rb_entry(nd, struct hist_entry, rb_node); + + /* set folding state even if it's currently folded */ + nd = __rb_hierarchy_next(nd, HMD_FORCE_CHILD); + + hist_entry__set_folding(he, browser, unfold); + + percent = hist_entry__get_percent_limit(he); + if (he->filtered || percent < browser->min_pcnt) + continue; + + if (!he->depth || unfold) + browser->nr_hierarchy_entries++; + if (he->leaf) + browser->nr_callchain_rows += he->nr_rows; } } static void hist_browser__set_folding(struct hist_browser *browser, bool unfold) { + browser->nr_hierarchy_entries = 0; browser->nr_callchain_rows = 0; __hist_browser__set_folding(browser, unfold); @@ -2131,7 +2169,7 @@ static void hist_browser__update_percent_limit(struct hist_browser *hb, /* force to re-evaluate folding state of callchains */ he->init_have_children = false; - hist_entry__set_folding(he, false); + hist_entry__set_folding(he, hb, false); nd = rb_next(nd); } -- cgit v0.10.2 From d0506edbec7d04dcca632fddfc162faa78d5527a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 25 Feb 2016 00:13:45 +0900 Subject: perf hists browser: Implement hierarchy output Implement hierarchy mode in TUI. The output is look like stdio but it also supports to fold/unfold children dynamically. 
Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456326830-30456-14-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 857b9be..2bccf68 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1260,6 +1260,158 @@ static int hist_browser__show_entry(struct hist_browser *browser, return printed; } +static int hist_browser__show_hierarchy_entry(struct hist_browser *browser, + struct hist_entry *entry, + unsigned short row, + int level, int nr_sort_keys) +{ + int printed = 0; + int width = browser->b.width; + char folded_sign = ' '; + bool current_entry = ui_browser__is_current_entry(&browser->b, row); + off_t row_offset = entry->row_offset; + bool first = true; + struct perf_hpp_fmt *fmt; + struct hpp_arg arg = { + .b = &browser->b, + .current_entry = current_entry, + }; + int column = 0; + int hierarchy_indent = (nr_sort_keys - 1) * HIERARCHY_INDENT; + + if (current_entry) { + browser->he_selection = entry; + browser->selection = &entry->ms; + } + + hist_entry__init_have_children(entry); + folded_sign = hist_entry__folded(entry); + arg.folded_sign = folded_sign; + + if (entry->leaf && row_offset) { + row_offset--; + goto show_callchain; + } + + hist_browser__gotorc(browser, row, 0); + + if (current_entry && browser->b.navkeypressed) + ui_browser__set_color(&browser->b, HE_COLORSET_SELECTED); + else + ui_browser__set_color(&browser->b, HE_COLORSET_NORMAL); + + ui_browser__write_nstring(&browser->b, "", level * HIERARCHY_INDENT); + width -= level * HIERARCHY_INDENT; + + hists__for_each_format(entry->hists, fmt) { + char s[2048]; + struct perf_hpp hpp = { + .buf = s, + .size = sizeof(s), + .ptr = &arg, + }; + + if (perf_hpp__should_skip(fmt, entry->hists) || + column++ < browser->b.horiz_scroll) + continue; + + if (perf_hpp__is_sort_entry(fmt) || + perf_hpp__is_dynamic_entry(fmt)) + break; + + if (current_entry && browser->b.navkeypressed) { + ui_browser__set_color(&browser->b, + HE_COLORSET_SELECTED); + } else { + ui_browser__set_color(&browser->b, + HE_COLORSET_NORMAL); + } + + if (first) { + ui_browser__printf(&browser->b, "%c", folded_sign); + width--; + first = false; + } else { + ui_browser__printf(&browser->b, " "); + width -= 2; + } + + if (fmt->color) { + int ret = fmt->color(fmt, &hpp, entry); + hist_entry__snprintf_alignment(entry, &hpp, fmt, ret); + /* + * fmt->color() already used ui_browser to + * print the non alignment bits, skip it (+ret): + */ + ui_browser__printf(&browser->b, "%s", s + ret); + } else { + int ret = fmt->entry(fmt, &hpp, entry); + hist_entry__snprintf_alignment(entry, &hpp, fmt, ret); + ui_browser__printf(&browser->b, "%s", s); + } + width -= hpp.buf - s; + } + + ui_browser__write_nstring(&browser->b, "", hierarchy_indent); + width -= hierarchy_indent; + + if (column >= browser->b.horiz_scroll) { + char s[2048]; + struct perf_hpp hpp = { + .buf = s, + .size = sizeof(s), + .ptr = &arg, + }; + + if (current_entry && browser->b.navkeypressed) { + ui_browser__set_color(&browser->b, + HE_COLORSET_SELECTED); + } else { + ui_browser__set_color(&browser->b, + HE_COLORSET_NORMAL); + } + + ui_browser__write_nstring(&browser->b, "", 2); + width -= 2; + + /* + * No need to call hist_entry__snprintf_alignment() + * since this fmt is always the last column in the + * hierarchy mode. 
+ */ + fmt = entry->fmt; + if (fmt->color) { + width -= fmt->color(fmt, &hpp, entry); + } else { + width -= fmt->entry(fmt, &hpp, entry); + ui_browser__printf(&browser->b, "%s", s); + } + } + + /* The scroll bar isn't being used */ + if (!browser->b.navkeypressed) + width += 1; + + ui_browser__write_nstring(&browser->b, "", width); + + ++row; + ++printed; + +show_callchain: + if (entry->leaf && folded_sign == '-' && row != browser->b.rows) { + struct callchain_print_arg carg = { + .row_offset = row_offset, + }; + + printed += hist_browser__show_callchain(browser, entry, + level + 1, row, + hist_browser__show_callchain_entry, &carg, + hist_browser__check_output_full); + } + + return printed; +} + static int advance_hpp_check(struct perf_hpp *hpp, int inc) { advance_hpp(hpp, inc); @@ -1325,6 +1477,7 @@ static unsigned int hist_browser__refresh(struct ui_browser *browser) u16 header_offset = 0; struct rb_node *nd; struct hist_browser *hb = container_of(browser, struct hist_browser, b); + int nr_sort = hb->hists->hpp_list->nr_sort_keys; if (hb->show_headers) { hist_browser__show_headers(hb); @@ -1335,18 +1488,28 @@ static unsigned int hist_browser__refresh(struct ui_browser *browser) hb->he_selection = NULL; hb->selection = NULL; - for (nd = browser->top; nd; nd = rb_next(nd)) { + for (nd = browser->top; nd; nd = rb_hierarchy_next(nd)) { struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); float percent; - if (h->filtered) + if (h->filtered) { + /* let it move to sibling */ + h->unfolded = false; continue; + } percent = hist_entry__get_percent_limit(h); if (percent < hb->min_pcnt) continue; - row += hist_browser__show_entry(hb, h, row); + if (symbol_conf.report_hierarchy) { + row += hist_browser__show_hierarchy_entry(hb, h, row, + h->depth, + nr_sort); + } else { + row += hist_browser__show_entry(hb, h, row); + } + if (row == browser->rows) break; } @@ -1364,7 +1527,14 @@ static struct rb_node *hists__filter_entries(struct rb_node *nd, if (!h->filtered && percent >= min_pcnt) return nd; - nd = rb_next(nd); + /* + * If it's filtered, its all children also were filtered. + * So move to sibling node. 
+ */ + if (rb_next(nd)) + nd = rb_next(nd); + else + nd = rb_hierarchy_next(nd); } return NULL; @@ -1380,7 +1550,7 @@ static struct rb_node *hists__filter_prev_entries(struct rb_node *nd, if (!h->filtered && percent >= min_pcnt) return nd; - nd = rb_prev(nd); + nd = rb_hierarchy_prev(nd); } return NULL; @@ -1410,8 +1580,8 @@ static void ui_browser__hists_seek(struct ui_browser *browser, nd = browser->top; goto do_offset; case SEEK_END: - nd = hists__filter_prev_entries(rb_last(browser->entries), - hb->min_pcnt); + nd = rb_hierarchy_last(rb_last(browser->entries)); + nd = hists__filter_prev_entries(nd, hb->min_pcnt); first = false; break; default: @@ -1445,7 +1615,7 @@ do_offset: if (offset > 0) { do { h = rb_entry(nd, struct hist_entry, rb_node); - if (h->unfolded) { + if (h->unfolded && h->leaf) { u16 remaining = h->nr_rows - h->row_offset; if (offset > remaining) { offset -= remaining; @@ -1457,7 +1627,8 @@ do_offset: break; } } - nd = hists__filter_entries(rb_next(nd), hb->min_pcnt); + nd = hists__filter_entries(rb_hierarchy_next(nd), + hb->min_pcnt); if (nd == NULL) break; --offset; @@ -1466,7 +1637,7 @@ do_offset: } else if (offset < 0) { while (1) { h = rb_entry(nd, struct hist_entry, rb_node); - if (h->unfolded) { + if (h->unfolded && h->leaf) { if (first) { if (-offset > h->row_offset) { offset += h->row_offset; @@ -1490,7 +1661,7 @@ do_offset: } } - nd = hists__filter_prev_entries(rb_prev(nd), + nd = hists__filter_prev_entries(rb_hierarchy_prev(nd), hb->min_pcnt); if (nd == NULL) break; @@ -1503,7 +1674,7 @@ do_offset: * row_offset at its last entry. */ h = rb_entry(nd, struct hist_entry, rb_node); - if (h->unfolded) + if (h->unfolded && h->leaf) h->row_offset = h->nr_rows; break; } @@ -1517,13 +1688,14 @@ do_offset: } static int hist_browser__fprintf_callchain(struct hist_browser *browser, - struct hist_entry *he, FILE *fp) + struct hist_entry *he, FILE *fp, + int level) { struct callchain_print_arg arg = { .fp = fp, }; - hist_browser__show_callchain(browser, he, 1, 0, + hist_browser__show_callchain(browser, he, level, 0, hist_browser__fprintf_callchain_entry, &arg, hist_browser__check_dump_full); return arg.printed; @@ -1566,7 +1738,65 @@ static int hist_browser__fprintf_entry(struct hist_browser *browser, printed += fprintf(fp, "%s\n", s); if (folded_sign == '-') - printed += hist_browser__fprintf_callchain(browser, he, fp); + printed += hist_browser__fprintf_callchain(browser, he, fp, 1); + + return printed; +} + + +static int hist_browser__fprintf_hierarchy_entry(struct hist_browser *browser, + struct hist_entry *he, + FILE *fp, int level, + int nr_sort_keys) +{ + char s[8192]; + int printed = 0; + char folded_sign = ' '; + struct perf_hpp hpp = { + .buf = s, + .size = sizeof(s), + }; + struct perf_hpp_fmt *fmt; + bool first = true; + int ret; + int hierarchy_indent = (nr_sort_keys + 1) * HIERARCHY_INDENT; + + printed = fprintf(fp, "%*s", level * HIERARCHY_INDENT, ""); + + folded_sign = hist_entry__folded(he); + printed += fprintf(fp, "%c", folded_sign); + + hists__for_each_format(he->hists, fmt) { + if (perf_hpp__should_skip(fmt, he->hists)) + continue; + + if (perf_hpp__is_sort_entry(fmt) || + perf_hpp__is_dynamic_entry(fmt)) + break; + + if (!first) { + ret = scnprintf(hpp.buf, hpp.size, " "); + advance_hpp(&hpp, ret); + } else + first = false; + + ret = fmt->entry(fmt, &hpp, he); + advance_hpp(&hpp, ret); + } + + ret = scnprintf(hpp.buf, hpp.size, "%*s", hierarchy_indent, ""); + advance_hpp(&hpp, ret); + + fmt = he->fmt; + ret = fmt->entry(fmt, &hpp, he); + 
advance_hpp(&hpp, ret); + + printed += fprintf(fp, "%s\n", rtrim(s)); + + if (he->leaf && folded_sign == '-') { + printed += hist_browser__fprintf_callchain(browser, he, fp, + he->depth + 1); + } return printed; } @@ -1576,12 +1806,22 @@ static int hist_browser__fprintf(struct hist_browser *browser, FILE *fp) struct rb_node *nd = hists__filter_entries(rb_first(browser->b.entries), browser->min_pcnt); int printed = 0; + int nr_sort = browser->hists->hpp_list->nr_sort_keys; while (nd) { struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); - printed += hist_browser__fprintf_entry(browser, h, fp); - nd = hists__filter_entries(rb_next(nd), browser->min_pcnt); + if (symbol_conf.report_hierarchy) { + printed += hist_browser__fprintf_hierarchy_entry(browser, + h, fp, + h->depth, + nr_sort); + } else { + printed += hist_browser__fprintf_entry(browser, h, fp); + } + + nd = hists__filter_entries(rb_hierarchy_next(nd), + browser->min_pcnt); } return printed; @@ -2149,12 +2389,12 @@ static void hist_browser__update_percent_limit(struct hist_browser *hb, hb->min_pcnt = callchain_param.min_percent = percent; - if (!symbol_conf.use_callchain) - return; - while ((nd = hists__filter_entries(nd, hb->min_pcnt)) != NULL) { he = rb_entry(nd, struct hist_entry, rb_node); + if (!he->leaf || !symbol_conf.use_callchain) + goto next; + if (callchain_param.mode == CHAIN_GRAPH_REL) { total = he->stat.period; @@ -2167,11 +2407,17 @@ static void hist_browser__update_percent_limit(struct hist_browser *hb, callchain_param.sort(&he->sorted_chain, he->callchain, min_callchain_hits, &callchain_param); +next: + /* + * Tentatively set unfolded so that the rb_hierarchy_next() + * can toggle children of folded entries too. + */ + he->unfolded = he->has_children; + nd = rb_hierarchy_next(nd); + /* force to re-evaluate folding state of callchains */ he->init_have_children = false; hist_entry__set_folding(he, hb, false); - - nd = rb_next(nd); } } -- cgit v0.10.2 From d8b92400d3ba6bb9a310c42b7518a81eb90f83be Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 25 Feb 2016 00:13:46 +0900 Subject: perf hists browser: Align column header in hierarchy mode Like in stdio, fit column header to hierarchy output. Merge column headers with "/" as a separator. Overhead Command / Shared Object / Symbol ... 
+ 0.09% dwm + 0.06% emacs - 0.05% perf - 0.05% [kernel.vmlinux] + 0.03% [k] memcpy_orig + 0.01% [k] unmap_single_vma + 0.01% [k] smp_call_function_single + 0.00% [k] native_irq_return_iret + 0.00% [k] arch_trigger_all_cpu_backtrace_handler + 0.00% [k] native_write_msr_safe Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456326830-30456-15-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 2bccf68..6bcd767 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1451,11 +1451,80 @@ static int hists_browser__scnprintf_headers(struct hist_browser *browser, char * return ret; } +static int hists_browser__scnprintf_hierarchy_headers(struct hist_browser *browser, char *buf, size_t size) +{ + struct hists *hists = browser->hists; + struct perf_hpp dummy_hpp = { + .buf = buf, + .size = size, + }; + struct perf_hpp_fmt *fmt; + size_t ret = 0; + int column = 0; + int nr_sort_keys = hists->hpp_list->nr_sort_keys; + bool first = true; + + ret = scnprintf(buf, size, " "); + if (advance_hpp_check(&dummy_hpp, ret)) + return ret; + + hists__for_each_format(hists, fmt) { + if (column++ < browser->b.horiz_scroll) + continue; + + if (perf_hpp__is_sort_entry(fmt) || perf_hpp__is_dynamic_entry(fmt)) + break; + + ret = fmt->header(fmt, &dummy_hpp, hists_to_evsel(hists)); + if (advance_hpp_check(&dummy_hpp, ret)) + break; + + ret = scnprintf(dummy_hpp.buf, dummy_hpp.size, " "); + if (advance_hpp_check(&dummy_hpp, ret)) + break; + } + + ret = scnprintf(dummy_hpp.buf, dummy_hpp.size, "%*s", + (nr_sort_keys - 1) * HIERARCHY_INDENT, ""); + if (advance_hpp_check(&dummy_hpp, ret)) + return ret; + + hists__for_each_format(hists, fmt) { + if (!perf_hpp__is_sort_entry(fmt) && !perf_hpp__is_dynamic_entry(fmt)) + continue; + if (perf_hpp__should_skip(fmt, hists)) + continue; + + if (first) { + first = false; + } else { + ret = scnprintf(dummy_hpp.buf, dummy_hpp.size, " / "); + if (advance_hpp_check(&dummy_hpp, ret)) + break; + } + + ret = fmt->header(fmt, &dummy_hpp, hists_to_evsel(hists)); + dummy_hpp.buf[ret] = '\0'; + rtrim(dummy_hpp.buf); + + ret = strlen(dummy_hpp.buf); + if (advance_hpp_check(&dummy_hpp, ret)) + break; + } + + return ret; +} + static void hist_browser__show_headers(struct hist_browser *browser) { char headers[1024]; - hists_browser__scnprintf_headers(browser, headers, sizeof(headers)); + if (symbol_conf.report_hierarchy) + hists_browser__scnprintf_hierarchy_headers(browser, headers, + sizeof(headers)); + else + hists_browser__scnprintf_headers(browser, headers, + sizeof(headers)); ui_browser__gotorc(&browser->b, 0, 0); ui_browser__set_color(&browser->b, HE_COLORSET_ROOT); ui_browser__write_nstring(&browser->b, headers, browser->b.width + 1); -- cgit v0.10.2 From e311ec1e5df6d5de377e67f704bca43fb023ca5e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 25 Feb 2016 00:13:47 +0900 Subject: perf ui/gtk: Implement hierarchy output mode The hierarchy output mode is to group entries for each level so that user can see higher level picture more easily. 
Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456326830-30456-16-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c index 32cc38a..7f343339 100644 --- a/tools/perf/ui/gtk/hists.c +++ b/tools/perf/ui/gtk/hists.c @@ -396,6 +396,164 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists, gtk_container_add(GTK_CONTAINER(window), view); } +static void perf_gtk__add_hierarchy_entries(struct hists *hists, + struct rb_root *root, + GtkTreeStore *store, + GtkTreeIter *parent, + struct perf_hpp *hpp, + float min_pcnt) +{ + int col_idx = 0; + struct rb_node *node; + struct hist_entry *he; + struct perf_hpp_fmt *fmt; + u64 total = hists__total_period(hists); + + for (node = rb_first(root); node; node = rb_next(node)) { + GtkTreeIter iter; + float percent; + + he = rb_entry(node, struct hist_entry, rb_node); + if (he->filtered) + continue; + + percent = hist_entry__get_percent_limit(he); + if (percent < min_pcnt) + continue; + + gtk_tree_store_append(store, &iter, parent); + + col_idx = 0; + hists__for_each_format(hists, fmt) { + if (perf_hpp__is_sort_entry(fmt) || + perf_hpp__is_dynamic_entry(fmt)) + break; + + if (fmt->color) + fmt->color(fmt, hpp, he); + else + fmt->entry(fmt, hpp, he); + + gtk_tree_store_set(store, &iter, col_idx++, hpp->buf, -1); + } + + fmt = he->fmt; + if (fmt->color) + fmt->color(fmt, hpp, he); + else + fmt->entry(fmt, hpp, he); + + gtk_tree_store_set(store, &iter, col_idx, rtrim(hpp->buf), -1); + + if (!he->leaf) { + perf_gtk__add_hierarchy_entries(hists, &he->hroot_out, + store, &iter, hpp, + min_pcnt); + } + + if (symbol_conf.use_callchain && he->leaf) { + if (callchain_param.mode == CHAIN_GRAPH_REL) + total = symbol_conf.cumulate_callchain ? 
+ he->stat_acc->period : he->stat.period; + + perf_gtk__add_callchain(&he->sorted_chain, store, &iter, + col_idx, total); + } + } + +} + +static void perf_gtk__show_hierarchy(GtkWidget *window, struct hists *hists, + float min_pcnt) +{ + struct perf_hpp_fmt *fmt; + GType col_types[MAX_COLUMNS]; + GtkCellRenderer *renderer; + GtkTreeStore *store; + GtkWidget *view; + int col_idx; + int nr_cols = 0; + char s[512]; + char buf[512]; + bool first = true; + struct perf_hpp hpp = { + .buf = s, + .size = sizeof(s), + }; + + hists__for_each_format(hists, fmt) { + if (perf_hpp__is_sort_entry(fmt) || + perf_hpp__is_dynamic_entry(fmt)) + break; + + col_types[nr_cols++] = G_TYPE_STRING; + } + col_types[nr_cols++] = G_TYPE_STRING; + + store = gtk_tree_store_newv(nr_cols, col_types); + view = gtk_tree_view_new(); + renderer = gtk_cell_renderer_text_new(); + + col_idx = 0; + hists__for_each_format(hists, fmt) { + if (perf_hpp__is_sort_entry(fmt) || + perf_hpp__is_dynamic_entry(fmt)) + break; + + gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view), + -1, fmt->name, + renderer, "markup", + col_idx++, NULL); + } + + /* construct merged column header since sort keys share single column */ + buf[0] = '\0'; + hists__for_each_format(hists ,fmt) { + if (!perf_hpp__is_sort_entry(fmt) && + !perf_hpp__is_dynamic_entry(fmt)) + continue; + if (perf_hpp__should_skip(fmt, hists)) + continue; + + if (first) + first = false; + else + strcat(buf, " / "); + + fmt->header(fmt, &hpp, hists_to_evsel(hists)); + strcat(buf, rtrim(hpp.buf)); + } + + gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view), + -1, buf, + renderer, "markup", + col_idx++, NULL); + + for (col_idx = 0; col_idx < nr_cols; col_idx++) { + GtkTreeViewColumn *column; + + column = gtk_tree_view_get_column(GTK_TREE_VIEW(view), col_idx); + gtk_tree_view_column_set_resizable(column, TRUE); + + if (col_idx == 0) { + gtk_tree_view_set_expander_column(GTK_TREE_VIEW(view), + column); + } + } + + gtk_tree_view_set_model(GTK_TREE_VIEW(view), GTK_TREE_MODEL(store)); + g_object_unref(GTK_TREE_MODEL(store)); + + perf_gtk__add_hierarchy_entries(hists, &hists->entries, store, + NULL, &hpp, min_pcnt); + + gtk_tree_view_set_rules_hint(GTK_TREE_VIEW(view), TRUE); + + g_signal_connect(view, "row-activated", + G_CALLBACK(on_row_activated), NULL); + gtk_container_add(GTK_CONTAINER(window), view); +} + int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist, const char *help, struct hist_browser_timer *hbt __maybe_unused, @@ -463,7 +621,10 @@ int perf_evlist__gtk_browse_hists(struct perf_evlist *evlist, GTK_POLICY_AUTOMATIC, GTK_POLICY_AUTOMATIC); - perf_gtk__show_hists(scrolled_window, hists, min_pcnt); + if (symbol_conf.report_hierarchy) + perf_gtk__show_hierarchy(scrolled_window, hists, min_pcnt); + else + perf_gtk__show_hists(scrolled_window, hists, min_pcnt); tab_label = gtk_label_new(evname); -- cgit v0.10.2 From 4251446d778e669db5ac9f86b02d38064bdbbf9a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 25 Feb 2016 00:13:48 +0900 Subject: perf report: Add --hierarchy option The --hierarchy option is to show output in hierarchy mode. It extends folding/unfolding in the TUI and GTK browsers to support sort items as well as callchains. Users can toggle the items to see the performance result at wanted level. 
$ perf report --hierarchy --tui Overhead Command / Shared Object / Symbol -------------------------------------------------- + 32.96% gnome-shell - 15.11% swapper - 14.97% [kernel.vmlinux] 6.82% [k] intel_idle 0.66% [k] menu_select 0.43% [k] __hrtimer_start_range_ns ... Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456326830-30456-17-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 89cab84..1211399 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -401,6 +401,9 @@ include::itrace.txt[] --raw-trace:: When displaying traceevent output, do not use print fmt or plugins. +--hierarchy:: + Enable hierarchical output. + include::callchain-overhead-calculation.txt[] SEE ALSO diff --git a/tools/perf/Documentation/tips.txt b/tools/perf/Documentation/tips.txt index e0ce957..5950b5a 100644 --- a/tools/perf/Documentation/tips.txt +++ b/tools/perf/Documentation/tips.txt @@ -27,3 +27,4 @@ Skip collecing build-id when recording: perf record -B To change sampling frequency to 100 Hz: perf record -F 100 See assembly instructions with percentage: perf annotate If you prefer Intel style assembly, try: perf annotate -M intel +For hierarchical output, try: perf report --hierarchy diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 760e886..f4d8244 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -811,6 +811,8 @@ int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused) "only show processor socket that match with this filter"), OPT_BOOLEAN(0, "raw-trace", &symbol_conf.raw_trace, "Show raw trace event output (do not use print fmt or plugins)"), + OPT_BOOLEAN(0, "hierarchy", &symbol_conf.report_hierarchy, + "Show entries in a hierarchy"), OPT_END() }; struct perf_data_file file = { @@ -920,6 +922,21 @@ repeat: symbol_conf.cumulate_callchain = false; } + if (symbol_conf.report_hierarchy) { + /* disable incompatible options */ + symbol_conf.event_group = false; + symbol_conf.cumulate_callchain = false; + + if (field_order) { + pr_err("Error: --hierarchy and --fields options cannot be used together\n"); + parse_options_usage(report_usage, options, "F", 1); + parse_options_usage(NULL, options, "hierarchy", 0); + goto error; + } + + sort__need_collapse = true; + } + /* Force tty output for header output and per-thread stat. */ if (report.header || report.header_only || report.show_threads) use_browser = 0; -- cgit v0.10.2 From 5d8200ae67724960f7761b3a2216a1ca651fcc65 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 25 Feb 2016 00:13:49 +0900 Subject: perf hists: Support decaying in hierarchy mode In the hierarchy mode, hist entries should decay their children too. Also update hists__delete_entry() to be able to free child entries. 
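In other words the decay becomes a depth-first walk: decay an entry, recurse into its children, and unlink/free whatever has decayed to zero. A toy userspace model of that control flow follows; it uses simplified linked structures instead of rbtrees, and the 7/8 decay factor is an arbitrary choice for the sketch, not taken from the patch below.

#include <stdio.h>
#include <stdlib.h>

struct entry {
	const char *name;
	unsigned long long period;
	struct entry *child;	/* first child */
	struct entry *sibling;	/* next entry on the same level */
};

/*
 * Decay one entry and, depth first, all of its children.
 * Returns 1 when the entry has fully decayed and was freed.
 */
static int decay_entry(struct entry **slot)
{
	struct entry *he = *slot;
	struct entry **childp = &he->child;

	he->period = he->period * 7 / 8;	/* arbitrary decay factor */

	while (*childp) {
		if (!decay_entry(childp))
			childp = &(*childp)->sibling;
	}

	if (he->period == 0 && !he->child) {
		*slot = he->sibling;	/* unlink, like hists__delete_entry() */
		free(he);
		return 1;
	}
	return 0;
}

int main(void)
{
	struct entry *leaf = calloc(1, sizeof(*leaf));
	struct entry *root = calloc(1, sizeof(*root));
	int rounds = 0;

	leaf->name = "child";  leaf->period = 100;
	root->name = "parent"; root->period = 400; root->child = leaf;

	while (root) {
		decay_entry(&root);
		rounds++;
	}
	printf("tree fully decayed after %d rounds\n", rounds);
	return 0;
}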
Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456326830-30456-18-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index a44bf5a..1c53042 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -248,6 +248,8 @@ static void he_stat__decay(struct he_stat *he_stat) /* XXX need decay for weight too? */ } +static void hists__delete_entry(struct hists *hists, struct hist_entry *he); + static bool hists__decay_entry(struct hists *hists, struct hist_entry *he) { u64 prev_period = he->stat.period; @@ -263,21 +265,45 @@ static bool hists__decay_entry(struct hists *hists, struct hist_entry *he) diff = prev_period - he->stat.period; - hists->stats.total_period -= diff; - if (!he->filtered) - hists->stats.total_non_filtered_period -= diff; + if (!he->depth) { + hists->stats.total_period -= diff; + if (!he->filtered) + hists->stats.total_non_filtered_period -= diff; + } + + if (!he->leaf) { + struct hist_entry *child; + struct rb_node *node = rb_first(&he->hroot_out); + while (node) { + child = rb_entry(node, struct hist_entry, rb_node); + node = rb_next(node); + + if (hists__decay_entry(hists, child)) + hists__delete_entry(hists, child); + } + } return he->stat.period == 0; } static void hists__delete_entry(struct hists *hists, struct hist_entry *he) { - rb_erase(&he->rb_node, &hists->entries); + struct rb_root *root_in; + struct rb_root *root_out; - if (sort__need_collapse) - rb_erase(&he->rb_node_in, &hists->entries_collapsed); - else - rb_erase(&he->rb_node_in, hists->entries_in); + if (he->parent_he) { + root_in = &he->parent_he->hroot_in; + root_out = &he->parent_he->hroot_out; + } else { + if (sort__need_collapse) + root_in = &hists->entries_collapsed; + else + root_in = hists->entries_in; + root_out = &hists->entries; + } + + rb_erase(&he->rb_node_in, root_in); + rb_erase(&he->rb_node, root_out); --hists->nr_entries; if (!he->filtered) -- cgit v0.10.2 From c92fcfde3486fb4b9e476ee5ad5995a62e401cce Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 25 Feb 2016 00:13:50 +0900 Subject: perf top: Add --hierarchy option Support hierarchy output for perf-top using --hierarchy option. Signed-off-by: Namhyung Kim Acked-by: Pekka Enberg Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456326830-30456-19-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index b0e60e1..19f046f 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -233,6 +233,9 @@ Default is to monitor all CPUS. --raw-trace:: When displaying traceevent output, do not use print fmt or plugins. +--hierarchy:: + Enable hierarchy output. 
+ INTERACTIVE PROMPTING KEYS -------------------------- diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index a75de39..b86b623 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1214,6 +1214,8 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) parse_branch_stack), OPT_BOOLEAN(0, "raw-trace", &symbol_conf.raw_trace, "Show raw trace event output (do not use print fmt or plugins)"), + OPT_BOOLEAN(0, "hierarchy", &symbol_conf.report_hierarchy, + "Show entries in a hierarchy"), OPT_END() }; const char * const top_usage[] = { @@ -1241,6 +1243,19 @@ int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused) goto out_delete_evlist; } + if (symbol_conf.report_hierarchy) { + /* disable incompatible options */ + symbol_conf.event_group = false; + symbol_conf.cumulate_callchain = false; + + if (field_order) { + pr_err("Error: --hierarchy and --fields options cannot be used together\n"); + parse_options_usage(top_usage, options, "fields", 0); + parse_options_usage(NULL, options, "hierarchy", 0); + goto out_delete_evlist; + } + } + sort__mode = SORT_MODE__TOP; /* display thread wants entries to be collapsed in a different tree */ sort__need_collapse = 1; -- cgit v0.10.2 From d144dfea8af7108f613139623e63952ed7e69c0c Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Wed, 24 Feb 2016 11:05:25 +0800 Subject: usb: chipidea: otg: change workqueue ci_otg as freezable If we use USB ID pin as wakeup source, and there is a USB block device on this USB OTG (ID) cable, the system will be deadlock after system resume. The root cause for this problem is: the workqueue ci_otg may try to remove hcd before the driver resume has finished, and hcd will disconnect the device on it, then, it will call device_release_driver, and holds the device lock "dev->mutex", but it is never unlocked since it waits workqueue writeback to run to flush the block information, but the workqueue writeback is freezable, it is not thawed before driver resume has finished. When the driver (device: sd 0:0:0:0:) resume goes to dpm_complete, it tries to get its device lock "dev->mutex", but it can't get it forever, then the deadlock occurs. Below call stacks show the situation. So, in order to fix this problem, we need to change workqueue ci_otg as freezable, then the work item in this workqueue will be run after driver's resume, this workqueue will not be blocked forever like above case since the workqueue writeback has been thawed too. Tested at: i.mx6qdl-sabresd and i.mx6sx-sdb. 
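For reference, a minimal sketch of the pattern the fix relies on: work queued on a freezable workqueue is parked while the system is frozen and only runs once tasks are thawed, i.e. after driver resume has completed. All demo_* names are invented; create_freezable_workqueue() is the call the patch below switches ci_otg to. The original call traces from the report follow after the sketch.

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;
static struct work_struct demo_work;

static void demo_work_fn(struct work_struct *work)
{
	pr_info("demo: runs only after any system resume has completed\n");
}

static int __init demo_init(void)
{
	/* freezable: pending items do not run while tasks are frozen */
	demo_wq = create_freezable_workqueue("demo_wq");
	if (!demo_wq)
		return -ENOMEM;

	INIT_WORK(&demo_work, demo_work_fn);
	queue_work(demo_wq, &demo_work);
	return 0;
}

static void __exit demo_exit(void)
{
	flush_workqueue(demo_wq);
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");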
[ 555.178869] kworker/u2:13 D c07de74c 0 826 2 0x00000000 [ 555.185310] Workqueue: ci_otg ci_otg_work [ 555.189353] Backtrace: [ 555.191849] [] (__schedule) from [] (schedule+0x48/0xa0) [ 555.198912] r10:ee471ba0 r9:00000000 r8:00000000 r7:00000002 r6:ee470000 r5:ee471ba4 [ 555.206867] r4:ee470000 [ 555.209453] [] (schedule) from [] (schedule_timeout+0x15c/0x1e0) [ 555.217212] r4:7fffffff r3:edc2b000 [ 555.220862] [] (schedule_timeout) from [] (wait_for_common+0x94/0x144) [ 555.229140] r8:00000000 r7:00000002 r6:ee470000 r5:ee471ba4 r4:7fffffff [ 555.235980] [] (wait_for_common) from [] (wait_for_completion+0x18/0x1c) [ 555.244430] r10:00000001 r9:c0b5563c r8:c0042e48 r7:ef086000 r6:eea4372c r5:ef131b00 [ 555.252383] r4:00000000 [ 555.254970] [] (wait_for_completion) from [] (flush_work+0x19c/0x234) [ 555.263177] [] (flush_work) from [] (flush_delayed_work+0x48/0x4c) [ 555.271106] r8:ed5b5000 r7:c0b38a3c r6:eea439cc r5:eea4372c r4:eea4372c [ 555.277958] [] (flush_delayed_work) from [] (bdi_unregister+0x84/0xec) [ 555.286236] r4:eea43520 r3:20000153 [ 555.289885] [] (bdi_unregister) from [] (blk_cleanup_queue+0x180/0x29c) [ 555.298250] r5:eea43808 r4:eea43400 [ 555.301909] [] (blk_cleanup_queue) from [] (__scsi_remove_device+0x48/0xb8) [ 555.310623] r7:00000000 r6:20000153 r5:ededa950 r4:ededa800 [ 555.316403] [] (__scsi_remove_device) from [] (scsi_forget_host+0x64/0x68) [ 555.325028] r5:ededa800 r4:ed5b5000 [ 555.328689] [] (scsi_forget_host) from [] (scsi_remove_host+0x78/0x104) [ 555.337054] r5:ed5b5068 r4:ed5b5000 [ 555.340709] [] (scsi_remove_host) from [] (usb_stor_disconnect+0x50/0xb4) [ 555.349247] r6:ed5b56e4 r5:ed5b5818 r4:ed5b5690 r3:00000008 [ 555.355025] [] (usb_stor_disconnect) from [] (usb_unbind_interface+0x78/0x25c) [ 555.363997] r8:c13919b4 r7:edd3c000 r6:edd3c020 r5:ee551c68 r4:ee551c00 r3:c04cdf7c [ 555.371892] [] (usb_unbind_interface) from [] (__device_release_driver+0x8c/0x118) [ 555.381213] r10:00000001 r9:edd90c00 r8:c13919b4 r7:ee551c68 r6:c0b546e0 r5:c0b5563c [ 555.389167] r4:edd3c020 [ 555.391752] [] (__device_release_driver) from [] (device_release_driver+0x28/0x34) [ 555.401071] r5:edd3c020 r4:edd3c054 [ 555.404721] [] (device_release_driver) from [] (bus_remove_device+0xe0/0x110) [ 555.413607] r5:edd3c020 r4:ef17f04c [ 555.417253] [] (bus_remove_device) from [] (device_del+0x114/0x21c) [ 555.425270] r6:edd3c028 r5:edd3c020 r4:ee551c00 r3:00000000 [ 555.431045] [] (device_del) from [] (usb_disable_device+0xa4/0x1e8) [ 555.439061] r8:edd3c000 r7:eded8000 r6:00000000 r5:00000001 r4:ee551c00 [ 555.445906] [] (usb_disable_device) from [] (usb_disconnect+0x74/0x224) [ 555.454271] r9:edd90c00 r8:ee551000 r7:ee551c68 r6:ee551c9c r5:ee551c00 r4:00000001 [ 555.462156] [] (usb_disconnect) from [] (usb_disconnect+0x1d8/0x224) [ 555.470259] r10:00000001 r9:edd90000 r8:ee471e2c r7:ee551468 r6:ee55149c r5:ee551400 [ 555.478213] r4:00000001 [ 555.480797] [] (usb_disconnect) from [] (usb_remove_hcd+0xa0/0x1ac) [ 555.488813] r10:00000001 r9:ee471eb0 r8:00000000 r7:ef3d9500 r6:eded810c r5:eded80b0 [ 555.496765] r4:eded8000 [ 555.499351] [] (usb_remove_hcd) from [] (host_stop+0x28/0x64) [ 555.506847] r6:eeb50010 r5:eded8000 r4:eeb51010 [ 555.511563] [] (host_stop) from [] (ci_otg_work+0xc4/0x124) [ 555.518885] r6:00000001 r5:eeb50010 r4:eeb502a0 r3:c04d4130 [ 555.524665] [] (ci_otg_work) from [] (process_one_work+0x194/0x420) [ 555.532682] r6:ef086000 r5:eeb502a0 r4:edc44480 [ 555.537393] [] (process_one_work) from [] (worker_thread+0x34/0x514) [ 555.545496] r10:edc44480 
r9:ef086000 r8:c0b1a100 r7:ef086034 r6:00000088 r5:edc44498 [ 555.553450] r4:ef086000 [ 555.556032] [] (worker_thread) from [] (kthread+0xdc/0xf8) [ 555.563268] r10:00000000 r9:00000000 r8:00000000 r7:c004577c r6:edc44480 r5:eddc15c0 [ 555.571221] r4:00000000 [ 555.573804] [] (kthread) from [] (ret_from_fork+0x14/0x24) [ 555.581040] r7:00000000 r6:00000000 r5:c004b9d8 r4:eddc15c0 [ 553.429383] sh D c07de74c 0 694 691 0x00000000 [ 553.435801] Backtrace: [ 553.438295] [] (__schedule) from [] (schedule+0x48/0xa0) [ 553.445358] r10:edd3c054 r9:edd3c078 r8:edddbd50 r7:edcbbc00 r6:c1377c34 r5:60000153 [ 553.453313] r4:eddda000 [ 553.455896] [] (schedule) from [] (schedule_preempt_disabled+0x10/0x14) [ 553.464261] r4:edd3c058 r3:0000000a [ 553.467910] [] (schedule_preempt_disabled) from [] (mutex_lock_nested+0x1a0/0x3e8) [ 553.477254] [] (mutex_lock_nested) from [] (dpm_complete+0xc0/0x1b0) [ 553.485358] r10:00561408 r9:edd3c054 r8:c0b4863c r7:edddbd90 r6:c0b485d8 r5:edd3c020 [ 553.493313] r4:edd3c0d0 [ 553.495896] [] (dpm_complete) from [] (dpm_resume_end+0x1c/0x20) [ 553.503652] r9:00000000 r8:c0b1a9d0 r7:c1334ec0 r6:c1334edc r5:00000003 r4:00000010 [ 553.511544] [] (dpm_resume_end) from [] (suspend_devices_and_enter+0x158/0x504) [ 553.520604] r4:00000000 r3:c1334efc [ 553.524250] [] (suspend_devices_and_enter) from [] (pm_suspend+0x234/0x2cc) [ 553.532961] r10:00561408 r9:ed6b7300 r8:00000004 r7:c1334eec r6:00000000 r5:c1334ee8 [ 553.540914] r4:00000003 [ 553.543493] [] (pm_suspend) from [] (state_store+0x6c/0xc0) [ 555.703684] 7 locks held by kworker/u2:13/826: [ 555.708140] #0: ("%s""ci_otg"){++++.+}, at: [] process_one_work+0x128/0x420 [ 555.716277] #1: ((&ci->work)){+.+.+.}, at: [] process_one_work+0x128/0x420 [ 555.724317] #2: (usb_bus_list_lock){+.+.+.}, at: [] usb_remove_hcd+0x98/0x1ac [ 555.732626] #3: (&dev->mutex){......}, at: [] usb_disconnect+0x48/0x224 [ 555.740403] #4: (&dev->mutex){......}, at: [] usb_disconnect+0x48/0x224 [ 555.748179] #5: (&dev->mutex){......}, at: [] device_release_driver+0x20/0x34 [ 555.756487] #6: (&shost->scan_mutex){+.+.+.}, at: [] scsi_remove_host+0x20/0x104 Cc: #v3.14+ Cc: Jun Li Signed-off-by: Peter Chen diff --git a/drivers/usb/chipidea/otg.c b/drivers/usb/chipidea/otg.c index 45f86da..03b6743 100644 --- a/drivers/usb/chipidea/otg.c +++ b/drivers/usb/chipidea/otg.c @@ -158,7 +158,7 @@ static void ci_otg_work(struct work_struct *work) int ci_hdrc_otg_init(struct ci_hdrc *ci) { INIT_WORK(&ci->work, ci_otg_work); - ci->wq = create_singlethread_workqueue("ci_otg"); + ci->wq = create_freezable_workqueue("ci_otg"); if (!ci->wq) { dev_err(ci->dev, "can't create workqueue\n"); return -ENODEV; -- cgit v0.10.2 From ee50c130c82175eaa0820c96b6d3763928af2241 Mon Sep 17 00:00:00 2001 From: Diego Viola Date: Tue, 23 Feb 2016 12:04:04 -0300 Subject: net: jme: fix suspend/resume on JMC260 The JMC260 network card fails to suspend/resume because the call to jme_start_irq() was too early, moving the call to jme_start_irq() after the call to jme_reset_link() makes it work. Prior this change suspend/resume would fail unless /sys/power/pm_async=0 was explicitly specified. Relevant bug report: https://bugzilla.kernel.org/show_bug.cgi?id=112351 Signed-off-by: Diego Viola Signed-off-by: David S. 
Miller diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c index b1de7af..8adbe8f 100644 --- a/drivers/net/ethernet/jme.c +++ b/drivers/net/ethernet/jme.c @@ -3312,13 +3312,14 @@ jme_resume(struct device *dev) jme_reset_phy_processor(jme); jme_phy_calibration(jme); jme_phy_setEA(jme); - jme_start_irq(jme); netif_device_attach(netdev); atomic_inc(&jme->link_changing); jme_reset_link(jme); + jme_start_irq(jme); + return 0; } -- cgit v0.10.2 From 405e1133d00e0271cedef75c17ecb773ff3e2732 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 10 Feb 2016 02:03:00 -0700 Subject: x86/mm: Avoid premature success when changing page attributes set_memory_nx() (and set_memory_x()) currently differ in behavior from all other set_memory_*() functions when encountering a virtual address space hole within the kernel address range: They stop processing at the hole, but nevertheless report success (making the caller believe the operation was carried out on the entire range). While observed to be a problem - triggering the CONFIG_DEBUG_WX warning - only with out of tree code, I suspect (but didn't check) that on x86-64 the CONFIG_DEBUG_PAGEALLOC logic in free_init_pages() would, when called from free_initmem(), have the same effect on the set_memory_nx() called from mark_rodata_ro(). This unexpected behavior is a result of change_page_attr_set_clr() special casing changes to only the NX bit, in that it passes "false" as the "checkalias" argument to __change_page_attr_set_clr(). Since this flag becomes the "primary" argument of both __change_page_attr() and __cpa_process_fault(), the latter would so far return success without adjusting cpa->numpages. Success to the higher level callers, however, means that whatever cpa->numpages currently holds is the count of successfully processed pages. The cases when __change_page_attr() calls __cpa_process_fault(), otoh, don't generally mean the entire range got processed (as can be seen from one of the two success return paths in __cpa_process_fault() already adjusting ->numpages). Signed-off-by: Jan Beulich Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/56BB0AD402000078000D05BF@prv-mh.provo.novell.com Signed-off-by: Thomas Gleixner diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2440814..3dd6afd 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1122,8 +1122,10 @@ static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, /* * Ignore all non primary paths. */ - if (!primary) + if (!primary) { + cpa->numpages = 1; return 0; + } /* * Ignore the NULL PTE for kernel identity mapping, as it is expected -- cgit v0.10.2 From 0abefbaab4edbcec637e00fefcdeccb52797fe4f Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:12 +0000 Subject: genirq: Add new IPI irqdomain flags These flags will be used to identify an IPI domain. We have two flavours of IPI implementations: IRQ_DOMAIN_FLAG_IPI_PER_CPU: Each CPU has its own virq and hwirq IRQ_DOMAIN_FLAG_IPI_SINGLE : A single virq and hwirq for all CPUs Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-2-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 04579d9..9bb0a9c 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -172,6 +172,12 @@ enum { /* Core calls alloc/free recursive through the domain hierarchy. 
*/ IRQ_DOMAIN_FLAG_AUTO_RECURSIVE = (1 << 1), + /* Irq domain is an IPI domain with virq per cpu */ + IRQ_DOMAIN_FLAG_IPI_PER_CPU = (1 << 2), + + /* Irq domain is an IPI domain with single virq */ + IRQ_DOMAIN_FLAG_IPI_SINGLE = (1 << 3), + /* * Flags starting from IRQ_DOMAIN_FLAG_NONCORE are reserved * for implementation specific purposes and ignored by the @@ -400,6 +406,22 @@ static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) { return domain->flags & IRQ_DOMAIN_FLAG_HIERARCHY; } + +static inline bool irq_domain_is_ipi(struct irq_domain *domain) +{ + return domain->flags & + (IRQ_DOMAIN_FLAG_IPI_PER_CPU | IRQ_DOMAIN_FLAG_IPI_SINGLE); +} + +static inline bool irq_domain_is_ipi_per_cpu(struct irq_domain *domain) +{ + return domain->flags & IRQ_DOMAIN_FLAG_IPI_PER_CPU; +} + +static inline bool irq_domain_is_ipi_single(struct irq_domain *domain) +{ + return domain->flags & IRQ_DOMAIN_FLAG_IPI_SINGLE; +} #else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ static inline void irq_domain_activate_irq(struct irq_data *data) { } static inline void irq_domain_deactivate_irq(struct irq_data *data) { } @@ -413,6 +435,21 @@ static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) { return false; } + +static inline bool irq_domain_is_ipi(struct irq_domain *domain) +{ + return false; +} + +static inline bool irq_domain_is_ipi_per_cpu(struct irq_domain *domain) +{ + return false; +} + +static inline bool irq_domain_is_ipi_single(struct irq_domain *domain) +{ + return false; +} #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ #else /* CONFIG_IRQ_DOMAIN */ -- cgit v0.10.2 From 29d5c8db26ad54592436508819ac617119306f96 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:13 +0000 Subject: genirq: Add DOMAIN_BUS_IPI We need a way to search and match IPI domains. Using the new enum we can use irq_find_matching_host() to do that. Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-3-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 9bb0a9c..130e1c3 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -74,6 +74,7 @@ enum irq_domain_bus_token { DOMAIN_BUS_PCI_MSI, DOMAIN_BUS_PLATFORM_MSI, DOMAIN_BUS_NEXUS, + DOMAIN_BUS_IPI, }; /** -- cgit v0.10.2 From 379b656446a36c7a80b27cbdc34eec8e57b5f290 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:14 +0000 Subject: genirq: Add GENERIC_IRQ_IPI Kconfig symbol Select this to enable the generic IPI domain support Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-4-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 3b48dab..3bbfd6a 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -64,6 +64,10 @@ config IRQ_DOMAIN_HIERARCHY bool select IRQ_DOMAIN +# Generic IRQ IPI support +config GENERIC_IRQ_IPI + bool + # Generic MSI interrupt support config GENERIC_MSI_IRQ bool -- cgit v0.10.2 From 955bfe5912e7839abcc83694f06867535487404b Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:17 +0000 Subject: genirq: Add an extra comment about the use of affinity in irq_common_data Affinity will have dual meaning depends on the type of the irq. If it is a normal irq, it'll have the standard affinity meaning. If it is an IPI, it will hold the mask of the cpus to which an IPI can be sent. 
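Later patches in this series (ipi_get_hwirq() and the send helpers) consume the mask in exactly that second sense. A two-line sketch of the IPI-side interpretation; the helper name is invented:

#include <linux/irq.h>
#include <linux/cpumask.h>

/* Sketch only: for an IPI irq, ->affinity is "which CPUs can be targeted" */
static bool demo_cpu_is_ipi_target(struct irq_data *data, unsigned int cpu)
{
	const struct cpumask *ipimask = irq_data_get_affinity_mask(data);

	return ipimask && cpumask_test_cpu(cpu, ipimask);
}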
Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-7-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner diff --git a/include/linux/irq.h b/include/linux/irq.h index 3c1c967..0817afd 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -133,7 +133,9 @@ struct irq_domain; * Use accessor functions to deal with it * @node: node index useful for balancing * @handler_data: per-IRQ data for the irq_chip methods - * @affinity: IRQ affinity on SMP + * @affinity: IRQ affinity on SMP. If this is an IPI + * related irq, then this is the mask of the + * CPUs to which an IPI can be sent. * @msi_desc: MSI descriptor */ struct irq_common_data { -- cgit v0.10.2 From f256c9a0c54820ffef21b126f8226be2bece3dd7 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:16 +0000 Subject: genirq: Add ipi_offset to irq_common_data IPIs are always assumed to be consecutively allocated, hence virqs and hwirqs can be inferred by using CPU id as an offset. But the first cpu doesn't always have to start at offset 0. ipi_offset stores the position of the first cpu so that we can easily calculate the virq or hwirq of an IPI associated with a specific cpu. Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-6-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner diff --git a/include/linux/irq.h b/include/linux/irq.h index 0817afd..a32b47f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -137,6 +137,7 @@ struct irq_domain; * related irq, then this is the mask of the * CPUs to which an IPI can be sent. * @msi_desc: MSI descriptor + * @ipi_offset: Offset of first IPI target cpu in @affinity. Optional. */ struct irq_common_data { unsigned int state_use_accessors; @@ -146,6 +147,9 @@ struct irq_common_data { void *handler_data; struct msi_desc *msi_desc; cpumask_var_t affinity; +#ifdef CONFIG_GENERIC_IRQ_IPI + unsigned int ipi_offset; +#endif }; /** -- cgit v0.10.2 From ac0a0cd266d1f21236d5975ca6bced9b377a2a6a Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:18 +0000 Subject: genirq: Make irq_domain_alloc_descs() non static We will need to use this function to implement irq_reserve_ipi() later. 
So make it non static and move the prototype to irqdomain.h to allow using it outside irqdomain.c Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-8-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 130e1c3..c466cc1 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -213,6 +213,8 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, extern struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode, enum irq_domain_bus_token bus_token); extern void irq_set_default_host(struct irq_domain *host); +extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, + irq_hw_number_t hwirq, int node); static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) { diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3e56d2f0..8681154 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,8 +23,6 @@ static DEFINE_MUTEX(irq_domain_mutex); static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; -static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, - irq_hw_number_t hwirq, int node); static void irq_domain_check_hierarchy(struct irq_domain *domain); struct irqchip_fwid { @@ -840,8 +838,8 @@ const struct irq_domain_ops irq_domain_simple_ops = { }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); -static int irq_domain_alloc_descs(int virq, unsigned int cnt, - irq_hw_number_t hwirq, int node) +int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, + int node) { unsigned int hint; -- cgit v0.10.2 From d17bf24e695290d3fe7943aca52ab48098a10653 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:19 +0000 Subject: genirq: Add a new generic IPI reservation code to irq core Add a generic mechanism to dynamically allocate an IPI. Depending on the underlying implementation this creates either a single Linux irq or a consective range of Linux irqs. The Linux irq is used later to send IPIs to other CPUs. [ tglx: Massaged the code and removed the 'consecutive mask' restriction for the single IRQ case ] Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-9-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner diff --git a/include/linux/irq.h b/include/linux/irq.h index a32b47f..95f4f66 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -940,4 +940,7 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc, return readl(gc->reg_base + reg_offset); } +/* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */ +#define INVALID_HWIRQ (~0UL) + #endif /* _LINUX_IRQ_H */ diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index c466cc1..ed48594 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -344,6 +344,11 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type); +/* IPI functions */ +unsigned int irq_reserve_ipi(struct irq_domain *domain, + const struct cpumask *dest); +void irq_destroy_ipi(unsigned int irq); + /* V2 interfaces to support hierarchy IRQ domains. 
*/ extern struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, unsigned int virq); diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 2fc9cbd..2ee42e9 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -8,3 +8,4 @@ obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o obj-$(CONFIG_PM_SLEEP) += pm.o obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o +obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c new file mode 100644 index 0000000..340af27 --- /dev/null +++ b/kernel/irq/ipi.c @@ -0,0 +1,137 @@ +/* + * linux/kernel/irq/ipi.c + * + * Copyright (C) 2015 Imagination Technologies Ltd + * Author: Qais Yousef + * + * This file contains driver APIs to the IPI subsystem. + */ + +#define pr_fmt(fmt) "genirq/ipi: " fmt + +#include +#include + +/** + * irq_reserve_ipi() - Setup an IPI to destination cpumask + * @domain: IPI domain + * @dest: cpumask of cpus which can receive the IPI + * + * Allocate a virq that can be used to send IPI to any CPU in dest mask. + * + * On success it'll return linux irq number and 0 on failure + */ +unsigned int irq_reserve_ipi(struct irq_domain *domain, + const struct cpumask *dest) +{ + unsigned int nr_irqs, offset; + struct irq_data *data; + int virq, i; + + if (!domain ||!irq_domain_is_ipi(domain)) { + pr_warn("Reservation on a non IPI domain\n"); + return 0; + } + + if (!cpumask_subset(dest, cpu_possible_mask)) { + pr_warn("Reservation is not in possible_cpu_mask\n"); + return 0; + } + + nr_irqs = cpumask_weight(dest); + if (!nr_irqs) { + pr_warn("Reservation for empty destination mask\n"); + return 0; + } + + if (irq_domain_is_ipi_single(domain)) { + /* + * If the underlying implementation uses a single HW irq on + * all cpus then we only need a single Linux irq number for + * it. We have no restrictions vs. the destination mask. The + * underlying implementation can deal with holes nicely. + */ + nr_irqs = 1; + offset = 0; + } else { + unsigned int next; + + /* + * The IPI requires a seperate HW irq on each CPU. We require + * that the destination mask is consecutive. If an + * implementation needs to support holes, it can reserve + * several IPI ranges. + */ + offset = cpumask_first(dest); + /* + * Find a hole and if found look for another set bit after the + * hole. For now we don't support this scenario. + */ + next = cpumask_next_zero(offset, dest); + if (next < nr_cpu_ids) + next = cpumask_next(next, dest); + if (next < nr_cpu_ids) { + pr_warn("Destination mask has holes\n"); + return 0; + } + } + + virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE); + if (virq <= 0) { + pr_warn("Can't reserve IPI, failed to alloc descs\n"); + return 0; + } + + virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, + (void *) dest, true); + + if (virq <= 0) { + pr_warn("Can't reserve IPI, failed to alloc hw irqs\n"); + goto free_descs; + } + + for (i = 0; i < nr_irqs; i++) { + data = irq_get_irq_data(virq + i); + cpumask_copy(data->common->affinity, dest); + data->common->ipi_offset = offset; + } + return virq; + +free_descs: + irq_free_descs(virq, nr_irqs); + return 0; +} + +/** + * irq_destroy_ipi() - unreserve an IPI that was previously allocated + * @irq: linux irq number to be destroyed + * + * Return the IPIs allocated with irq_reserve_ipi() to the system destroying + * all virqs associated with them. + */ +void irq_destroy_ipi(unsigned int irq) +{ + struct irq_data *data = irq_get_irq_data(irq); + struct cpumask *ipimask = data ? 
irq_data_get_affinity_mask(data) : NULL; + struct irq_domain *domain; + unsigned int nr_irqs; + + if (!irq || !data || !ipimask) + return; + + domain = data->domain; + if (WARN_ON(domain == NULL)) + return; + + if (!irq_domain_is_ipi(domain)) { + pr_warn("Trying to destroy a non IPI domain!\n"); + return; + } + + if (irq_domain_is_ipi_per_cpu(domain)) + nr_irqs = cpumask_weight(ipimask); + else + nr_irqs = 1; + + irq_domain_free_irqs(irq, nr_irqs); +} -- cgit v0.10.2 From f9bce791ae2a1a10a965b30427f5507c1a77669f Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:20 +0000 Subject: genirq: Add a new function to get IPI reverse mapping When dealing with coprocessors we need to find out the actual hwirqs values to pass on to the firmware so that it knows what it needs to use to receive IPIs from and send IPIs to Linux cpus. [ tglx: Fixed the single hwirq IPI case. The hardware irq number does not change due to the cpu number ] Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-10-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner diff --git a/include/linux/irq.h b/include/linux/irq.h index 95f4f66..10273dc 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -942,5 +942,6 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc, /* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */ #define INVALID_HWIRQ (~0UL) +irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu); #endif /* _LINUX_IRQ_H */ diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 340af27..6f34f29 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -135,3 +135,37 @@ void irq_destroy_ipi(unsigned int irq) irq_domain_free_irqs(irq, nr_irqs); } + +/** + * ipi_get_hwirq - Get the hwirq associated with an IPI to a cpu + * @irq: linux irq number + * @cpu: the target cpu + * + * When dealing with coprocessors IPI, we need to inform the coprocessor of + * the hwirq it needs to use to receive and send IPIs. + * + * Returns hwirq value on success and INVALID_HWIRQ on failure. + */ +irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) +{ + struct irq_data *data = irq_get_irq_data(irq); + struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; + + if (!data || !ipimask || cpu > nr_cpu_ids) + return INVALID_HWIRQ; + + if (!cpumask_test_cpu(cpu, ipimask)) + return INVALID_HWIRQ; + + /* + * Get the real hardware irq number if the underlying implementation + * uses a seperate irq per cpu. If the underlying implementation uses + * a single hardware irq for all cpus then the IPI send mechanism + * needs to take care of this. + */ + if (irq_domain_is_ipi_per_cpu(data->domain)) + data = irq_get_irq_data(irq + cpu - data->common->ipi_offset); + + return data ? irqd_to_hwirq(data) : INVALID_HWIRQ; +} +EXPORT_SYMBOL_GPL(ipi_get_hwirq); -- cgit v0.10.2 From 34dc1ae101018dbb50e1d04e88aa89052802a7db Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:21 +0000 Subject: genirq: Add send_ipi callbacks to irq_chip Introduce the new callbacks which can be used by the core code to implement a generic IPI send mechanism. 
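A hedged sketch of how an irqchip driver might wire the two new hooks up; the register layout and every foo_* name are invented, only the callback signatures come from the patch below.

#include <linux/irq.h>
#include <linux/io.h>
#include <linux/bitops.h>
#include <linux/cpumask.h>

/* invented register block: one IPI "set" doorbell per CPU */
#define FOO_IPI_SET(cpu)	(0x100 + (cpu) * 4)

static void __iomem *foo_base;

static void foo_ipi_send_single(struct irq_data *d, unsigned int cpu)
{
	/* raise the hw irq behind this irq_data on one target CPU */
	writel(BIT(irqd_to_hwirq(d)), foo_base + FOO_IPI_SET(cpu));
}

static void foo_ipi_send_mask(struct irq_data *d, const struct cpumask *dest)
{
	unsigned int cpu;

	for_each_cpu(cpu, dest)
		foo_ipi_send_single(d, cpu);
}

static void foo_ipi_mask(struct irq_data *d)   { /* ... */ }
static void foo_ipi_unmask(struct irq_data *d) { /* ... */ }

static struct irq_chip foo_ipi_chip = {
	.name			= "FOO-IPI",
	.irq_mask		= foo_ipi_mask,
	.irq_unmask		= foo_ipi_unmask,
	.ipi_send_single	= foo_ipi_send_single,
	.ipi_send_mask		= foo_ipi_send_mask,
};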
Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-11-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner diff --git a/include/linux/irq.h b/include/linux/irq.h index 10273dc..3b3a5b8 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -347,6 +347,8 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @irq_get_irqchip_state: return the internal state of an interrupt * @irq_set_irqchip_state: set the internal state of a interrupt * @irq_set_vcpu_affinity: optional to target a vCPU in a virtual machine + * @ipi_send_single: send a single IPI to destination cpus + * @ipi_send_mask: send an IPI to destination cpus in cpumask * @flags: chip specific flags */ struct irq_chip { @@ -391,6 +393,9 @@ struct irq_chip { int (*irq_set_vcpu_affinity)(struct irq_data *data, void *vcpu_info); + void (*ipi_send_single)(struct irq_data *data, unsigned int cpu); + void (*ipi_send_mask)(struct irq_data *data, const struct cpumask *dest); + unsigned long flags; }; -- cgit v0.10.2 From 3b8e29a82dd16c1f2061e0b955a71cd36eeb061b Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:22 +0000 Subject: genirq: Implement ipi_send_mask/single() Add APIs to send IPIs from driver and arch code. We have different functions because we allow architecture code to cache the irq descriptor to avoid lookups. Driver code has to use the irq number and is subject to more restrictive checks. [ tglx: Polish the implementation ] Signed-off-by: Qais Yousef Cc: Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-12-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner diff --git a/include/linux/irq.h b/include/linux/irq.h index 3b3a5b8..d5ebd94 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -948,5 +948,9 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc, /* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */ #define INVALID_HWIRQ (~0UL) irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu); +int __ipi_send_single(struct irq_desc *desc, unsigned int cpu); +int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest); +int ipi_send_single(unsigned int virq, unsigned int cpu); +int ipi_send_mask(unsigned int virq, const struct cpumask *dest); #endif /* _LINUX_IRQ_H */ diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 6f34f29..c37f34b 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -161,7 +161,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) * Get the real hardware irq number if the underlying implementation * uses a seperate irq per cpu. If the underlying implementation uses * a single hardware irq for all cpus then the IPI send mechanism - * needs to take care of this. + * needs to take care of the cpu destinations. */ if (irq_domain_is_ipi_per_cpu(data->domain)) data = irq_get_irq_data(irq + cpu - data->common->ipi_offset); @@ -169,3 +169,158 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) return data ? 
irqd_to_hwirq(data) : INVALID_HWIRQ; } EXPORT_SYMBOL_GPL(ipi_get_hwirq); + +static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data, + const struct cpumask *dest, unsigned int cpu) +{ + struct cpumask *ipimask = irq_data_get_affinity_mask(data); + + if (!chip || !ipimask) + return -EINVAL; + + if (!chip->ipi_send_single && !chip->ipi_send_mask) + return -EINVAL; + + if (cpu > nr_cpu_ids) + return -EINVAL; + + if (dest) { + if (!cpumask_subset(dest, ipimask)) + return -EINVAL; + } else { + if (!cpumask_test_cpu(cpu, ipimask)) + return -EINVAL; + } + return 0; +} + +/** + * __ipi_send_single - send an IPI to a target Linux SMP CPU + * @desc: pointer to irq_desc of the IRQ + * @cpu: destination CPU, must in the destination mask passed to + * irq_reserve_ipi() + * + * This function is for architecture or core code to speed up IPI sending. Not + * usable from driver code. + * + * Returns zero on success and negative error number on failure. + */ +int __ipi_send_single(struct irq_desc *desc, unsigned int cpu) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(data); + +#ifdef DEBUG + /* + * Minimise the overhead by omitting the checks for Linux SMP IPIs. + * Since the callers should be arch or core code which is generally + * trusted, only check for errors when debugging. + */ + if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu))) + return -EINVAL; +#endif + if (!chip->ipi_send_single) { + chip->ipi_send_mask(data, cpumask_of(cpu)); + return 0; + } + + /* FIXME: Store this information in irqdata flags */ + if (irq_domain_is_ipi_per_cpu(data->domain) && + cpu != data->common->ipi_offset) { + /* use the correct data for that cpu */ + unsigned irq = data->irq + cpu - data->common->ipi_offset; + + data = irq_get_irq_data(irq); + } + chip->ipi_send_single(data, cpu); + return 0; +} + +/** + * ipi_send_mask - send an IPI to target Linux SMP CPU(s) + * @desc: pointer to irq_desc of the IRQ + * @dest: dest CPU(s), must be a subset of the mask passed to + * irq_reserve_ipi() + * + * This function is for architecture or core code to speed up IPI sending. Not + * usable from driver code. + * + * Returns zero on success and negative error number on failure. + */ +int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(data); + unsigned int cpu; + +#ifdef DEBUG + /* + * Minimise the overhead by omitting the checks for Linux SMP IPIs. + * Since the callers should be arch or core code which is generally + * trusted, only check for errors when debugging. + */ + if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0))) + return -EINVAL; +#endif + if (chip->ipi_send_mask) { + chip->ipi_send_mask(data, dest); + return 0; + } + + if (irq_domain_is_ipi_per_cpu(data->domain)) { + unsigned int base = data->irq; + + for_each_cpu(cpu, dest) { + unsigned irq = base + cpu - data->common->ipi_offset; + + data = irq_get_irq_data(irq); + chip->ipi_send_single(data, cpu); + } + } else { + for_each_cpu(cpu, dest) + chip->ipi_send_single(data, cpu); + } + return 0; +} + +/** + * ipi_send_single - Send an IPI to a single CPU + * @virq: linux irq number from irq_reserve_ipi() + * @cpu: destination CPU, must in the destination mask passed to + * irq_reserve_ipi() + * + * Returns zero on success and negative error number on failure. 
+ */ +int ipi_send_single(unsigned int virq, unsigned int cpu) +{ + struct irq_desc *desc = irq_to_desc(virq); + struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL; + struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL; + + if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu))) + return -EINVAL; + + return __ipi_send_single(desc, cpu); +} +EXPORT_SYMBOL_GPL(ipi_send_single); + +/** + * ipi_send_mask - Send an IPI to target CPU(s) + * @virq: linux irq number from irq_reserve_ipi() + * @dest: dest CPU(s), must be a subset of the mask passed to + * irq_reserve_ipi() + * + * Returns zero on success and negative error number on failure. + */ +int ipi_send_mask(unsigned int virq, const struct cpumask *dest) +{ + struct irq_desc *desc = irq_to_desc(virq); + struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL; + struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL; + + if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0))) + return -EINVAL; + + return __ipi_send_mask(desc, dest); +} +EXPORT_SYMBOL_GPL(ipi_send_mask); -- cgit v0.10.2 From 2af70a962070fd2e6f3b1a259b652faa3fd1a122 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:23 +0000 Subject: irqchip/mips-gic: Add a IPI hierarchy domain Add a new ipi domain on top of the normal domain. MIPS GIC now supports dynamic allocation of an IPI. Signed-off-by: Qais Yousef Acked-by: Ralf Baechle Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-13-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 0ffbdfd..71e648a 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -209,6 +209,7 @@ config KEYSTONE_IRQ config MIPS_GIC bool + select IRQ_DOMAIN_HIERARCHY select MIPS_CM config INGENIC_IRQ diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 9e17ef2..99f01ca 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -29,16 +29,31 @@ struct gic_pcpu_mask { DECLARE_BITMAP(pcpu_mask, GIC_MAX_INTRS); }; +struct gic_irq_spec { + enum { + GIC_DEVICE, + GIC_IPI + } type; + + union { + struct cpumask *ipimask; + unsigned int hwirq; + }; +}; + static unsigned long __gic_base_addr; + static void __iomem *gic_base; static struct gic_pcpu_mask pcpu_masks[NR_CPUS]; static DEFINE_SPINLOCK(gic_lock); static struct irq_domain *gic_irq_domain; +static struct irq_domain *gic_ipi_domain; static int gic_shared_intrs; static int gic_vpes; static unsigned int gic_cpu_pin; static unsigned int timer_cpu_pin; static struct irq_chip gic_level_irq_controller, gic_edge_irq_controller; +DECLARE_BITMAP(ipi_resrv, GIC_MAX_INTRS); static void __gic_irq_dispatch(void); @@ -753,7 +768,7 @@ static int gic_local_irq_domain_map(struct irq_domain *d, unsigned int virq, } static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, - irq_hw_number_t hw) + irq_hw_number_t hw, unsigned int vpe) { int intr = GIC_HWIRQ_TO_SHARED(hw); unsigned long flags; @@ -763,9 +778,8 @@ static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, spin_lock_irqsave(&gic_lock, flags); gic_map_to_pin(intr, gic_cpu_pin); - /* Map to VPE 0 by default */ - gic_map_to_vpe(intr, 0); - set_bit(intr, pcpu_masks[0].pcpu_mask); + gic_map_to_vpe(intr, vpe); + set_bit(intr, pcpu_masks[vpe].pcpu_mask); spin_unlock_irqrestore(&gic_lock, flags); return 0; @@ -776,7 +790,7 @@ static int gic_irq_domain_map(struct irq_domain *d, 
unsigned int virq, { if (GIC_HWIRQ_TO_LOCAL(hw) < GIC_NUM_LOCAL_INTRS) return gic_local_irq_domain_map(d, virq, hw); - return gic_shared_irq_domain_map(d, virq, hw); + return gic_shared_irq_domain_map(d, virq, hw, 0); } static int gic_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, @@ -798,9 +812,157 @@ static int gic_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, return 0; } +static int gic_irq_domain_alloc(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct gic_irq_spec *spec = arg; + irq_hw_number_t hwirq, base_hwirq; + int cpu, ret, i; + + if (spec->type == GIC_DEVICE) { + /* verify that it doesn't conflict with an IPI irq */ + if (test_bit(spec->hwirq, ipi_resrv)) + return -EBUSY; + } else { + base_hwirq = find_first_bit(ipi_resrv, gic_shared_intrs); + if (base_hwirq == gic_shared_intrs) { + return -ENOMEM; + } + + /* check that we have enough space */ + for (i = base_hwirq; i < nr_irqs; i++) { + if (!test_bit(i, ipi_resrv)) + return -EBUSY; + } + bitmap_clear(ipi_resrv, base_hwirq, nr_irqs); + + /* map the hwirq for each cpu consecutively */ + i = 0; + for_each_cpu(cpu, spec->ipimask) { + hwirq = GIC_SHARED_TO_HWIRQ(base_hwirq + i); + + ret = irq_domain_set_hwirq_and_chip(d, virq + i, hwirq, + &gic_edge_irq_controller, + NULL); + if (ret) + goto error; + + ret = gic_shared_irq_domain_map(d, virq + i, hwirq, cpu); + if (ret) + goto error; + + i++; + } + + /* + * tell the parent about the base hwirq we allocated so it can + * set its own domain data + */ + spec->hwirq = base_hwirq; + } + + return 0; +error: + bitmap_set(ipi_resrv, base_hwirq, nr_irqs); + return ret; +} + +void gic_irq_domain_free(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs) +{ + irq_hw_number_t base_hwirq; + struct irq_data *data; + + data = irq_get_irq_data(virq); + if (!data) + return; + + base_hwirq = GIC_HWIRQ_TO_SHARED(irqd_to_hwirq(data)); + bitmap_set(ipi_resrv, base_hwirq, nr_irqs); +} + static const struct irq_domain_ops gic_irq_domain_ops = { .map = gic_irq_domain_map, .xlate = gic_irq_domain_xlate, + .alloc = gic_irq_domain_alloc, + .free = gic_irq_domain_free, +}; + +static int gic_ipi_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, + unsigned int *out_type) +{ + /* + * There's nothing to translate here. hwirq is dynamically allocated and + * the irq type is always edge triggered. 
+ * */ + *out_hwirq = 0; + *out_type = IRQ_TYPE_EDGE_RISING; + + return 0; +} + +static int gic_ipi_domain_alloc(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct cpumask *ipimask = arg; + struct gic_irq_spec spec = { + .type = GIC_IPI, + .ipimask = ipimask + }; + int ret, i; + + ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, &spec); + if (ret) + return ret; + + /* the parent should have set spec.hwirq to the base_hwirq it allocated */ + for (i = 0; i < nr_irqs; i++) { + ret = irq_domain_set_hwirq_and_chip(d, virq + i, + GIC_SHARED_TO_HWIRQ(spec.hwirq + i), + &gic_edge_irq_controller, + NULL); + if (ret) + goto error; + + ret = irq_set_irq_type(virq + i, IRQ_TYPE_EDGE_RISING); + if (ret) + goto error; + } + + return 0; +error: + irq_domain_free_irqs_parent(d, virq, nr_irqs); + return ret; +} + +void gic_ipi_domain_free(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs) +{ + irq_domain_free_irqs_parent(d, virq, nr_irqs); +} + +int gic_ipi_domain_match(struct irq_domain *d, struct device_node *node, + enum irq_domain_bus_token bus_token) +{ + bool is_ipi; + + switch (bus_token) { + case DOMAIN_BUS_IPI: + is_ipi = d->bus_token == bus_token; + return to_of_node(d->fwnode) == node && is_ipi; + break; + default: + return 0; + } +} + +static struct irq_domain_ops gic_ipi_domain_ops = { + .xlate = gic_ipi_domain_xlate, + .alloc = gic_ipi_domain_alloc, + .free = gic_ipi_domain_free, + .match = gic_ipi_domain_match, }; static void __init __gic_init(unsigned long gic_base_addr, @@ -864,6 +1026,18 @@ static void __init __gic_init(unsigned long gic_base_addr, if (!gic_irq_domain) panic("Failed to add GIC IRQ domain"); + gic_ipi_domain = irq_domain_add_hierarchy(gic_irq_domain, + IRQ_DOMAIN_FLAG_IPI_PER_CPU, + GIC_NUM_LOCAL_INTRS + gic_shared_intrs, + node, &gic_ipi_domain_ops, NULL); + if (!gic_ipi_domain) + panic("Failed to add GIC IPI domain"); + + gic_ipi_domain->bus_token = DOMAIN_BUS_IPI; + + /* Make the last 2 * NR_CPUS available for IPIs */ + bitmap_set(ipi_resrv, gic_shared_intrs - 2 * NR_CPUS, 2 * NR_CPUS); + gic_basic_init(); gic_ipi_init(); -- cgit v0.10.2 From c98c1822ee13e4501bf48a9e3184fb9a84c149c0 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:24 +0000 Subject: irqchip/mips-gic: Add device hierarchy domain Now the root gic_irq_domain is split into device and IPI domains. This form provides a better representation of how the root domain is split into 2. One for devices and one for IPIs. 
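Taken together with the generic code added earlier in this series, a platform can now locate an irqchip's IPI domain by bus token, reserve an IPI and send it. A usage sketch with invented demo_* names; the receive side for a per-cpu virq range is shortened to a single request_irq() here, the MIPS smp.c patch later in the series shows the complete version.

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/interrupt.h>
#include <linux/cpumask.h>
#include <linux/of.h>
#include <linux/smp.h>

static unsigned int demo_ipi_virq;

static irqreturn_t demo_ipi_handler(int irq, void *dev_id)
{
	/* cross-call arrived on this CPU */
	return IRQ_HANDLED;
}

static int __init demo_ipi_setup(struct device_node *intc_node)
{
	struct irq_domain *ipidomain;

	/* find the controller's IPI domain via the new bus token */
	ipidomain = irq_find_matching_host(intc_node, DOMAIN_BUS_IPI);
	if (!ipidomain)
		return -ENODEV;

	/* reserve an IPI that can target every possible CPU */
	demo_ipi_virq = irq_reserve_ipi(ipidomain, cpu_possible_mask);
	if (!demo_ipi_virq)
		return -ENODEV;

	return request_irq(demo_ipi_virq, demo_ipi_handler, IRQF_PERCPU,
			   "demo-ipi", NULL);
}

static void demo_kick_cpu(unsigned int cpu)
{
	ipi_send_single(demo_ipi_virq, cpu);	/* checked, driver-level API */
}

static void demo_kick_others(void)
{
	struct cpumask mask;
	unsigned int self = get_cpu();

	cpumask_copy(&mask, cpu_online_mask);
	cpumask_clear_cpu(self, &mask);
	ipi_send_mask(demo_ipi_virq, &mask);
	put_cpu();
}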
Signed-off-by: Qais Yousef Acked-by: Ralf Baechle Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-14-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 99f01ca..794fc59 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -47,6 +47,7 @@ static void __iomem *gic_base; static struct gic_pcpu_mask pcpu_masks[NR_CPUS]; static DEFINE_SPINLOCK(gic_lock); static struct irq_domain *gic_irq_domain; +static struct irq_domain *gic_dev_domain; static struct irq_domain *gic_ipi_domain; static int gic_shared_intrs; static int gic_vpes; @@ -793,25 +794,6 @@ static int gic_irq_domain_map(struct irq_domain *d, unsigned int virq, return gic_shared_irq_domain_map(d, virq, hw, 0); } -static int gic_irq_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, - const u32 *intspec, unsigned int intsize, - irq_hw_number_t *out_hwirq, - unsigned int *out_type) -{ - if (intsize != 3) - return -EINVAL; - - if (intspec[0] == GIC_SHARED) - *out_hwirq = GIC_SHARED_TO_HWIRQ(intspec[1]); - else if (intspec[0] == GIC_LOCAL) - *out_hwirq = GIC_LOCAL_TO_HWIRQ(intspec[1]); - else - return -EINVAL; - *out_type = intspec[2] & IRQ_TYPE_SENSE_MASK; - - return 0; -} - static int gic_irq_domain_alloc(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs, void *arg) { @@ -881,11 +863,86 @@ void gic_irq_domain_free(struct irq_domain *d, unsigned int virq, bitmap_set(ipi_resrv, base_hwirq, nr_irqs); } +int gic_irq_domain_match(struct irq_domain *d, struct device_node *node, + enum irq_domain_bus_token bus_token) +{ + /* this domain should'nt be accessed directly */ + return 0; +} + static const struct irq_domain_ops gic_irq_domain_ops = { .map = gic_irq_domain_map, - .xlate = gic_irq_domain_xlate, .alloc = gic_irq_domain_alloc, .free = gic_irq_domain_free, + .match = gic_irq_domain_match, +}; + +static int gic_dev_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, + unsigned int *out_type) +{ + if (intsize != 3) + return -EINVAL; + + if (intspec[0] == GIC_SHARED) + *out_hwirq = GIC_SHARED_TO_HWIRQ(intspec[1]); + else if (intspec[0] == GIC_LOCAL) + *out_hwirq = GIC_LOCAL_TO_HWIRQ(intspec[1]); + else + return -EINVAL; + *out_type = intspec[2] & IRQ_TYPE_SENSE_MASK; + + return 0; +} + +static int gic_dev_domain_alloc(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_fwspec *fwspec = arg; + struct gic_irq_spec spec = { + .type = GIC_DEVICE, + .hwirq = fwspec->param[1], + }; + int i, ret; + bool is_shared = fwspec->param[0] == GIC_SHARED; + + if (is_shared) { + ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, &spec); + if (ret) + return ret; + } + + for (i = 0; i < nr_irqs; i++) { + irq_hw_number_t hwirq; + + if (is_shared) + hwirq = GIC_SHARED_TO_HWIRQ(spec.hwirq + i); + else + hwirq = GIC_LOCAL_TO_HWIRQ(spec.hwirq + i); + + ret = irq_domain_set_hwirq_and_chip(d, virq + i, + hwirq, + &gic_level_irq_controller, + NULL); + if (ret) + return ret; + } + + return 0; +} + +void gic_dev_domain_free(struct irq_domain *d, unsigned int virq, + unsigned int nr_irqs) +{ + /* no real allocation is done for dev irqs, so no need to free anything */ + return; +} + +static struct irq_domain_ops gic_dev_domain_ops = { + .xlate = gic_dev_domain_xlate, + .alloc = gic_dev_domain_alloc, + .free = gic_dev_domain_free, }; static int 
gic_ipi_domain_xlate(struct irq_domain *d, struct device_node *ctrlr, @@ -1026,6 +1083,12 @@ static void __init __gic_init(unsigned long gic_base_addr, if (!gic_irq_domain) panic("Failed to add GIC IRQ domain"); + gic_dev_domain = irq_domain_add_hierarchy(gic_irq_domain, 0, + GIC_NUM_LOCAL_INTRS + gic_shared_intrs, + node, &gic_dev_domain_ops, NULL); + if (!gic_dev_domain) + panic("Failed to add GIC DEV domain"); + gic_ipi_domain = irq_domain_add_hierarchy(gic_irq_domain, IRQ_DOMAIN_FLAG_IPI_PER_CPU, GIC_NUM_LOCAL_INTRS + gic_shared_intrs, -- cgit v0.10.2 From 2a07870511829977d02609dac6450017b0419ea9 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:25 +0000 Subject: irqchip/mips-gic: Use gic_vpes instead of NR_CPUS NR_CPUS is set by Kconfig and could be much higher than what is actually in the system. gic_vpes should be a true representative of the number of CPUs in the system, so use it instead. Signed-off-by: Qais Yousef Acked-by: Ralf Baechle Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-15-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 794fc59..1fe73a1 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -465,7 +465,7 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *cpumask, gic_map_to_vpe(irq, mips_cm_vp_id(cpumask_first(&tmp))); /* Update the pcpu_masks */ - for (i = 0; i < NR_CPUS; i++) + for (i = 0; i < gic_vpes; i++) clear_bit(irq, pcpu_masks[i].pcpu_mask); set_bit(irq, pcpu_masks[cpumask_first(&tmp)].pcpu_mask); @@ -1098,8 +1098,8 @@ static void __init __gic_init(unsigned long gic_base_addr, gic_ipi_domain->bus_token = DOMAIN_BUS_IPI; - /* Make the last 2 * NR_CPUS available for IPIs */ - bitmap_set(ipi_resrv, gic_shared_intrs - 2 * NR_CPUS, 2 * NR_CPUS); + /* Make the last 2 * gic_vpes available for IPIs */ + bitmap_set(ipi_resrv, gic_shared_intrs - 2 * gic_vpes, 2 * gic_vpes); gic_basic_init(); -- cgit v0.10.2 From 78930f09b9406db029c69dcebad10cd7b6e06fae Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:26 +0000 Subject: irqchip/mips-gic: Clear percpu_masks correctly when mapping When setting the mapping for a hwirq, make sure we clear percpu_masks for all other CPUs in case it was set previously.
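The invariant being restored here (illustration only, not a hunk from this patch) is that a shared hwirq is routed to exactly one VPE at a time, so every other CPU's routing bit has to be wiped before the new owner's bit is set. A self-contained sketch of that clear-then-set update, with placeholder "example_*" names and sizes:

	#include <linux/types.h>
	#include <linux/bitops.h>
	#include <linux/spinlock.h>

	#define EXAMPLE_NR_VPES		8	/* placeholder sizes */
	#define EXAMPLE_NR_HWIRQS	256

	struct example_pcpu_mask {
		DECLARE_BITMAP(mask, EXAMPLE_NR_HWIRQS);
	};

	static struct example_pcpu_mask example_masks[EXAMPLE_NR_VPES];
	static DEFINE_SPINLOCK(example_lock);

	static void example_route_to_vpe(unsigned int hwirq, unsigned int vpe)
	{
		unsigned long flags;
		unsigned int i;

		spin_lock_irqsave(&example_lock, flags);

		/* clear any stale routing left over from a previous mapping ... */
		for (i = 0; i < EXAMPLE_NR_VPES; i++)
			clear_bit(hwirq, example_masks[i].mask);

		/* ... so that exactly one VPE sees the interrupt afterwards */
		set_bit(hwirq, example_masks[vpe].mask);

		spin_unlock_irqrestore(&example_lock, flags);
	}

The driver's gic_shared_irq_domain_map() below does exactly this under gic_lock, using gic_vpes and pcpu_masks[].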
Signed-off-by: Qais Yousef Acked-by: Ralf Baechle Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-16-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 1fe73a1..83395bf 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -773,6 +773,7 @@ static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, { int intr = GIC_HWIRQ_TO_SHARED(hw); unsigned long flags; + int i; irq_set_chip_and_handler(virq, &gic_level_irq_controller, handle_level_irq); @@ -780,6 +781,8 @@ static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, spin_lock_irqsave(&gic_lock, flags); gic_map_to_pin(intr, gic_cpu_pin); gic_map_to_vpe(intr, vpe); + for (i = 0; i < gic_vpes; i++) + clear_bit(intr, pcpu_masks[i].pcpu_mask); set_bit(intr, pcpu_masks[vpe].pcpu_mask); spin_unlock_irqrestore(&gic_lock, flags); -- cgit v0.10.2 From fbde2d7d8290d8c642d591a471356abda2446874 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:27 +0000 Subject: MIPS: Add generic SMP IPI support Use the new generic IPI layer to provide generic SMP IPI support if the irqchip supports it. Signed-off-by: Qais Yousef Acked-by: Ralf Baechle Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-17-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index bd4385a..c012e19 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -33,12 +33,16 @@ #include #include #include +#include +#include +#include #include #include #include #include #include +#include #include #include #include @@ -79,6 +83,11 @@ static cpumask_t cpu_core_setup_map; cpumask_t cpu_coherent_mask; +#ifdef CONFIG_GENERIC_IRQ_IPI +static struct irq_desc *call_desc; +static struct irq_desc *sched_desc; +#endif + static inline void set_cpu_sibling_map(int cpu) { int i; @@ -145,6 +154,133 @@ void register_smp_ops(struct plat_smp_ops *ops) mp_ops = ops; } +#ifdef CONFIG_GENERIC_IRQ_IPI +void mips_smp_send_ipi_single(int cpu, unsigned int action) +{ + mips_smp_send_ipi_mask(cpumask_of(cpu), action); +} + +void mips_smp_send_ipi_mask(const struct cpumask *mask, unsigned int action) +{ + unsigned long flags; + unsigned int core; + int cpu; + + local_irq_save(flags); + + switch (action) { + case SMP_CALL_FUNCTION: + __ipi_send_mask(call_desc, mask); + break; + + case SMP_RESCHEDULE_YOURSELF: + __ipi_send_mask(sched_desc, mask); + break; + + default: + BUG(); + } + + if (mips_cpc_present()) { + for_each_cpu(cpu, mask) { + core = cpu_data[cpu].core; + + if (core == current_cpu_data.core) + continue; + + while (!cpumask_test_cpu(cpu, &cpu_coherent_mask)) { + mips_cpc_lock_other(core); + write_cpc_co_cmd(CPC_Cx_CMD_PWRUP); + mips_cpc_unlock_other(); + } + } + } + + local_irq_restore(flags); +} + + +static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) +{ + scheduler_ipi(); + + return IRQ_HANDLED; +} + +static irqreturn_t ipi_call_interrupt(int irq, void *dev_id) +{ + generic_smp_call_function_interrupt(); + + return IRQ_HANDLED; +} + +static struct irqaction irq_resched = { + .handler = ipi_resched_interrupt, + .flags = IRQF_PERCPU, + .name = "IPI resched" +}; + +static struct irqaction irq_call = { + .handler = ipi_call_interrupt, + .flags = IRQF_PERCPU, + .name = "IPI call" +}; + +static __init void smp_ipi_init_one(unsigned int virq, + struct irqaction 
*action) +{ + int ret; + + irq_set_handler(virq, handle_percpu_irq); + ret = setup_irq(virq, action); + BUG_ON(ret); +} + +static int __init mips_smp_ipi_init(void) +{ + unsigned int call_virq, sched_virq; + struct irq_domain *ipidomain; + struct device_node *node; + + node = of_irq_find_parent(of_root); + ipidomain = irq_find_matching_host(node, DOMAIN_BUS_IPI); + + /* + * Some platforms have half DT setup. So if we found irq node but + * didn't find an ipidomain, try to search for one that is not in the + * DT. + */ + if (node && !ipidomain) + ipidomain = irq_find_matching_host(NULL, DOMAIN_BUS_IPI); + + BUG_ON(!ipidomain); + + call_virq = irq_reserve_ipi(ipidomain, cpu_possible_mask); + BUG_ON(!call_virq); + + sched_virq = irq_reserve_ipi(ipidomain, cpu_possible_mask); + BUG_ON(!sched_virq); + + if (irq_domain_is_ipi_per_cpu(ipidomain)) { + int cpu; + + for_each_cpu(cpu, cpu_possible_mask) { + smp_ipi_init_one(call_virq + cpu, &irq_call); + smp_ipi_init_one(sched_virq + cpu, &irq_resched); + } + } else { + smp_ipi_init_one(call_virq, &irq_call); + smp_ipi_init_one(sched_virq, &irq_resched); + } + + call_desc = irq_to_desc(call_virq); + sched_desc = irq_to_desc(sched_virq); + + return 0; +} +early_initcall(mips_smp_ipi_init); +#endif + /* * First C code run on the secondary CPUs after being started up by * the master. -- cgit v0.10.2 From bb11cff327e54179c13446c4022ed4ed7d4871c7 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:28 +0000 Subject: MIPS: Make smp CMP, CPS and MT use the new generic IPI functions This commit does several things to avoid breaking bisectability. 1- Remove IPI init code from irqchip/mips-gic 2- Implement the new irqchip->send_ipi() in irqchip/mips-gic 3- Select GENERIC_IRQ_IPI Kconfig symbol for MIPS_GIC 4- Change MIPS SMP to use the generic IPI implementation Only the SMP variants that use GIC were converted as it's the only irqchip that will have the support for generic IPI for now. 
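For reference (this is an illustration, not one of this series' hunks), the consumer side of the generic IPI layer boils down to the following shape, condensed from mips_smp_ipi_init() above. The "example_*" names are placeholders, error handling is reduced to BUG_ON() as in the original, and the single-virq case is assumed for brevity; per-cpu IPI domains wire one virq per CPU, exactly as the MIPS code does:

	#include <linux/cpumask.h>
	#include <linux/interrupt.h>
	#include <linux/irq.h>
	#include <linux/irqdesc.h>
	#include <linux/irqdomain.h>

	static struct irq_desc *example_desc;

	static irqreturn_t example_ipi_handler(int irq, void *dev_id)
	{
		/* the per-CPU work would go here */
		return IRQ_HANDLED;
	}

	static struct irqaction example_action = {
		.handler = example_ipi_handler,
		.flags	 = IRQF_PERCPU,
		.name	 = "example IPI",
	};

	static int __init example_ipi_init(void)
	{
		struct irq_domain *d;
		unsigned int virq;
		int ret;

		/* find whichever irqchip advertised IPI support */
		d = irq_find_matching_host(NULL, DOMAIN_BUS_IPI);
		BUG_ON(!d);

		/* reserve one IPI (a per-cpu range when the domain is IPI_PER_CPU) */
		virq = irq_reserve_ipi(d, cpu_possible_mask);
		BUG_ON(!virq);

		irq_set_handler(virq, handle_percpu_irq);
		ret = setup_irq(virq, &example_action);
		BUG_ON(ret);

		example_desc = irq_to_desc(virq);
		return 0;
	}

	static void example_kick(const struct cpumask *mask)
	{
		/* fires the reserved IPI on every CPU in 'mask' */
		__ipi_send_mask(example_desc, mask);
	}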
Signed-off-by: Qais Yousef Acked-by: Ralf Baechle Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-18-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner diff --git a/arch/mips/include/asm/smp-ops.h b/arch/mips/include/asm/smp-ops.h index 6ba1fb8..db7c322 100644 --- a/arch/mips/include/asm/smp-ops.h +++ b/arch/mips/include/asm/smp-ops.h @@ -44,8 +44,9 @@ static inline void plat_smp_setup(void) mp_ops->smp_setup(); } -extern void gic_send_ipi_single(int cpu, unsigned int action); -extern void gic_send_ipi_mask(const struct cpumask *mask, unsigned int action); +extern void mips_smp_send_ipi_single(int cpu, unsigned int action); +extern void mips_smp_send_ipi_mask(const struct cpumask *mask, + unsigned int action); #else /* !CONFIG_SMP */ diff --git a/arch/mips/kernel/smp-cmp.c b/arch/mips/kernel/smp-cmp.c index d5e0f94..7692334 100644 --- a/arch/mips/kernel/smp-cmp.c +++ b/arch/mips/kernel/smp-cmp.c @@ -149,8 +149,8 @@ void __init cmp_prepare_cpus(unsigned int max_cpus) } struct plat_smp_ops cmp_smp_ops = { - .send_ipi_single = gic_send_ipi_single, - .send_ipi_mask = gic_send_ipi_mask, + .send_ipi_single = mips_smp_send_ipi_single, + .send_ipi_mask = mips_smp_send_ipi_mask, .init_secondary = cmp_init_secondary, .smp_finish = cmp_smp_finish, .boot_secondary = cmp_boot_secondary, diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c index 2ad4e4c..253e140 100644 --- a/arch/mips/kernel/smp-cps.c +++ b/arch/mips/kernel/smp-cps.c @@ -472,8 +472,8 @@ static struct plat_smp_ops cps_smp_ops = { .boot_secondary = cps_boot_secondary, .init_secondary = cps_init_secondary, .smp_finish = cps_smp_finish, - .send_ipi_single = gic_send_ipi_single, - .send_ipi_mask = gic_send_ipi_mask, + .send_ipi_single = mips_smp_send_ipi_single, + .send_ipi_mask = mips_smp_send_ipi_mask, #ifdef CONFIG_HOTPLUG_CPU .cpu_disable = cps_cpu_disable, .cpu_die = cps_cpu_die, diff --git a/arch/mips/kernel/smp-mt.c b/arch/mips/kernel/smp-mt.c index 86311a1..4f9570a 100644 --- a/arch/mips/kernel/smp-mt.c +++ b/arch/mips/kernel/smp-mt.c @@ -121,7 +121,7 @@ static void vsmp_send_ipi_single(int cpu, unsigned int action) #ifdef CONFIG_MIPS_GIC if (gic_present) { - gic_send_ipi_single(cpu, action); + mips_smp_send_ipi_single(cpu, action); return; } #endif diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 71e648a..00bbec6 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -209,6 +209,7 @@ config KEYSTONE_IRQ config MIPS_GIC bool + select GENERIC_IRQ_IPI select IRQ_DOMAIN_HIERARCHY select MIPS_CM diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 83395bf..37831a5 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -280,9 +280,11 @@ static void gic_bind_eic_interrupt(int irq, int set) GIC_VPE_EIC_SS(irq), set); } -void gic_send_ipi(unsigned int intr) +static void gic_send_ipi(struct irq_data *d, unsigned int cpu) { - gic_write(GIC_REG(SHARED, GIC_SH_WEDGE), GIC_SH_WEDGE_SET(intr)); + irq_hw_number_t hwirq = GIC_HWIRQ_TO_SHARED(irqd_to_hwirq(d)); + + gic_write(GIC_REG(SHARED, GIC_SH_WEDGE), GIC_SH_WEDGE_SET(hwirq)); } int gic_get_c0_compare_int(void) @@ -495,6 +497,7 @@ static struct irq_chip gic_edge_irq_controller = { #ifdef CONFIG_SMP .irq_set_affinity = gic_set_affinity, #endif + .ipi_send_single = gic_send_ipi, }; static void gic_handle_local_int(bool chained) @@ -588,83 +591,6 @@ static void gic_irq_dispatch(struct irq_desc *desc) gic_handle_shared_int(true); } -#ifdef 
CONFIG_MIPS_GIC_IPI -static int gic_resched_int_base; -static int gic_call_int_base; - -unsigned int plat_ipi_resched_int_xlate(unsigned int cpu) -{ - return gic_resched_int_base + cpu; -} - -unsigned int plat_ipi_call_int_xlate(unsigned int cpu) -{ - return gic_call_int_base + cpu; -} - -static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) -{ - scheduler_ipi(); - - return IRQ_HANDLED; -} - -static irqreturn_t ipi_call_interrupt(int irq, void *dev_id) -{ - generic_smp_call_function_interrupt(); - - return IRQ_HANDLED; -} - -static struct irqaction irq_resched = { - .handler = ipi_resched_interrupt, - .flags = IRQF_PERCPU, - .name = "IPI resched" -}; - -static struct irqaction irq_call = { - .handler = ipi_call_interrupt, - .flags = IRQF_PERCPU, - .name = "IPI call" -}; - -static __init void gic_ipi_init_one(unsigned int intr, int cpu, - struct irqaction *action) -{ - int virq = irq_create_mapping(gic_irq_domain, - GIC_SHARED_TO_HWIRQ(intr)); - int i; - - gic_map_to_vpe(intr, mips_cm_vp_id(cpu)); - for (i = 0; i < NR_CPUS; i++) - clear_bit(intr, pcpu_masks[i].pcpu_mask); - set_bit(intr, pcpu_masks[cpu].pcpu_mask); - - irq_set_irq_type(virq, IRQ_TYPE_EDGE_RISING); - - irq_set_handler(virq, handle_percpu_irq); - setup_irq(virq, action); -} - -static __init void gic_ipi_init(void) -{ - int i; - - /* Use last 2 * NR_CPUS interrupts as IPIs */ - gic_resched_int_base = gic_shared_intrs - nr_cpu_ids; - gic_call_int_base = gic_resched_int_base - nr_cpu_ids; - - for (i = 0; i < nr_cpu_ids; i++) { - gic_ipi_init_one(gic_call_int_base + i, i, &irq_call); - gic_ipi_init_one(gic_resched_int_base + i, i, &irq_resched); - } -} -#else -static inline void gic_ipi_init(void) -{ -} -#endif - static void __init gic_basic_init(void) { unsigned int i; @@ -1105,8 +1031,6 @@ static void __init __gic_init(unsigned long gic_base_addr, bitmap_set(ipi_resrv, gic_shared_intrs - 2 * gic_vpes, 2 * gic_vpes); gic_basic_init(); - - gic_ipi_init(); } void __init gic_init(unsigned long gic_base_addr, diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index ce824db..80f89e4 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -261,9 +261,6 @@ extern void gic_write_compare(cycle_t cnt); extern void gic_write_cpu_compare(cycle_t cnt, int cpu); extern void gic_start_count(void); extern void gic_stop_count(void); -extern void gic_send_ipi(unsigned int intr); -extern unsigned int plat_ipi_call_int_xlate(unsigned int); -extern unsigned int plat_ipi_resched_int_xlate(unsigned int); extern int gic_get_c0_compare_int(void); extern int gic_get_c0_perfcount_int(void); extern int gic_get_c0_fdc_int(void); -- cgit v0.10.2 From 7eb8c99db26cc6499bfb1eba72dffc4730570752 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:29 +0000 Subject: MIPS: Delete smp-gic.c We now have a generic IPI layer that will use GIC automatically if it's compiled in. 
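In other words (illustration only, not a hunk from this patch), an irqchip becomes usable by that generic layer once it selects GENERIC_IRQ_IPI in Kconfig, provides an .ipi_send_single callback and registers an IPI domain whose bus_token is DOMAIN_BUS_IPI, which is exactly what the mips-gic changes earlier in this series do. A minimal sketch with placeholder "example_*" names; the domain ops and sizes are assumed to exist elsewhere:

	#include <linux/irq.h>
	#include <linux/irqdomain.h>

	static void example_ipi_send_single(struct irq_data *d, unsigned int cpu)
	{
		/* poke the hardware doorbell for irqd_to_hwirq(d) on 'cpu' */
	}

	static struct irq_chip example_ipi_chip = {
		.name			= "example-ipi",
		.ipi_send_single	= example_ipi_send_single,
	};

	static void example_register_ipi_domain(struct irq_domain *parent,
						unsigned int nr_irqs,
						struct device_node *node,
						const struct irq_domain_ops *ops)
	{
		struct irq_domain *d;

		d = irq_domain_add_hierarchy(parent, IRQ_DOMAIN_FLAG_IPI_PER_CPU,
					     nr_irqs, node, ops, NULL);
		if (!d)
			return;

		/* this is what irq_find_matching_host(..., DOMAIN_BUS_IPI) matches */
		d->bus_token = DOMAIN_BUS_IPI;
	}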
Signed-off-by: Qais Yousef Acked-by: Ralf Baechle Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-19-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index eb079ac..ddcbeda 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2170,7 +2170,6 @@ config MIPS_MT_SMP select CPU_MIPSR2_IRQ_VI select CPU_MIPSR2_IRQ_EI select SYNC_R4K - select MIPS_GIC_IPI select MIPS_MT select SMP select SMP_UP @@ -2268,7 +2267,6 @@ config MIPS_VPE_APSP_API_MT config MIPS_CMP bool "MIPS CMP framework support (DEPRECATED)" depends on SYS_SUPPORTS_MIPS_CMP && !CPU_MIPSR6 - select MIPS_GIC_IPI select SMP select SYNC_R4K select SYS_SUPPORTS_SMP @@ -2288,7 +2286,6 @@ config MIPS_CPS select MIPS_CM select MIPS_CPC select MIPS_CPS_PM if HOTPLUG_CPU - select MIPS_GIC_IPI select SMP select SYNC_R4K if (CEVT_R4K || CSRC_R4K) select SYS_SUPPORTS_HOTPLUG_CPU @@ -2306,9 +2303,6 @@ config MIPS_CPS_PM select MIPS_CPC bool -config MIPS_GIC_IPI - bool - config MIPS_CM bool diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile index 68e2b7d..b0988fd 100644 --- a/arch/mips/kernel/Makefile +++ b/arch/mips/kernel/Makefile @@ -52,7 +52,6 @@ obj-$(CONFIG_MIPS_MT_SMP) += smp-mt.o obj-$(CONFIG_MIPS_CMP) += smp-cmp.o obj-$(CONFIG_MIPS_CPS) += smp-cps.o cps-vec.o obj-$(CONFIG_MIPS_CPS_NS16550) += cps-vec-ns16550.o -obj-$(CONFIG_MIPS_GIC_IPI) += smp-gic.o obj-$(CONFIG_MIPS_SPRAM) += spram.o obj-$(CONFIG_MIPS_VPE_LOADER) += vpe.o -- cgit v0.10.2 From 16a8083cedbe628228dbb08fc1469c70e6208619 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 8 Dec 2015 13:20:30 +0000 Subject: irqchip/mips-gic: Add new DT property to reserve IPIs The new property allows the range of GIC hwirqs to use for IPIs to be specified. This is an optional property. We preserve the previous behaviour of allocating the last 2 * gic_vpes if it's not specified or DT is not supported. Signed-off-by: Qais Yousef Acked-by: Rob Herring Acked-by: Ralf Baechle Cc: Cc: Cc: Cc: Cc: Cc: Qais Yousef Link: http://lkml.kernel.org/r/1449580830-23652-20-git-send-email-qais.yousef@imgtec.com Signed-off-by: Thomas Gleixner diff --git a/Documentation/devicetree/bindings/interrupt-controller/mips-gic.txt b/Documentation/devicetree/bindings/interrupt-controller/mips-gic.txt index aae4c38..1735953 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/mips-gic.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/mips-gic.txt @@ -23,6 +23,12 @@ Optional properties: - mti,reserved-cpu-vectors : Specifies the list of CPU interrupt vectors to which the GIC may not route interrupts. Valid values are 2 - 7. This property is ignored if the CPU is started in EIC mode. +- mti,reserved-ipi-vectors : Specifies the range of GIC interrupts that are + reserved for IPIs. + It accepts 2 values, the 1st is the starting interrupt and the 2nd is the size + of the reserved range. + If not specified, the driver will allocate the last 2 * number of VPEs in the + system. Required properties for timer sub-node: - compatible : Should be "mti,gic-timer".
@@ -44,6 +50,7 @@ Example: #interrupt-cells = <3>; mti,reserved-cpu-vectors = <7>; + mti,reserved-ipi-vectors = <40 8>; timer { compatible = "mti,gic-timer"; diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 37831a5..94a30da 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -957,6 +957,7 @@ static void __init __gic_init(unsigned long gic_base_addr, struct device_node *node) { unsigned int gicconfig; + unsigned int v[2]; __gic_base_addr = gic_base_addr; @@ -1027,8 +1028,15 @@ static void __init __gic_init(unsigned long gic_base_addr, gic_ipi_domain->bus_token = DOMAIN_BUS_IPI; - /* Make the last 2 * gic_vpes available for IPIs */ - bitmap_set(ipi_resrv, gic_shared_intrs - 2 * gic_vpes, 2 * gic_vpes); + if (node && + !of_property_read_u32_array(node, "mti,reserved-ipi-vectors", v, 2)) { + bitmap_set(ipi_resrv, v[0], v[1]); + } else { + /* Make the last 2 * gic_vpes available for IPIs */ + bitmap_set(ipi_resrv, + gic_shared_intrs - 2 * gic_vpes, + 2 * gic_vpes); + } gic_basic_init(); } -- cgit v0.10.2 From 13b35686e8b934ff78f59cef0c65fa3a43f8eeaf Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Fri, 19 Feb 2016 09:46:37 +0100 Subject: wait.[ch]: Introduce the simple waitqueue (swait) implementation The existing wait queue support has support for custom wake up call backs, wake flags, wake key (passed to call back) and exclusive flags that allow wakers to be tagged as exclusive, for limiting the number of wakers. In a lot of cases, none of these features are used, and hence we can benefit from a slimmed down version that lowers memory overhead and reduces runtime overhead. The concept originated from -rt, where waitqueues are a constant source of trouble, as we can't convert the head lock to a raw spinlock due to fancy and long lasting callbacks. With the removal of custom callbacks, we can use a raw lock for queue list manipulations, hence allowing the simple wait support to be used in -rt. [Patch is from PeterZ which is based on Thomas version. Commit message is written by Paul G. Daniel: - Fixed some compile issues - Added non-lazy implementation of swake_up_locked as suggested by Boqun Feng.] Originally-by: Thomas Gleixner Signed-off-by: Daniel Wagner Acked-by: Peter Zijlstra (Intel) Cc: linux-rt-users@vger.kernel.org Cc: Boqun Feng Cc: Marcelo Tosatti Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Paolo Bonzini Cc: "Paul E. McKenney" Link: http://lkml.kernel.org/r/1455871601-27484-2-git-send-email-wagi@monom.org Signed-off-by: Thomas Gleixner diff --git a/include/linux/swait.h b/include/linux/swait.h new file mode 100644 index 0000000..c1f9c62 --- /dev/null +++ b/include/linux/swait.h @@ -0,0 +1,172 @@ +#ifndef _LINUX_SWAIT_H +#define _LINUX_SWAIT_H + +#include +#include +#include +#include + +/* + * Simple wait queues + * + * While these are very similar to the other/complex wait queues (wait.h) the + * most important difference is that the simple waitqueue allows for + * deterministic behaviour -- IOW it has strictly bounded IRQ and lock hold + * times. + * + * In order to make this so, we had to drop a fair number of features of the + * other waitqueue code; notably: + * + * - mixing INTERRUPTIBLE and UNINTERRUPTIBLE sleeps on the same waitqueue; + * all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right + * sleeper state. + * + * - the exclusive mode; because this requires preserving the list order + * and this is hard. 
+ * + * - custom wake functions; because you cannot give any guarantees about + * random code. + * + * As a side effect of this; the data structures are slimmer. + * + * One would recommend using this wait queue where possible. + */ + +struct task_struct; + +struct swait_queue_head { + raw_spinlock_t lock; + struct list_head task_list; +}; + +struct swait_queue { + struct task_struct *task; + struct list_head task_list; +}; + +#define __SWAITQUEUE_INITIALIZER(name) { \ + .task = current, \ + .task_list = LIST_HEAD_INIT((name).task_list), \ +} + +#define DECLARE_SWAITQUEUE(name) \ + struct swait_queue name = __SWAITQUEUE_INITIALIZER(name) + +#define __SWAIT_QUEUE_HEAD_INITIALIZER(name) { \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ + .task_list = LIST_HEAD_INIT((name).task_list), \ +} + +#define DECLARE_SWAIT_QUEUE_HEAD(name) \ + struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INITIALIZER(name) + +extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name, + struct lock_class_key *key); + +#define init_swait_queue_head(q) \ + do { \ + static struct lock_class_key __key; \ + __init_swait_queue_head((q), #q, &__key); \ + } while (0) + +#ifdef CONFIG_LOCKDEP +# define __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ + ({ init_swait_queue_head(&name); name; }) +# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name) \ + struct swait_queue_head name = __SWAIT_QUEUE_HEAD_INIT_ONSTACK(name) +#else +# define DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(name) \ + DECLARE_SWAIT_QUEUE_HEAD(name) +#endif + +static inline int swait_active(struct swait_queue_head *q) +{ + return !list_empty(&q->task_list); +} + +extern void swake_up(struct swait_queue_head *q); +extern void swake_up_all(struct swait_queue_head *q); +extern void swake_up_locked(struct swait_queue_head *q); + +extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); +extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state); +extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state); + +extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait); +extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait); + +/* as per ___wait_event() but for swait, therefore "exclusive == 0" */ +#define ___swait_event(wq, condition, state, ret, cmd) \ +({ \ + struct swait_queue __wait; \ + long __ret = ret; \ + \ + INIT_LIST_HEAD(&__wait.task_list); \ + for (;;) { \ + long __int = prepare_to_swait_event(&wq, &__wait, state);\ + \ + if (condition) \ + break; \ + \ + if (___wait_is_interruptible(state) && __int) { \ + __ret = __int; \ + break; \ + } \ + \ + cmd; \ + } \ + finish_swait(&wq, &__wait); \ + __ret; \ +}) + +#define __swait_event(wq, condition) \ + (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \ + schedule()) + +#define swait_event(wq, condition) \ +do { \ + if (condition) \ + break; \ + __swait_event(wq, condition); \ +} while (0) + +#define __swait_event_timeout(wq, condition, timeout) \ + ___swait_event(wq, ___wait_cond_timeout(condition), \ + TASK_UNINTERRUPTIBLE, timeout, \ + __ret = schedule_timeout(__ret)) + +#define swait_event_timeout(wq, condition, timeout) \ +({ \ + long __ret = timeout; \ + if (!___wait_cond_timeout(condition)) \ + __ret = __swait_event_timeout(wq, condition, timeout); \ + __ret; \ +}) + +#define __swait_event_interruptible(wq, condition) \ + ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \ + schedule()) + +#define swait_event_interruptible(wq, 
condition) \ +({ \ + int __ret = 0; \ + if (!(condition)) \ + __ret = __swait_event_interruptible(wq, condition); \ + __ret; \ +}) + +#define __swait_event_interruptible_timeout(wq, condition, timeout) \ + ___swait_event(wq, ___wait_cond_timeout(condition), \ + TASK_INTERRUPTIBLE, timeout, \ + __ret = schedule_timeout(__ret)) + +#define swait_event_interruptible_timeout(wq, condition, timeout) \ +({ \ + long __ret = timeout; \ + if (!___wait_cond_timeout(condition)) \ + __ret = __swait_event_interruptible_timeout(wq, \ + condition, timeout); \ + __ret; \ +}) + +#endif /* _LINUX_SWAIT_H */ diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 6768797..7d4cba2 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -13,7 +13,7 @@ endif obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o completion.o idle.o +obj-y += wait.o swait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c new file mode 100644 index 0000000..82f0dff --- /dev/null +++ b/kernel/sched/swait.c @@ -0,0 +1,123 @@ +#include +#include + +void __init_swait_queue_head(struct swait_queue_head *q, const char *name, + struct lock_class_key *key) +{ + raw_spin_lock_init(&q->lock); + lockdep_set_class_and_name(&q->lock, key, name); + INIT_LIST_HEAD(&q->task_list); +} +EXPORT_SYMBOL(__init_swait_queue_head); + +/* + * The thing about the wake_up_state() return value; I think we can ignore it. + * + * If for some reason it would return 0, that means the previously waiting + * task is already running, so it will observe condition true (or has already). + */ +void swake_up_locked(struct swait_queue_head *q) +{ + struct swait_queue *curr; + + if (list_empty(&q->task_list)) + return; + + curr = list_first_entry(&q->task_list, typeof(*curr), task_list); + wake_up_process(curr->task); + list_del_init(&curr->task_list); +} +EXPORT_SYMBOL(swake_up_locked); + +void swake_up(struct swait_queue_head *q) +{ + unsigned long flags; + + if (!swait_active(q)) + return; + + raw_spin_lock_irqsave(&q->lock, flags); + swake_up_locked(q); + raw_spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(swake_up); + +/* + * Does not allow usage from IRQ disabled, since we must be able to + * release IRQs to guarantee bounded hold time. 
+ */ +void swake_up_all(struct swait_queue_head *q) +{ + struct swait_queue *curr; + LIST_HEAD(tmp); + + if (!swait_active(q)) + return; + + raw_spin_lock_irq(&q->lock); + list_splice_init(&q->task_list, &tmp); + while (!list_empty(&tmp)) { + curr = list_first_entry(&tmp, typeof(*curr), task_list); + + wake_up_state(curr->task, TASK_NORMAL); + list_del_init(&curr->task_list); + + if (list_empty(&tmp)) + break; + + raw_spin_unlock_irq(&q->lock); + raw_spin_lock_irq(&q->lock); + } + raw_spin_unlock_irq(&q->lock); +} +EXPORT_SYMBOL(swake_up_all); + +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + wait->task = current; + if (list_empty(&wait->task_list)) + list_add(&wait->task_list, &q->task_list); +} + +void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&q->lock, flags); + __prepare_to_swait(q, wait); + set_current_state(state); + raw_spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_swait); + +long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) +{ + if (signal_pending_state(state, current)) + return -ERESTARTSYS; + + prepare_to_swait(q, wait, state); + + return 0; +} +EXPORT_SYMBOL(prepare_to_swait_event); + +void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + __set_current_state(TASK_RUNNING); + if (!list_empty(&wait->task_list)) + list_del_init(&wait->task_list); +} + +void finish_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + + if (!list_empty_careful(&wait->task_list)) { + raw_spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + raw_spin_unlock_irqrestore(&q->lock, flags); + } +} +EXPORT_SYMBOL(finish_swait); -- cgit v0.10.2 From ef50c046338948df9f23fb5ef853efda0274c7b3 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Fri, 19 Feb 2016 09:46:38 +0100 Subject: kbuild: Add option to turn incompatible pointer check into error With the introduction of the simple wait API we have two very similar APIs in the kernel. For example wake_up() and swake_up() is only one character away. Although the compiler will warn happily the wrong usage it keeps on going an even links the kernel. Thomas and Peter would rather like to see early missuses reported as error early on. In a first attempt we tried to wrap all swait and wait calls into a macro which has an compile time type assertion. The result was pretty ugly and wasn't able to catch all wrong usages. woken_wake_function(), autoremove_wake_function() and wake_bit_function() are assigned as function pointers. Wrapping them with a macro around is not possible. Prefixing them with '_' was also not a real option because there some users in the kernel which do use them as well. All in all this attempt looked to intrusive and too ugly. An alternative is to turn the pointer type check into an error which catches wrong type uses. Obviously not only the swait/wait ones. That isn't a bad thing either. Signed-off-by: Daniel Wagner Acked-by: Peter Zijlstra (Intel) Cc: linux-rt-users@vger.kernel.org Cc: Boqun Feng Cc: Marcelo Tosatti Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Paolo Bonzini Cc: "Paul E. 
McKenney" Link: http://lkml.kernel.org/r/1455871601-27484-3-git-send-email-wagi@monom.org Signed-off-by: Thomas Gleixner diff --git a/Makefile b/Makefile index 6c1a3c2..4513509 100644 --- a/Makefile +++ b/Makefile @@ -773,6 +773,9 @@ KBUILD_CFLAGS += $(call cc-option,-Werror=strict-prototypes) # Prohibit date/time macros, which would make the build non-deterministic KBUILD_CFLAGS += $(call cc-option,-Werror=date-time) +# enforce correct pointer usage +KBUILD_CFLAGS += $(call cc-option,-Werror=incompatible-pointer-types) + # use the deterministic mode of AR if available KBUILD_ARFLAGS := $(call ar-option,D) -- cgit v0.10.2 From 8577370fb0cbe88266b7583d8d3b9f43ced077a0 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 19 Feb 2016 09:46:39 +0100 Subject: KVM: Use simple waitqueue for vcpu->wq The problem: On -rt, an emulated LAPIC timer instances has the following path: 1) hard interrupt 2) ksoftirqd is scheduled 3) ksoftirqd wakes up vcpu thread 4) vcpu thread is scheduled This extra context switch introduces unnecessary latency in the LAPIC path for a KVM guest. The solution: Allow waking up vcpu thread from hardirq context, thus avoiding the need for ksoftirqd to be scheduled. Normal waitqueues make use of spinlocks, which on -RT are sleepable locks. Therefore, waking up a waitqueue waiter involves locking a sleeping lock, which is not allowed from hard interrupt context. cyclictest command line: This patch reduces the average latency in my tests from 14us to 11us. Daniel writes: Paolo asked for numbers from kvm-unit-tests/tscdeadline_latency benchmark on mainline. The test was run 1000 times on tip/sched/core 4.4.0-rc8-01134-g0905f04: ./x86-run x86/tscdeadline_latency.flat -cpu host with idle=poll. The test seems not to deliver really stable numbers though most of them are smaller. Paolo write: "Anything above ~10000 cycles means that the host went to C1 or lower---the number means more or less nothing in that case. The mean shows an improvement indeed." Before: min max mean std count 1000.000000 1000.000000 1000.000000 1000.000000 mean 5162.596000 2019270.084000 5824.491541 20681.645558 std 75.431231 622607.723969 89.575700 6492.272062 min 4466.000000 23928.000000 5537.926500 585.864966 25% 5163.000000 1613252.750000 5790.132275 16683.745433 50% 5175.000000 2281919.000000 5834.654000 23151.990026 75% 5190.000000 2382865.750000 5861.412950 24148.206168 max 5228.000000 4175158.000000 6254.827300 46481.048691 After min max mean std count 1000.000000 1000.00000 1000.000000 1000.000000 mean 5143.511000 2076886.10300 5813.312474 21207.357565 std 77.668322 610413.09583 86.541500 6331.915127 min 4427.000000 25103.00000 5529.756600 559.187707 25% 5148.000000 1691272.75000 5784.889825 17473.518244 50% 5160.000000 2308328.50000 5832.025000 23464.837068 75% 5172.000000 2393037.75000 5853.177675 24223.969976 max 5222.000000 3922458.00000 6186.720500 42520.379830 [Patch was originaly based on the swait implementation found in the -rt tree. Daniel ported it to mainline's version and gathered the benchmark numbers for tscdeadline_latency test.] Signed-off-by: Daniel Wagner Acked-by: Peter Zijlstra (Intel) Cc: linux-rt-users@vger.kernel.org Cc: Boqun Feng Cc: Marcelo Tosatti Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Paolo Bonzini Cc: "Paul E. 
McKenney" Link: http://lkml.kernel.org/r/1455871601-27484-4-git-send-email-wagi@monom.org Signed-off-by: Thomas Gleixner diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index dda1959..08e49c4 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -506,18 +506,18 @@ static void kvm_arm_resume_guest(struct kvm *kvm) struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { - wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); + struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); vcpu->arch.pause = false; - wake_up_interruptible(wq); + swake_up(wq); } } static void vcpu_sleep(struct kvm_vcpu *vcpu) { - wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu); + struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); - wait_event_interruptible(*wq, ((!vcpu->arch.power_off) && + swait_event_interruptible(*wq, ((!vcpu->arch.power_off) && (!vcpu->arch.pause))); } diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c index a9b3b90..c2b1315 100644 --- a/arch/arm/kvm/psci.c +++ b/arch/arm/kvm/psci.c @@ -70,7 +70,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) { struct kvm *kvm = source_vcpu->kvm; struct kvm_vcpu *vcpu = NULL; - wait_queue_head_t *wq; + struct swait_queue_head *wq; unsigned long cpu_id; unsigned long context_id; phys_addr_t target_pc; @@ -119,7 +119,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) smp_mb(); /* Make sure the above is visible */ wq = kvm_arch_vcpu_wq(vcpu); - wake_up_interruptible(wq); + swake_up(wq); return PSCI_RET_SUCCESS; } diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 8bc3977..341f6a1 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -445,8 +445,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, dvcpu->arch.wait = 0; - if (waitqueue_active(&dvcpu->wq)) - wake_up_interruptible(&dvcpu->wq); + if (swait_active(&dvcpu->wq)) + swake_up(&dvcpu->wq); return 0; } @@ -1174,8 +1174,8 @@ static void kvm_mips_comparecount_func(unsigned long data) kvm_mips_callbacks->queue_timer_int(vcpu); vcpu->arch.wait = 0; - if (waitqueue_active(&vcpu->wq)) - wake_up_interruptible(&vcpu->wq); + if (swait_active(&vcpu->wq)) + swake_up(&vcpu->wq); } /* low level hrtimer wake routine */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 9d08d8c..c98afa5 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -289,7 +289,7 @@ struct kvmppc_vcore { struct list_head runnable_threads; struct list_head preempt_list; spinlock_t lock; - wait_queue_head_t wq; + struct swait_queue_head wq; spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */ u64 stolen_tb; u64 preempt_tb; @@ -629,7 +629,7 @@ struct kvm_vcpu_arch { u8 prodded; u32 last_inst; - wait_queue_head_t *wqp; + struct swait_queue_head *wqp; struct kvmppc_vcore *vcore; int ret; int trap; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index baeddb0..f1187bb 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -114,11 +114,11 @@ static bool kvmppc_ipi_thread(int cpu) static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) { int cpu; - wait_queue_head_t *wqp; + struct swait_queue_head *wqp; wqp = kvm_arch_vcpu_wq(vcpu); - if (waitqueue_active(wqp)) { - wake_up_interruptible(wqp); + if (swait_active(wqp)) { + swake_up(wqp); ++vcpu->stat.halt_wakeup; } @@ -701,8 +701,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) tvcpu->arch.prodded = 1; smp_mb(); if (vcpu->arch.ceded) { - if (waitqueue_active(&vcpu->wq)) { - 
wake_up_interruptible(&vcpu->wq); + if (swait_active(&vcpu->wq)) { + swake_up(&vcpu->wq); vcpu->stat.halt_wakeup++; } } @@ -1459,7 +1459,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) INIT_LIST_HEAD(&vcore->runnable_threads); spin_lock_init(&vcore->lock); spin_lock_init(&vcore->stoltb_lock); - init_waitqueue_head(&vcore->wq); + init_swait_queue_head(&vcore->wq); vcore->preempt_tb = TB_NIL; vcore->lpcr = kvm->arch.lpcr; vcore->first_vcpuid = core * threads_per_subcore; @@ -2531,10 +2531,9 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) { struct kvm_vcpu *vcpu; int do_sleep = 1; + DECLARE_SWAITQUEUE(wait); - DEFINE_WAIT(wait); - - prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE); + prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); /* * Check one last time for pending exceptions and ceded state after @@ -2548,7 +2547,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) } if (!do_sleep) { - finish_wait(&vc->wq, &wait); + finish_swait(&vc->wq, &wait); return; } @@ -2556,7 +2555,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) trace_kvmppc_vcore_blocked(vc, 0); spin_unlock(&vc->lock); schedule(); - finish_wait(&vc->wq, &wait); + finish_swait(&vc->wq, &wait); spin_lock(&vc->lock); vc->vcore_state = VCORE_INACTIVE; trace_kvmppc_vcore_blocked(vc, 1); @@ -2612,7 +2611,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvmppc_start_thread(vcpu, vc); trace_kvm_guest_enter(vcpu); } else if (vc->vcore_state == VCORE_SLEEPING) { - wake_up(&vc->wq); + swake_up(&vc->wq); } } diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 8959ebb..b0c8ad0 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -467,7 +467,7 @@ struct kvm_s390_irq_payload { struct kvm_s390_local_interrupt { spinlock_t lock; struct kvm_s390_float_interrupt *float_int; - wait_queue_head_t *wq; + struct swait_queue_head *wq; atomic_t *cpuflags; DECLARE_BITMAP(sigp_emerg_pending, KVM_MAX_VCPUS); struct kvm_s390_irq_payload irq; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index f88ca72..9ffc732 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -966,13 +966,13 @@ no_timer: void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) { - if (waitqueue_active(&vcpu->wq)) { + if (swait_active(&vcpu->wq)) { /* * The vcpu gave up the cpu voluntarily, mark it as a good * yield-candidate. 
*/ vcpu->preempted = true; - wake_up_interruptible(&vcpu->wq); + swake_up(&vcpu->wq); vcpu->stat.halt_wakeup++; } } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 36591fa..3a045f3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1195,7 +1195,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic) static void apic_timer_expired(struct kvm_lapic *apic) { struct kvm_vcpu *vcpu = apic->vcpu; - wait_queue_head_t *q = &vcpu->wq; + struct swait_queue_head *q = &vcpu->wq; struct kvm_timer *ktimer = &apic->lapic_timer; if (atomic_read(&apic->lapic_timer.pending)) @@ -1204,8 +1204,8 @@ static void apic_timer_expired(struct kvm_lapic *apic) atomic_inc(&apic->lapic_timer.pending); kvm_set_pending_timer(vcpu); - if (waitqueue_active(q)) - wake_up_interruptible(q); + if (swait_active(q)) + swake_up(q); if (apic_lvtt_tscdeadline(apic)) ktimer->expired_tscdeadline = ktimer->tscdeadline; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 861f690..5276fe0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -218,7 +219,7 @@ struct kvm_vcpu { int fpu_active; int guest_fpu_loaded, guest_xcr0_loaded; unsigned char fpu_counter; - wait_queue_head_t wq; + struct swait_queue_head wq; struct pid *pid; int sigset_active; sigset_t sigset; @@ -782,7 +783,7 @@ static inline bool kvm_arch_has_assigned_device(struct kvm *kvm) } #endif -static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) +static inline struct swait_queue_head *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu) { #ifdef __KVM_HAVE_ARCH_WQP return vcpu->arch.wqp; diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 3531599..73c1a2a 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -97,8 +97,8 @@ static void async_pf_execute(struct work_struct *work) * This memory barrier pairs with prepare_to_wait's set_current_state() */ smp_mb(); - if (waitqueue_active(&vcpu->wq)) - wake_up_interruptible(&vcpu->wq); + if (swait_active(&vcpu->wq)) + swake_up(&vcpu->wq); mmput(mm); kvm_put_kvm(vcpu->kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a11cfd2..f8417d0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -216,8 +216,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->kvm = kvm; vcpu->vcpu_id = id; vcpu->pid = NULL; - vcpu->halt_poll_ns = 0; - init_waitqueue_head(&vcpu->wq); + init_swait_queue_head(&vcpu->wq); kvm_async_pf_vcpu_init(vcpu); vcpu->pre_pcpu = -1; @@ -1990,7 +1989,7 @@ static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) void kvm_vcpu_block(struct kvm_vcpu *vcpu) { ktime_t start, cur; - DEFINE_WAIT(wait); + DECLARE_SWAITQUEUE(wait); bool waited = false; u64 block_ns; @@ -2015,7 +2014,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) kvm_arch_vcpu_blocking(vcpu); for (;;) { - prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); + prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); if (kvm_vcpu_check_block(vcpu) < 0) break; @@ -2024,7 +2023,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) schedule(); } - finish_wait(&vcpu->wq, &wait); + finish_swait(&vcpu->wq, &wait); cur = ktime_get(); kvm_arch_vcpu_unblocking(vcpu); @@ -2056,11 +2055,11 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) { int me; int cpu = vcpu->cpu; - wait_queue_head_t *wqp; + struct swait_queue_head *wqp; wqp = kvm_arch_vcpu_wq(vcpu); - if (waitqueue_active(wqp)) { - wake_up_interruptible(wqp); + if (swait_active(wqp)) { + swake_up(wqp); 
++vcpu->stat.halt_wakeup; } @@ -2161,7 +2160,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) continue; if (vcpu == me) continue; - if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) + if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) continue; if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) continue; -- cgit v0.10.2 From 065bb78c5b09df54d1c32e03227deb777ddff57b Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Fri, 19 Feb 2016 09:46:40 +0100 Subject: rcu: Do not call rcu_nocb_gp_cleanup() while holding rnp->lock rcu_nocb_gp_cleanup() is called while holding rnp->lock. Currently, this is okay because the wake_up_all() in rcu_nocb_gp_cleanup() will not enable the IRQs. lockdep is happy. By switching over using swait this is not true anymore. swake_up_all() enables the IRQs while processing the waiters. __do_softirq() can now run and will eventually call rcu_process_callbacks() which wants to grap nrp->lock. Let's move the rcu_nocb_gp_cleanup() call outside the lock before we switch over to swait. If we would hold the rnp->lock and use swait, lockdep reports following: ================================= [ INFO: inconsistent lock state ] 4.2.0-rc5-00025-g9a73ba0 #136 Not tainted --------------------------------- inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. rcu_preempt/8 [HC0[0]:SC0[0]:HE1:SE1] takes: (rcu_node_1){+.?...}, at: [] rcu_gp_kthread+0xb97/0xeb0 {IN-SOFTIRQ-W} state was registered at: [] __lock_acquire+0xd5f/0x21e0 [] lock_acquire+0xdf/0x2b0 [] _raw_spin_lock_irqsave+0x59/0xa0 [] rcu_process_callbacks+0x141/0x3c0 [] __do_softirq+0x14d/0x670 [] irq_exit+0x104/0x110 [] smp_apic_timer_interrupt+0x46/0x60 [] apic_timer_interrupt+0x70/0x80 [] rq_attach_root+0xa6/0x100 [] cpu_attach_domain+0x16d/0x650 [] build_sched_domains+0x942/0xb00 [] sched_init_smp+0x509/0x5c1 [] kernel_init_freeable+0x172/0x28f [] kernel_init+0xe/0xe0 [] ret_from_fork+0x3f/0x70 irq event stamp: 76 hardirqs last enabled at (75): [] _raw_spin_unlock_irq+0x30/0x60 hardirqs last disabled at (76): [] _raw_spin_lock_irq+0x1f/0x90 softirqs last enabled at (0): [] copy_process.part.26+0x602/0x1cf0 softirqs last disabled at (0): [< (null)>] (null) other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(rcu_node_1); lock(rcu_node_1); *** DEADLOCK *** 1 lock held by rcu_preempt/8: #0: (rcu_node_1){+.?...}, at: [] rcu_gp_kthread+0xb97/0xeb0 stack backtrace: CPU: 0 PID: 8 Comm: rcu_preempt Not tainted 4.2.0-rc5-00025-g9a73ba0 #136 Hardware name: Dell Inc. PowerEdge R820/066N7P, BIOS 2.0.20 01/16/2014 0000000000000000 000000006d7e67d8 ffff881fb081fbd8 ffffffff818379e0 0000000000000000 ffff881fb0812a00 ffff881fb081fc38 ffffffff8110813b 0000000000000000 0000000000000001 ffff881f00000001 ffffffff8102fa4f Call Trace: [] dump_stack+0x4f/0x7b [] print_usage_bug+0x1db/0x1e0 [] ? save_stack_trace+0x2f/0x50 [] mark_lock+0x66d/0x6e0 [] ? check_usage_forwards+0x150/0x150 [] mark_held_locks+0x78/0xa0 [] ? _raw_spin_unlock_irq+0x30/0x60 [] trace_hardirqs_on_caller+0x168/0x220 [] trace_hardirqs_on+0xd/0x10 [] _raw_spin_unlock_irq+0x30/0x60 [] swake_up_all+0xb7/0xe0 [] rcu_gp_kthread+0xab1/0xeb0 [] ? trace_hardirqs_on_caller+0xff/0x220 [] ? _raw_spin_unlock_irq+0x41/0x60 [] ? rcu_barrier+0x20/0x20 [] kthread+0x104/0x120 [] ? _raw_spin_unlock_irq+0x30/0x60 [] ? kthread_create_on_node+0x260/0x260 [] ret_from_fork+0x3f/0x70 [] ? 
kthread_create_on_node+0x260/0x260 Signed-off-by: Daniel Wagner Acked-by: Peter Zijlstra (Intel) Cc: linux-rt-users@vger.kernel.org Cc: Boqun Feng Cc: Marcelo Tosatti Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Paolo Bonzini Cc: "Paul E. McKenney" Link: http://lkml.kernel.org/r/1455871601-27484-5-git-send-email-wagi@monom.org Signed-off-by: Thomas Gleixner diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e41dd41..8e8c6ec 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1614,7 +1614,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) int needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - rcu_nocb_gp_cleanup(rsp, rnp); rnp->need_future_gp[c & 0x1] = 0; needmore = rnp->need_future_gp[(c + 1) & 0x1]; trace_rcu_future_gp(rnp, rdp, c, @@ -2010,6 +2009,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + wait_queue_head_t *sq; WRITE_ONCE(rsp->gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); @@ -2046,7 +2046,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); + sq = rcu_nocb_gp_get(rnp); raw_spin_unlock_irq(&rnp->lock); + rcu_nocb_gp_cleanup(sq); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); rcu_gp_slow(rsp, gp_cleanup_delay); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 83360b4..10dedfb 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -621,7 +621,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); +static wait_queue_head_t *rcu_nocb_gp_get(struct rcu_node *rnp); +static void rcu_nocb_gp_cleanup(wait_queue_head_t *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 9467a8b..631aff6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1811,9 +1811,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll); * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended * grace period. 
*/ -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_nocb_gp_cleanup(wait_queue_head_t *sq) { - wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); + wake_up_all(sq); } /* @@ -1829,6 +1829,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; } +static wait_queue_head_t *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return &rnp->nocb_gp_wq[rnp->completed & 0x1]; +} + static void rcu_init_one_nocb(struct rcu_node *rnp) { init_waitqueue_head(&rnp->nocb_gp_wq[0]); @@ -2502,7 +2507,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) return false; } -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_nocb_gp_cleanup(wait_queue_head_t *sq) { } @@ -2510,6 +2515,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) { } +static wait_queue_head_t *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return NULL; +} + static void rcu_init_one_nocb(struct rcu_node *rnp) { } -- cgit v0.10.2 From abedf8e2419fb873d919dd74de2e84b510259339 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 19 Feb 2016 09:46:41 +0100 Subject: rcu: Use simple wait queues where possible in rcutree As of commit dae6e64d2bcfd ("rcu: Introduce proper blocking to no-CBs kthreads GP waits") the RCU subsystem started making use of wait queues. Here we convert all additions of RCU wait queues to use simple wait queues, since they don't need the extra overhead of the full wait queue features. Originally this was done for RT kernels[1], since we would get things like... BUG: sleeping function called from invalid context at kernel/rtmutex.c:659 in_atomic(): 1, irqs_disabled(): 1, pid: 8, name: rcu_preempt Pid: 8, comm: rcu_preempt Not tainted Call Trace: [] __might_sleep+0xd0/0xf0 [] rt_spin_lock+0x24/0x50 [] __wake_up+0x36/0x70 [] rcu_gp_kthread+0x4d2/0x680 [] ? __init_waitqueue_head+0x50/0x50 [] ? rcu_gp_fqs+0x80/0x80 [] kthread+0xdb/0xe0 [] ? finish_task_switch+0x52/0x100 [] kernel_thread_helper+0x4/0x10 [] ? __init_kthread_worker+0x60/0x60 [] ? gs_change+0xb/0xb ...and hence simple wait queues were deployed on RT out of necessity (as simple wait uses a raw lock), but mainline might as well take advantage of the more streamline support as well. [1] This is a carry forward of work from v3.10-rt; the original conversion was by Thomas on an earlier -rt version, and Sebastian extended it to additional post-3.10 added RCU waiters; here I've added a commit log and unified the RCU changes into one, and uprev'd it to match mainline RCU. Signed-off-by: Daniel Wagner Acked-by: Peter Zijlstra (Intel) Cc: linux-rt-users@vger.kernel.org Cc: Boqun Feng Cc: Marcelo Tosatti Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Paolo Bonzini Cc: "Paul E. 
McKenney" Link: http://lkml.kernel.org/r/1455871601-27484-6-git-send-email-wagi@monom.org Signed-off-by: Thomas Gleixner diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e8c6ec..9fd5b62 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1634,7 +1634,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) !READ_ONCE(rsp->gp_flags) || !rsp->gp_kthread) return; - wake_up(&rsp->gp_wq); + swake_up(&rsp->gp_wq); } /* @@ -2009,7 +2009,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); - wait_queue_head_t *sq; + struct swait_queue_head *sq; WRITE_ONCE(rsp->gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); @@ -2094,7 +2094,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; - wait_event_interruptible(rsp->gp_wq, + swait_event_interruptible(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); rsp->gp_state = RCU_GP_DONE_GPS; @@ -2124,7 +2124,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; - ret = wait_event_interruptible_timeout(rsp->gp_wq, + ret = swait_event_interruptible_timeout(rsp->gp_wq, rcu_gp_fqs_check_wake(rsp, &gf), j); rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ @@ -2248,7 +2248,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); - rcu_gp_kthread_wake(rsp); + swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ } /* @@ -2902,7 +2902,7 @@ static void force_quiescent_state(struct rcu_state *rsp) } WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); raw_spin_unlock_irqrestore(&rnp_old->lock, flags); - rcu_gp_kthread_wake(rsp); + swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ } /* @@ -3531,7 +3531,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, raw_spin_unlock_irqrestore(&rnp->lock, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ - wake_up(&rsp->expedited_wq); + swake_up(&rsp->expedited_wq); } break; } @@ -3782,7 +3782,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) jiffies_start = jiffies; for (;;) { - ret = wait_event_interruptible_timeout( + ret = swait_event_timeout( rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root), jiffies_stall); @@ -3790,7 +3790,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) return; if (ret < 0) { /* Hit a signal, disable CPU stall warnings. */ - wait_event(rsp->expedited_wq, + swait_event(rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root)); return; } @@ -4484,8 +4484,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) } } - init_waitqueue_head(&rsp->gp_wq); - init_waitqueue_head(&rsp->expedited_wq); + init_swait_queue_head(&rsp->gp_wq); + init_swait_queue_head(&rsp->expedited_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 10dedfb..bbd235d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -27,6 +27,7 @@ #include #include #include +#include #include /* @@ -243,7 +244,7 @@ struct rcu_node { /* Refused to boost: not sure why, though. */ /* This can happen due to race conditions. 
*/ #ifdef CONFIG_RCU_NOCB_CPU - wait_queue_head_t nocb_gp_wq[2]; + struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ int need_future_gp[2]; @@ -399,7 +400,7 @@ struct rcu_data { atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ struct rcu_head **nocb_follower_tail; - wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ + struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ @@ -478,7 +479,7 @@ struct rcu_state { unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ struct task_struct *gp_kthread; /* Task for grace periods. */ - wait_queue_head_t gp_wq; /* Where GP task waits. */ + struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ short gp_state; /* GP kthread sleep state. */ @@ -506,7 +507,7 @@ struct rcu_state { unsigned long expedited_sequence; /* Take a ticket. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ - wait_queue_head_t expedited_wq; /* Wait for check-ins. */ + struct swait_queue_head expedited_wq; /* Wait for check-ins. */ int ncpus_snap; /* # CPUs seen last time. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ @@ -621,8 +622,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); -static wait_queue_head_t *rcu_nocb_gp_get(struct rcu_node *rnp); -static void rcu_nocb_gp_cleanup(wait_queue_head_t *sq); +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 631aff6..080bd20 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1811,9 +1811,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll); * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended * grace period. */ -static void rcu_nocb_gp_cleanup(wait_queue_head_t *sq) +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { - wake_up_all(sq); + swake_up_all(sq); } /* @@ -1829,15 +1829,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; } -static wait_queue_head_t *rcu_nocb_gp_get(struct rcu_node *rnp) +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { return &rnp->nocb_gp_wq[rnp->completed & 0x1]; } static void rcu_init_one_nocb(struct rcu_node *rnp) { - init_waitqueue_head(&rnp->nocb_gp_wq[0]); - init_waitqueue_head(&rnp->nocb_gp_wq[1]); + init_swait_queue_head(&rnp->nocb_gp_wq[0]); + init_swait_queue_head(&rnp->nocb_gp_wq[1]); } #ifndef CONFIG_RCU_NOCB_CPU_ALL @@ -1862,7 +1862,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. 
*/ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); - wake_up(&rdp_leader->nocb_wq); + swake_up(&rdp_leader->nocb_wq); } } @@ -2074,7 +2074,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) */ trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { - wait_event_interruptible( + swait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); if (likely(d)) @@ -2102,7 +2102,7 @@ wait_again: /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); - wait_event_interruptible(my_rdp->nocb_wq, + swait_event_interruptible(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); /* Memory barrier handled by smp_mb() calls below and repoll. */ } else if (firsttime) { @@ -2177,7 +2177,7 @@ wait_again: * List was empty, wake up the follower. * Memory barriers supplied by atomic_long_add(). */ - wake_up(&rdp->nocb_wq); + swake_up(&rdp->nocb_wq); } } @@ -2198,7 +2198,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) if (!rcu_nocb_poll) { trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "FollowerSleep"); - wait_event_interruptible(rdp->nocb_wq, + swait_event_interruptible(rdp->nocb_wq, READ_ONCE(rdp->nocb_follower_head)); } else if (firsttime) { /* Don't drown trace log with "Poll"! */ @@ -2357,7 +2357,7 @@ void __init rcu_init_nohz(void) static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { rdp->nocb_tail = &rdp->nocb_head; - init_waitqueue_head(&rdp->nocb_wq); + init_swait_queue_head(&rdp->nocb_wq); rdp->nocb_follower_tail = &rdp->nocb_follower_head; } @@ -2507,7 +2507,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) return false; } -static void rcu_nocb_gp_cleanup(wait_queue_head_t *sq) +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { } @@ -2515,7 +2515,7 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) { } -static wait_queue_head_t *rcu_nocb_gp_get(struct rcu_node *rnp) +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) { return NULL; } -- cgit v0.10.2 From 157078f64b8a9cd7011b6b900b2f2498df850748 Mon Sep 17 00:00:00 2001 From: Thomas Betker Date: Tue, 10 Nov 2015 22:18:15 +0100 Subject: Revert "jffs2: Fix lock acquisition order bug in jffs2_write_begin" This reverts commit 5ffd3412ae55 ("jffs2: Fix lock acquisition order bug in jffs2_write_begin"). The commit modified jffs2_write_begin() to remove a deadlock with jffs2_garbage_collect_live(), but this introduced new deadlocks found by multiple users. page_lock() actually has to be called before mutex_lock(&c->alloc_sem) or mutex_lock(&f->sem) because jffs2_write_end() and jffs2_readpage() are called with the page locked, and they acquire c->alloc_sem and f->sem, resp. In other words, the lock order in jffs2_write_begin() was correct, and it is the jffs2_garbage_collect_live() path that has to be changed. Revert the commit to get rid of the new deadlocks, and to clear the way for a better fix of the original deadlock. 
Reported-by: Deng Chao Reported-by: Ming Liu Reported-by: wangzaiwei Signed-off-by: Thomas Betker Signed-off-by: David Woodhouse Cc: stable@vger.kernel.org diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index c5ac594..cad86ba 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -137,39 +137,33 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, struct page *pg; struct inode *inode = mapping->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); - struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); - struct jffs2_raw_inode ri; - uint32_t alloc_len = 0; pgoff_t index = pos >> PAGE_CACHE_SHIFT; uint32_t pageofs = index << PAGE_CACHE_SHIFT; int ret = 0; - jffs2_dbg(1, "%s()\n", __func__); - - if (pageofs > inode->i_size) { - ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, - ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); - if (ret) - return ret; - } - - mutex_lock(&f->sem); pg = grab_cache_page_write_begin(mapping, index, flags); - if (!pg) { - if (alloc_len) - jffs2_complete_reservation(c); - mutex_unlock(&f->sem); + if (!pg) return -ENOMEM; - } *pagep = pg; - if (alloc_len) { + jffs2_dbg(1, "%s()\n", __func__); + + if (pageofs > inode->i_size) { /* Make new hole frag from old EOF to new page */ + struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); + struct jffs2_raw_inode ri; struct jffs2_full_dnode *fn; + uint32_t alloc_len; jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n", (unsigned int)inode->i_size, pageofs); + ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, + ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); + if (ret) + goto out_page; + + mutex_lock(&f->sem); memset(&ri, 0, sizeof(ri)); ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); @@ -196,6 +190,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, if (IS_ERR(fn)) { ret = PTR_ERR(fn); jffs2_complete_reservation(c); + mutex_unlock(&f->sem); goto out_page; } ret = jffs2_add_full_dnode_to_inode(c, f, fn); @@ -210,10 +205,12 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, jffs2_mark_node_obsolete(c, fn->raw); jffs2_free_full_dnode(fn); jffs2_complete_reservation(c); + mutex_unlock(&f->sem); goto out_page; } jffs2_complete_reservation(c); inode->i_size = pageofs; + mutex_unlock(&f->sem); } /* @@ -222,18 +219,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, * case of a short-copy. */ if (!PageUptodate(pg)) { + mutex_lock(&f->sem); ret = jffs2_do_readpage_nolock(inode, pg); + mutex_unlock(&f->sem); if (ret) goto out_page; } - mutex_unlock(&f->sem); jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags); return ret; out_page: unlock_page(pg); page_cache_release(pg); - mutex_unlock(&f->sem); return ret; } -- cgit v0.10.2 From 49e91e7079febe59a20ca885a87dd1c54240d0f1 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 1 Feb 2016 12:37:20 +0000 Subject: jffs2: Fix page lock / f->sem deadlock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With this fix, all code paths should now be obtaining the page lock before f->sem. 
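To make the ordering rule concrete, here is a minimal userspace analogue of the two paths involved, with pthread mutexes standing in for the page lock and f->sem (an illustration of the locking discipline only, not jffs2 code): the write path takes the page lock first and only then the inode mutex, while the GC path, which arrives already holding the inode mutex, drops it around the page-cache access so that both paths acquire the two locks in the same global order.

#include <pthread.h>
#include <stdio.h>

/* Illustrative stand-ins for the page lock and f->sem. */
static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t inode_sem = PTHREAD_MUTEX_INITIALIZER;

/* write_begin-style path: page lock strictly before the inode mutex. */
static void write_path(void)
{
	pthread_mutex_lock(&page_lock);
	pthread_mutex_lock(&inode_sem);
	/* ... reserve space, fill the page ... */
	pthread_mutex_unlock(&inode_sem);
	pthread_mutex_unlock(&page_lock);
}

/* GC-style path: entered holding the inode mutex.  To touch the page it
 * drops the inode mutex first, so the page lock is never taken on top
 * of it and no AB-BA ordering exists against write_path(). */
static void gc_path(void)
{
	pthread_mutex_lock(&inode_sem);		/* state on entry */
	pthread_mutex_unlock(&inode_sem);	/* drop before taking the page lock */
	pthread_mutex_lock(&page_lock);
	pthread_mutex_lock(&inode_sem);		/* re-acquire in the agreed order */
	/* ... read the page contents ... */
	pthread_mutex_unlock(&inode_sem);
	pthread_mutex_unlock(&page_lock);
}

int main(void)
{
	write_path();
	gc_path();
	puts("both paths take page_lock before inode_sem");
	return 0;
}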
Reported-by: Szabó Tamás Tested-by: Thomas Betker Signed-off-by: David Woodhouse Cc: stable@vger.kernel.org diff --git a/fs/jffs2/README.Locking b/fs/jffs2/README.Locking index 3ea3655..8918ac9 100644 --- a/fs/jffs2/README.Locking +++ b/fs/jffs2/README.Locking @@ -2,10 +2,6 @@ JFFS2 LOCKING DOCUMENTATION --------------------------- -At least theoretically, JFFS2 does not require the Big Kernel Lock -(BKL), which was always helpfully obtained for it by Linux 2.4 VFS -code. It has its own locking, as described below. - This document attempts to describe the existing locking rules for JFFS2. It is not expected to remain perfectly up to date, but ought to be fairly close. @@ -69,6 +65,7 @@ Ordering constraints: any f->sem held. 2. Never attempt to lock two file mutexes in one thread. No ordering rules have been made for doing so. + 3. Never lock a page cache page with f->sem held. erase_completion_lock spinlock diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index 5a2dec2..95d5880 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -1296,14 +1296,17 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era BUG_ON(start > orig_start); } - /* First, use readpage() to read the appropriate page into the page cache */ - /* Q: What happens if we actually try to GC the _same_ page for which commit_write() - * triggered garbage collection in the first place? - * A: I _think_ it's OK. read_cache_page shouldn't deadlock, we'll write out the - * page OK. We'll actually write it out again in commit_write, which is a little - * suboptimal, but at least we're correct. - */ + /* The rules state that we must obtain the page lock *before* f->sem, so + * drop f->sem temporarily. Since we also hold c->alloc_sem, nothing's + * actually going to *change* so we're safe; we only allow reading. + * + * It is important to note that jffs2_write_begin() will ensure that its + * page is marked Uptodate before allocating space. That means that if we + * end up here trying to GC the *same* page that jffs2_write_begin() is + * trying to write out, read_cache_page() will not deadlock. */ + mutex_unlock(&f->sem); pg_ptr = jffs2_gc_fetch_page(c, f, start, &pg); + mutex_lock(&f->sem); if (IS_ERR(pg_ptr)) { pr_warn("read_cache_page() returned error: %ld\n", -- cgit v0.10.2 From be629c62a603e5935f8177fd8a19e014100a259e Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 1 Feb 2016 14:04:46 +0000 Subject: Fix directory hardlinks from deleted directories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a directory is deleted, we don't take too much care about killing off all the dirents that belong to it — on the basis that on remount, the scan will conclude that the directory is dead anyway. This doesn't work though, when the deleted directory contained a child directory which was moved *out*. In the early stages of the fs build we can then end up with an apparent hard link, with the child directory appearing both in its true location, and as a child of the original directory which are this stage of the mount process we don't *yet* know is defunct. To resolve this, take out the early special-casing of the "directories shall not have hard links" rule in jffs2_build_inode_pass1(), and let the normal nlink processing happen for directories as well as other inodes. Then later in the build process we can set ic->pino_nlink to the parent inode#, as is required for directories during normal operaton, instead of the nlink. 
And complain only *then* about hard links which are still in evidence even after killing off all the unreachable paths. Reported-by: Liu Song Signed-off-by: David Woodhouse Cc: stable@vger.kernel.org diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index 0ae91ad..b288c8a 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -50,7 +50,8 @@ next_inode(int *i, struct jffs2_inode_cache *ic, struct jffs2_sb_info *c) static void jffs2_build_inode_pass1(struct jffs2_sb_info *c, - struct jffs2_inode_cache *ic) + struct jffs2_inode_cache *ic, + int *dir_hardlinks) { struct jffs2_full_dirent *fd; @@ -69,19 +70,21 @@ static void jffs2_build_inode_pass1(struct jffs2_sb_info *c, dbg_fsbuild("child \"%s\" (ino #%u) of dir ino #%u doesn't exist!\n", fd->name, fd->ino, ic->ino); jffs2_mark_node_obsolete(c, fd->raw); + /* Clear the ic/raw union so it doesn't cause problems later. */ + fd->ic = NULL; continue; } + /* From this point, fd->raw is no longer used so we can set fd->ic */ + fd->ic = child_ic; + child_ic->pino_nlink++; + /* If we appear (at this stage) to have hard-linked directories, + * set a flag to trigger a scan later */ if (fd->type == DT_DIR) { - if (child_ic->pino_nlink) { - JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u appears to be a hard link\n", - fd->name, fd->ino, ic->ino); - /* TODO: What do we do about it? */ - } else { - child_ic->pino_nlink = ic->ino; - } - } else - child_ic->pino_nlink++; + child_ic->flags |= INO_FLAGS_IS_DIR; + if (child_ic->pino_nlink > 1) + *dir_hardlinks = 1; + } dbg_fsbuild("increased nlink for child \"%s\" (ino #%u)\n", fd->name, fd->ino); /* Can't free scan_dents so far. We might need them in pass 2 */ @@ -95,8 +98,7 @@ static void jffs2_build_inode_pass1(struct jffs2_sb_info *c, */ static int jffs2_build_filesystem(struct jffs2_sb_info *c) { - int ret; - int i; + int ret, i, dir_hardlinks = 0; struct jffs2_inode_cache *ic; struct jffs2_full_dirent *fd; struct jffs2_full_dirent *dead_fds = NULL; @@ -120,7 +122,7 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c) /* Now scan the directory tree, increasing nlink according to every dirent found. */ for_each_inode(i, c, ic) { if (ic->scan_dents) { - jffs2_build_inode_pass1(c, ic); + jffs2_build_inode_pass1(c, ic, &dir_hardlinks); cond_resched(); } } @@ -156,6 +158,20 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c) } dbg_fsbuild("pass 2a complete\n"); + + if (dir_hardlinks) { + /* If we detected directory hardlinks earlier, *hopefully* + * they are gone now because some of the links were from + * dead directories which still had some old dirents lying + * around and not yet garbage-collected, but which have + * been discarded above. So clear the pino_nlink field + * in each directory, so that the final scan below can + * print appropriate warnings. */ + for_each_inode(i, c, ic) { + if (ic->flags & INO_FLAGS_IS_DIR) + ic->pino_nlink = 0; + } + } dbg_fsbuild("freeing temporary data structures\n"); /* Finally, we can scan again and free the dirent structs */ @@ -163,6 +179,33 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c) while(ic->scan_dents) { fd = ic->scan_dents; ic->scan_dents = fd->next; + /* We do use the pino_nlink field to count nlink of + * directories during fs build, so set it to the + * parent ino# now. Now that there's hopefully only + * one. */ + if (fd->type == DT_DIR) { + if (!fd->ic) { + /* We'll have complained about it and marked the coresponding + raw node obsolete already. Just skip it. 
*/ + continue; + } + + /* We *have* to have set this in jffs2_build_inode_pass1() */ + BUG_ON(!(fd->ic->flags & INO_FLAGS_IS_DIR)); + + /* We clear ic->pino_nlink ∀ directories' ic *only* if dir_hardlinks + * is set. Otherwise, we know this should never trigger anyway, so + * we don't do the check. And ic->pino_nlink still contains the nlink + * value (which is 1). */ + if (dir_hardlinks && fd->ic->pino_nlink) { + JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u is also hard linked from dir ino #%u\n", + fd->name, fd->ino, ic->ino, fd->ic->pino_nlink); + /* Should we unlink it from its previous parent? */ + } + + /* For directories, ic->pino_nlink holds that parent inode # */ + fd->ic->pino_nlink = ic->ino; + } jffs2_free_full_dirent(fd); } ic->scan_dents = NULL; @@ -241,11 +284,7 @@ static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *c, /* Reduce nlink of the child. If it's now zero, stick it on the dead_fds list to be cleaned up later. Else just free the fd */ - - if (fd->type == DT_DIR) - child_ic->pino_nlink = 0; - else - child_ic->pino_nlink--; + child_ic->pino_nlink--; if (!child_ic->pino_nlink) { dbg_fsbuild("inode #%u (\"%s\") now has no links; adding to dead_fds list.\n", diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index fa35ff7..0637271 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h @@ -194,6 +194,7 @@ struct jffs2_inode_cache { #define INO_STATE_CLEARING 6 /* In clear_inode() */ #define INO_FLAGS_XATTR_CHECKED 0x01 /* has no duplicate xattr_ref */ +#define INO_FLAGS_IS_DIR 0x02 /* is a directory */ #define RAWNODE_CLASS_INODE_CACHE 0 #define RAWNODE_CLASS_XATTR_DATUM 1 @@ -249,7 +250,10 @@ struct jffs2_readinode_info struct jffs2_full_dirent { - struct jffs2_raw_node_ref *raw; + union { + struct jffs2_raw_node_ref *raw; + struct jffs2_inode_cache *ic; /* Just during part of build */ + }; struct jffs2_full_dirent *next; uint32_t version; uint32_t ino; /* == zero for unlink */ -- cgit v0.10.2 From 5104ffb229c357d9672344126040721e5dc4cc7b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 25 Feb 2016 10:14:50 -0300 Subject: perf tools: Use asprintf() for simple string formatting/allocation No need to use strbuf there, its just a simple alloc+formatting, which asprintf does just fine. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-6q6cxfhk8c8ypg3tfpo0i2iy@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/perf.c b/tools/perf/perf.c index 144047c..f632119 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -454,11 +454,12 @@ static void handle_internal_command(int argc, const char **argv) static void execv_dashed_external(const char **argv) { - struct strbuf cmd = STRBUF_INIT; + char *cmd; const char *tmp; int status; - strbuf_addf(&cmd, "perf-%s", argv[0]); + if (asprintf(&cmd, "perf-%s", argv[0]) < 0) + goto do_die; /* * argv[0] must be the perf command, but the argv array @@ -467,7 +468,7 @@ static void execv_dashed_external(const char **argv) * restore it on error. 
*/ tmp = argv[0]; - argv[0] = cmd.buf; + argv[0] = cmd; /* * if we fail because the command is not found, it is @@ -475,15 +476,16 @@ static void execv_dashed_external(const char **argv) */ status = run_command_v_opt(argv, 0); if (status != -ERR_RUN_COMMAND_EXEC) { - if (IS_RUN_COMMAND_ERR(status)) + if (IS_RUN_COMMAND_ERR(status)) { +do_die: die("unable to run '%s'", argv[0]); + } exit(-status); } errno = ENOENT; /* as if we called execvp */ argv[0] = tmp; - - strbuf_release(&cmd); + zfree(&cmd); } static int run_argv(int *argcp, const char ***argv) -- cgit v0.10.2 From e392d603f61767cb2de4d82bb55a035918f8342c Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 1 Feb 2016 12:00:48 +0000 Subject: clocksource/drivers/arm_arch_timer: Enable and verify MMIO access So far, we have been blindly assuming that having access to a memory-mapped timer frame implies that the individual elements of that frame frame are already enabled. Whilst it's the firmware's job to give us non-secure access to frames in the first place, we should not rely on implementations always being generous enough to also configure CNTACR for those non-secure frames (e.g. [1]). Explicitly enable feature-level access per-frame, and verify that the access we want is really implemented before trying to make use of it. [1]:https://github.com/ARM-software/tf-issues/issues/170 Acked-by: Mark Rutland Reviewed-by: Stephen Boyd Tested-by: Stephen Boyd Acked-by: Marc Zyngier Signed-off-by: Robin Murphy Signed-off-by: Daniel Lezcano diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index c64d543..7c567f0 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -32,6 +32,14 @@ #define CNTTIDR 0x08 #define CNTTIDR_VIRT(n) (BIT(1) << ((n) * 4)) +#define CNTACR(n) (0x40 + ((n) * 4)) +#define CNTACR_RPCT BIT(0) +#define CNTACR_RVCT BIT(1) +#define CNTACR_RFRQ BIT(2) +#define CNTACR_RVOFF BIT(3) +#define CNTACR_RWVT BIT(4) +#define CNTACR_RWPT BIT(5) + #define CNTVCT_LO 0x08 #define CNTVCT_HI 0x0c #define CNTFRQ 0x10 @@ -757,7 +765,6 @@ static void __init arch_timer_mem_init(struct device_node *np) } cnttidr = readl_relaxed(cntctlbase + CNTTIDR); - iounmap(cntctlbase); /* * Try to find a virtual capable frame. 
Otherwise fall back to a @@ -765,20 +772,31 @@ static void __init arch_timer_mem_init(struct device_node *np) */ for_each_available_child_of_node(np, frame) { int n; + u32 cntacr; if (of_property_read_u32(frame, "frame-number", &n)) { pr_err("arch_timer: Missing frame-number\n"); - of_node_put(best_frame); of_node_put(frame); - return; + goto out; } - if (cnttidr & CNTTIDR_VIRT(n)) { + /* Try enabling everything, and see what sticks */ + cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT | + CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT; + writel_relaxed(cntacr, cntctlbase + CNTACR(n)); + cntacr = readl_relaxed(cntctlbase + CNTACR(n)); + + if ((cnttidr & CNTTIDR_VIRT(n)) && + !(~cntacr & (CNTACR_RWVT | CNTACR_RVCT))) { of_node_put(best_frame); best_frame = frame; arch_timer_mem_use_virtual = true; break; } + + if (~cntacr & (CNTACR_RWPT | CNTACR_RPCT)) + continue; + of_node_put(best_frame); best_frame = of_node_get(frame); } @@ -786,24 +804,26 @@ static void __init arch_timer_mem_init(struct device_node *np) base = arch_counter_base = of_iomap(best_frame, 0); if (!base) { pr_err("arch_timer: Can't map frame's registers\n"); - of_node_put(best_frame); - return; + goto out; } if (arch_timer_mem_use_virtual) irq = irq_of_parse_and_map(best_frame, 1); else irq = irq_of_parse_and_map(best_frame, 0); - of_node_put(best_frame); + if (!irq) { pr_err("arch_timer: Frame missing %s irq", arch_timer_mem_use_virtual ? "virt" : "phys"); - return; + goto out; } arch_timer_detect_rate(base, np); arch_timer_mem_register(base, irq); arch_timer_common_init(); +out: + iounmap(cntctlbase); + of_node_put(best_frame); } CLOCKSOURCE_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem", arch_timer_mem_init); -- cgit v0.10.2 From 522ed95c26cd03b768018e441bbc0ff656e30fe4 Mon Sep 17 00:00:00 2001 From: Shawn Lin Date: Mon, 15 Feb 2016 09:02:09 +0800 Subject: clocksource/drivers/rockchip: Add err handle for rk_timer_init Currently rockchip_timer doesn't do some basic cleanup work when failing to init the timer. Let's add err handle routine to deal with all the err cases. 
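The fix follows the usual goto-based unwind ladder: each resource that was successfully acquired gets a label, and every later failure jumps to the label that releases everything acquired so far, in reverse order. A minimal self-contained sketch of that pattern with made-up resource names (not the rockchip_timer code itself):

#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for of_iomap()/clk_prepare_enable()/request_irq(). */
static int fail_at = 3;			/* simulate the N-th step failing */

static void *acquire(const char *name, int step)
{
	if (step == fail_at)
		return NULL;
	printf("acquire %s\n", name);
	return malloc(1);
}

static void release(const char *name, void *r)
{
	printf("release %s\n", name);
	free(r);
}

static int timer_init(void)
{
	void *base, *pclk, *timer_clk, *irq;

	base = acquire("base", 0);
	if (!base)
		return -1;
	pclk = acquire("pclk", 1);
	if (!pclk)
		goto out_unmap;
	timer_clk = acquire("timer_clk", 2);
	if (!timer_clk)
		goto out_pclk;
	irq = acquire("irq", 3);
	if (!irq)
		goto out_timer_clk;
	return 0;			/* success: the driver keeps everything */

out_timer_clk:
	release("timer_clk", timer_clk);
out_pclk:
	release("pclk", pclk);
out_unmap:
	release("base", base);
	return -1;
}

int main(void)
{
	return timer_init() == 0 ? 0 : 1;
}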
Signed-off-by: Shawn Lin Signed-off-by: Daniel Lezcano diff --git a/drivers/clocksource/rockchip_timer.c b/drivers/clocksource/rockchip_timer.c index 8c77a52..b991b28 100644 --- a/drivers/clocksource/rockchip_timer.c +++ b/drivers/clocksource/rockchip_timer.c @@ -122,23 +122,23 @@ static void __init rk_timer_init(struct device_node *np) pclk = of_clk_get_by_name(np, "pclk"); if (IS_ERR(pclk)) { pr_err("Failed to get pclk for '%s'\n", TIMER_NAME); - return; + goto out_unmap; } if (clk_prepare_enable(pclk)) { pr_err("Failed to enable pclk for '%s'\n", TIMER_NAME); - return; + goto out_unmap; } timer_clk = of_clk_get_by_name(np, "timer"); if (IS_ERR(timer_clk)) { pr_err("Failed to get timer clock for '%s'\n", TIMER_NAME); - return; + goto out_timer_clk; } if (clk_prepare_enable(timer_clk)) { pr_err("Failed to enable timer clock\n"); - return; + goto out_timer_clk; } bc_timer.freq = clk_get_rate(timer_clk); @@ -146,7 +146,7 @@ static void __init rk_timer_init(struct device_node *np) irq = irq_of_parse_and_map(np, 0); if (!irq) { pr_err("Failed to map interrupts for '%s'\n", TIMER_NAME); - return; + goto out_irq; } ce->name = TIMER_NAME; @@ -164,10 +164,19 @@ static void __init rk_timer_init(struct device_node *np) ret = request_irq(irq, rk_timer_interrupt, IRQF_TIMER, TIMER_NAME, ce); if (ret) { pr_err("Failed to initialize '%s': %d\n", TIMER_NAME, ret); - return; + goto out_irq; } clockevents_config_and_register(ce, bc_timer.freq, 1, UINT_MAX); + + return; + +out_irq: + clk_disable_unprepare(timer_clk); +out_timer_clk: + clk_disable_unprepare(pclk); +out_unmap: + iounmap(bc_timer.base); } CLOCKSOURCE_OF_DECLARE(rk_timer, "rockchip,rk3288-timer", rk_timer_init); -- cgit v0.10.2 From 751db1a6eaec3d266ccfe3f0d11323b7c82486bf Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Tue, 9 Feb 2016 22:54:25 -0300 Subject: clocksource/drivers/lpc32xx: Don't use the prescaler counter for clockevents This commit switches the clockevents one-shot current implementation to avoid using the prescaler counter. The clockevents timer currently uses MR0=1, PR=ticks; and after this commit is uses MR0=ticks, PR=0. While using the prescaler with PR=1 works fine in one-shot mode, it seems it doesn't work as expected in periodic mode. By using the only match channel register (MR0) for the timer we make the periodic mode introduction easier, and consistent with one-shot mode. Signed-off-by: Ezequiel Garcia Signed-off-by: Daniel Lezcano Reviewed-by: Joachim Eastwood Tested-by: Joachim Eastwood diff --git a/drivers/clocksource/time-lpc32xx.c b/drivers/clocksource/time-lpc32xx.c index 1316876..50d1a63 100644 --- a/drivers/clocksource/time-lpc32xx.c +++ b/drivers/clocksource/time-lpc32xx.c @@ -60,14 +60,13 @@ static int lpc32xx_clkevt_next_event(unsigned long delta, container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev); /* - * Place timer in reset and program the delta in the prescale - * register (PR). When the prescale counter matches the value - * in PR the counter register is incremented and the compare - * match will trigger. After setup the timer is released from - * reset and enabled. + * Place timer in reset and program the delta in the match + * channel 0 (MR0). When the timer counter matches the value + * in MR0 register the match will trigger an interrupt. + * After setup the timer is released from reset and enabled. 
*/ writel_relaxed(LPC32XX_TIMER_TCR_CRST, ddata->base + LPC32XX_TIMER_TCR); - writel_relaxed(delta, ddata->base + LPC32XX_TIMER_PR); + writel_relaxed(delta, ddata->base + LPC32XX_TIMER_MR0); writel_relaxed(LPC32XX_TIMER_TCR_CEN, ddata->base + LPC32XX_TIMER_TCR); return 0; @@ -210,13 +209,13 @@ static int __init lpc32xx_clockevent_init(struct device_node *np) /* * Disable timer and clear any pending interrupt (IR) on match - * channel 0 (MR0). Configure a compare match value of 1 on MR0 - * and enable interrupt, reset on match and stop on match (MCR). + * channel 0 (MR0). Clear the prescaler as it's not used. + * Enable interrupt, reset on match and stop on match (MCR). */ writel_relaxed(0, base + LPC32XX_TIMER_TCR); + writel_relaxed(0, base + LPC32XX_TIMER_PR); writel_relaxed(0, base + LPC32XX_TIMER_CTCR); writel_relaxed(LPC32XX_TIMER_IR_MR0INT, base + LPC32XX_TIMER_IR); - writel_relaxed(1, base + LPC32XX_TIMER_MR0); writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R | LPC32XX_TIMER_MCR_MR0S, base + LPC32XX_TIMER_MCR); -- cgit v0.10.2 From 32f32d982f655dd191858406b11e50219a0ee01e Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Tue, 9 Feb 2016 22:54:26 -0300 Subject: clocksource/drivers/lpc32xx: Support periodic mode This commit adds the support for periodic mode. This is done by not setting the MR0S (Stop on TnMR0) bit on MCR, thus allowing interrupts to be periodically generated on MR0 matches. In order to do this, move the initial configuration that is specific to the one-shot mode to set_state_oneshot(). Signed-off-by: Ezequiel Garcia Signed-off-by: Daniel Lezcano Reviewed-by: Joachim Eastwood Tested-by: Joachim Eastwood diff --git a/drivers/clocksource/time-lpc32xx.c b/drivers/clocksource/time-lpc32xx.c index 50d1a63..5694edd 100644 --- a/drivers/clocksource/time-lpc32xx.c +++ b/drivers/clocksource/time-lpc32xx.c @@ -43,6 +43,7 @@ struct lpc32xx_clock_event_ddata { struct clock_event_device evtdev; void __iomem *base; + u32 ticks_per_jiffy; }; /* Needed for the sched clock */ @@ -85,11 +86,39 @@ static int lpc32xx_clkevt_shutdown(struct clock_event_device *evtdev) static int lpc32xx_clkevt_oneshot(struct clock_event_device *evtdev) { + struct lpc32xx_clock_event_ddata *ddata = + container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev); + /* * When using oneshot, we must also disable the timer * to wait for the first call to set_next_event(). */ - return lpc32xx_clkevt_shutdown(evtdev); + writel_relaxed(0, ddata->base + LPC32XX_TIMER_TCR); + + /* Enable interrupt, reset on match and stop on match (MCR). */ + writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R | + LPC32XX_TIMER_MCR_MR0S, ddata->base + LPC32XX_TIMER_MCR); + return 0; +} + +static int lpc32xx_clkevt_periodic(struct clock_event_device *evtdev) +{ + struct lpc32xx_clock_event_ddata *ddata = + container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev); + + /* Enable interrupt and reset on match. */ + writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R, + ddata->base + LPC32XX_TIMER_MCR); + + /* + * Place timer in reset and program the delta in the match + * channel 0 (MR0). 
+ */ + writel_relaxed(LPC32XX_TIMER_TCR_CRST, ddata->base + LPC32XX_TIMER_TCR); + writel_relaxed(ddata->ticks_per_jiffy, ddata->base + LPC32XX_TIMER_MR0); + writel_relaxed(LPC32XX_TIMER_TCR_CEN, ddata->base + LPC32XX_TIMER_TCR); + + return 0; } static irqreturn_t lpc32xx_clock_event_handler(int irq, void *dev_id) @@ -107,11 +136,13 @@ static irqreturn_t lpc32xx_clock_event_handler(int irq, void *dev_id) static struct lpc32xx_clock_event_ddata lpc32xx_clk_event_ddata = { .evtdev = { .name = "lpc3220 clockevent", - .features = CLOCK_EVT_FEAT_ONESHOT, + .features = CLOCK_EVT_FEAT_ONESHOT | + CLOCK_EVT_FEAT_PERIODIC, .rating = 300, .set_next_event = lpc32xx_clkevt_next_event, .set_state_shutdown = lpc32xx_clkevt_shutdown, .set_state_oneshot = lpc32xx_clkevt_oneshot, + .set_state_periodic = lpc32xx_clkevt_periodic, }, }; @@ -210,17 +241,15 @@ static int __init lpc32xx_clockevent_init(struct device_node *np) /* * Disable timer and clear any pending interrupt (IR) on match * channel 0 (MR0). Clear the prescaler as it's not used. - * Enable interrupt, reset on match and stop on match (MCR). */ writel_relaxed(0, base + LPC32XX_TIMER_TCR); writel_relaxed(0, base + LPC32XX_TIMER_PR); writel_relaxed(0, base + LPC32XX_TIMER_CTCR); writel_relaxed(LPC32XX_TIMER_IR_MR0INT, base + LPC32XX_TIMER_IR); - writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R | - LPC32XX_TIMER_MCR_MR0S, base + LPC32XX_TIMER_MCR); rate = clk_get_rate(clk); lpc32xx_clk_event_ddata.base = base; + lpc32xx_clk_event_ddata.ticks_per_jiffy = DIV_ROUND_CLOSEST(rate, HZ); clockevents_config_and_register(&lpc32xx_clk_event_ddata.evtdev, rate, 1, -1); -- cgit v0.10.2 From 1b18fd2023acf7679a79d94ee85d488664ef0553 Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Tue, 9 Feb 2016 22:54:27 -0300 Subject: clocksource/drivers/lpc32xx: Support timer-based ARM delay This commit implements the ARM timer-based delay timer for the LPC32xx, LPC18xx, LPC43xx family of SoCs. Also, add a dependency to restrict compiling this driver for the ARM architecture. 
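The mechanism is small: the driver describes its free-running counter with a struct delay_timer and registers it, after which the ARM delay loop (udelay()/ndelay()) is driven by the hardware counter instead of the calibrated software loop. A hedged sketch of that registration follows; the header choice and the counter_base mapping are assumptions for illustration, and only struct delay_timer, .read_current_timer, .freq and register_current_timer_delay() are taken from the patch below.

/* Sketch only, not driver code. */
#include <linux/init.h>
#include <linux/io.h>
#include <linux/delay.h>	/* assumed: pulls in the ARM struct delay_timer API */

static void __iomem *counter_base;	/* assumed: ioremap()ed free-running counter */

static unsigned long example_delay_timer_read(void)
{
	return readl_relaxed(counter_base);	/* lower 32 bits of the counter */
}

static struct delay_timer example_delay_timer = {
	.read_current_timer = example_delay_timer_read,
};

static void __init example_delay_timer_init(unsigned long rate)
{
	example_delay_timer.freq = rate;	/* counter ticks per second */
	register_current_timer_delay(&example_delay_timer);
}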
Signed-off-by: Ezequiel Garcia Signed-off-by: Daniel Lezcano Reviewed-by: Joachim Eastwood Tested-by: Joachim Eastwood diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 56777f0..c9c4db6 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -153,6 +153,7 @@ config CLKSRC_EFM32 config CLKSRC_LPC32XX bool "Clocksource for LPC32XX" if COMPILE_TEST depends on GENERIC_CLOCKEVENTS && HAS_IOMEM + depends on ARM select CLKSRC_MMIO select CLKSRC_OF help diff --git a/drivers/clocksource/time-lpc32xx.c b/drivers/clocksource/time-lpc32xx.c index 5694edd..daae61e 100644 --- a/drivers/clocksource/time-lpc32xx.c +++ b/drivers/clocksource/time-lpc32xx.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,15 @@ static u64 notrace lpc32xx_read_sched_clock(void) return readl(clocksource_timer_counter); } +static unsigned long lpc32xx_delay_timer_read(void) +{ + return readl(clocksource_timer_counter); +} + +static struct delay_timer lpc32xx_delay_timer = { + .read_current_timer = lpc32xx_delay_timer_read, +}; + static int lpc32xx_clkevt_next_event(unsigned long delta, struct clock_event_device *evtdev) { @@ -192,6 +202,8 @@ static int __init lpc32xx_clocksource_init(struct device_node *np) } clocksource_timer_counter = base + LPC32XX_TIMER_TC; + lpc32xx_delay_timer.freq = rate; + register_current_timer_delay(&lpc32xx_delay_timer); sched_clock_register(lpc32xx_read_sched_clock, 32, rate); return 0; -- cgit v0.10.2 From bbaa06702719818913ed686612d0db477b2b53b0 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Wed, 19 Aug 2015 15:43:38 +0200 Subject: clocksource/drivers/arm_global_timer: Register delay timer Provide a delay timer using the lower 32-bits of the global timer so that we can use that instead of having to calibrating delays. Signed-off-by: Rabin Vincent Signed-off-by: Daniel Lezcano diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index d189d8c..36998fa 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -221,6 +222,21 @@ static u64 notrace gt_sched_clock_read(void) } #endif +static unsigned long gt_read_long(void) +{ + return readl_relaxed(gt_base + GT_COUNTER0); +} + +static struct delay_timer gt_delay_timer = { + .read_current_timer = gt_read_long, +}; + +static void __init gt_delay_timer_init(void) +{ + gt_delay_timer.freq = gt_clk_rate; + register_current_timer_delay(>_delay_timer); +} + static void __init gt_clocksource_init(void) { writel(0, gt_base + GT_CONTROL); @@ -317,6 +333,7 @@ static void __init global_timer_of_register(struct device_node *np) /* Immediately configure the timer on the boot CPU */ gt_clocksource_init(); gt_clockevents_init(this_cpu_ptr(gt_evt)); + gt_delay_timer_init(); return; -- cgit v0.10.2 From cf8c5009ee37d25cf4da0a250f3044ec6d1144a0 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 23 Dec 2015 16:59:12 +0530 Subject: clockevents/drivers/arm_arch_timer: Implement ->set_state_oneshot_stopped() set_state_oneshot_stopped() is called by the clkevt core, when the next event is required at an expiry time of 'KTIME_MAX'. This normally happens with NO_HZ_{IDLE|FULL} in both LOWRES/HIGHRES modes. This patch makes the clockevent device to stop on such an event, to avoid spurious interrupts, as explained by: commit 8fff52fd5093 ("clockevents: Introduce CLOCK_EVT_STATE_ONESHOT_STOPPED state"). 
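For drivers that already have a shutdown callback, the wiring is typically a single extra initializer: point .set_state_oneshot_stopped at the routine that stops the timer, so the hardware is quiesced instead of being left programmed when the core requests a KTIME_MAX expiry. A sketch of the resulting clock_event_device, with placeholder callbacks rather than this driver's code:

#include <linux/clockchips.h>

static int example_shutdown(struct clock_event_device *evt)
{
	/* stop the hardware timer and mask its interrupt (placeholder) */
	return 0;
}

static int example_set_next_event(unsigned long delta,
				  struct clock_event_device *evt)
{
	/* program a one-shot expiry 'delta' ticks from now (placeholder) */
	return 0;
}

static struct clock_event_device example_clockevent = {
	.name			= "example",
	.features		= CLOCK_EVT_FEAT_ONESHOT,
	.rating			= 300,
	.set_next_event		= example_set_next_event,
	.set_state_shutdown	= example_shutdown,
	.set_state_oneshot	= example_shutdown,
	/* Stop the timer when the next event is KTIME_MAX, avoiding
	 * spurious interrupts while the tick is stopped. */
	.set_state_oneshot_stopped = example_shutdown,
};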
Signed-off-by: Viresh Kumar Signed-off-by: Daniel Lezcano diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 7c567f0..f0dd9d4 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -274,10 +274,12 @@ static void __arch_timer_setup(unsigned type, if (arch_timer_use_virtual) { clk->irq = arch_timer_ppi[VIRT_PPI]; clk->set_state_shutdown = arch_timer_shutdown_virt; + clk->set_state_oneshot_stopped = arch_timer_shutdown_virt; clk->set_next_event = arch_timer_set_next_event_virt; } else { clk->irq = arch_timer_ppi[PHYS_SECURE_PPI]; clk->set_state_shutdown = arch_timer_shutdown_phys; + clk->set_state_oneshot_stopped = arch_timer_shutdown_phys; clk->set_next_event = arch_timer_set_next_event_phys; } } else { @@ -287,10 +289,12 @@ static void __arch_timer_setup(unsigned type, clk->cpumask = cpu_all_mask; if (arch_timer_mem_use_virtual) { clk->set_state_shutdown = arch_timer_shutdown_virt_mem; + clk->set_state_oneshot_stopped = arch_timer_shutdown_virt_mem; clk->set_next_event = arch_timer_set_next_event_virt_mem; } else { clk->set_state_shutdown = arch_timer_shutdown_phys_mem; + clk->set_state_oneshot_stopped = arch_timer_shutdown_phys_mem; clk->set_next_event = arch_timer_set_next_event_phys_mem; } -- cgit v0.10.2 From 3effa3ceea2da9f67f042a694e24ba0d8143cff8 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 23 Dec 2015 16:59:13 +0530 Subject: clockevents/drivers/arm_global_timer: Implement ->set_state_oneshot_stopped() set_state_oneshot_stopped() is called by the clkevt core, when the next event is required at an expiry time of 'KTIME_MAX'. This normally happens with NO_HZ_{IDLE|FULL} in both LOWRES/HIGHRES modes. This patch makes the clockevent device to stop on such an event, to avoid spurious interrupts, as explained by: commit 8fff52fd5093 ("clockevents: Introduce CLOCK_EVT_STATE_ONESHOT_STOPPED state"). Signed-off-by: Viresh Kumar Signed-off-by: Daniel Lezcano diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index 36998fa..9df0d16 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -175,6 +175,7 @@ static int gt_clockevents_init(struct clock_event_device *clk) clk->set_state_shutdown = gt_clockevent_shutdown; clk->set_state_periodic = gt_clockevent_set_periodic; clk->set_state_oneshot = gt_clockevent_shutdown; + clk->set_state_oneshot_stopped = gt_clockevent_shutdown; clk->set_next_event = gt_clockevent_set_next_event; clk->cpumask = cpumask_of(cpu); clk->rating = 300; -- cgit v0.10.2 From 07f101d33def572c7018626735475f88dabecebf Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 23 Dec 2015 16:59:14 +0530 Subject: clockevents/drivers/exynos_mct: Implement ->set_state_oneshot_stopped() set_state_oneshot_stopped() is called by the clkevt core, when the next event is required at an expiry time of 'KTIME_MAX'. This normally happens with NO_HZ_{IDLE|FULL} in both LOWRES/HIGHRES modes. This patch makes the clockevent device to stop on such an event, to avoid spurious interrupts, as explained by: commit 8fff52fd5093 ("clockevents: Introduce CLOCK_EVT_STATE_ONESHOT_STOPPED state"). 
Signed-off-by: Viresh Kumar Signed-off-by: Daniel Lezcano diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c index ff44082..be09bc0 100644 --- a/drivers/clocksource/exynos_mct.c +++ b/drivers/clocksource/exynos_mct.c @@ -313,6 +313,7 @@ static struct clock_event_device mct_comp_device = { .set_state_periodic = mct_set_state_periodic, .set_state_shutdown = mct_set_state_shutdown, .set_state_oneshot = mct_set_state_shutdown, + .set_state_oneshot_stopped = mct_set_state_shutdown, .tick_resume = mct_set_state_shutdown, }; @@ -452,6 +453,7 @@ static int exynos4_local_timer_setup(struct mct_clock_event_device *mevt) evt->set_state_periodic = set_state_periodic; evt->set_state_shutdown = set_state_shutdown; evt->set_state_oneshot = set_state_shutdown; + evt->set_state_oneshot_stopped = set_state_shutdown; evt->tick_resume = set_state_shutdown; evt->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT; evt->rating = 450; -- cgit v0.10.2 From 7e9551bc72201838ebaf388224ff43ddf2b0d853 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 25 Feb 2016 06:27:36 +0100 Subject: perf jvmti: improve error message in Makefile This patch improves the error message given by jvmti Makefile when the alternatives command cannot be found. It now suggests the user locates the root of their Java installation and pass it with JDIR= Signed-off-by: Stephane Eranian Cc: Adrian Hunter Cc: Andi Kleen Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1456378056-18812-1-git-send-email-eranian@google.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/jvmti/Makefile b/tools/perf/jvmti/Makefile index 0277a64..5ce61a1 100644 --- a/tools/perf/jvmti/Makefile +++ b/tools/perf/jvmti/Makefile @@ -35,12 +35,21 @@ SOLIBEXT=so # The following works at least on fedora 23, you may need the next # line for other distros. -ifeq (,$(wildcard /usr/sbin/update-java-alternatives)) -JDIR=$(shell alternatives --display java | tail -1 | cut -d' ' -f 5 | sed 's%/jre/bin/java.%%g') -else +ifneq (,$(wildcard /usr/sbin/update-java-alternatives)) JDIR=$(shell /usr/sbin/update-java-alternatives -l | head -1 | cut -d ' ' -f 3) +else + ifneq (,$(wildcard /usr/sbin/alternatives)) + JDIR=$(shell alternatives --display java | tail -1 | cut -d' ' -f 5 | sed 's%/jre/bin/java.%%g') + endif endif - +ifndef JDIR +$(error Could not find alternatives command, you need to set JDIR= to point to the root of your Java directory) +else + ifeq (,$(wildcard $(JDIR)/include/jvmti.h)) + $(error the openjdk development package appears to me missing, install and try again) + endif +endif +$(info Using Java from $(JDIR)) # -lrt required in 32-bit mode for clock_gettime() LIBS=-lelf -lrt INCDIR=-I $(JDIR)/include -I $(JDIR)/include/linux -- cgit v0.10.2 From bb109acc4adeae425147ca87b84d312ea40f24f1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 25 Feb 2016 10:56:21 -0300 Subject: perf tools: Fix parsing of pmu events with empty list of modifiers In 1d55e8ef340d ("perf tools: Introduce opt_event_config nonterminal") I removed the unconditional "'/' '/'" for pmu events such as "intel_pt//" but forgot to use opt_event_config where it expected some event_config, oops. Fix it. Noticed when trying to use: # perf record -e intel_pt// -a sleep 1 event syntax error: 'intel_pt//' \___ parser error Run 'perf list' for a list of valid events Usage: perf record [] [] or: perf record [] -- [] -e, --event event selector. 
use 'perf list' to list available events # Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Fixes: 1d55e8ef340d ("perf tools: Introduce opt_event_config nonterminal") Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index d1fbcab..85c44ba 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -217,14 +217,14 @@ event_def: event_pmu | event_bpf_file event_pmu: -PE_NAME '/' event_config '/' +PE_NAME opt_event_config { struct parse_events_evlist *data = _data; struct list_head *list; ALLOC_LIST(list); - ABORT_ON(parse_events_add_pmu(data, list, $1, $3)); - parse_events_terms__delete($3); + ABORT_ON(parse_events_add_pmu(data, list, $1, $2)); + parse_events_terms__delete($2); $$ = list; } | -- cgit v0.10.2 From b26a719bdba9aa926ceaadecc66e07623d2b8a53 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 18 Feb 2016 17:06:30 +0100 Subject: gpio: rcar: Add Runtime PM handling for interrupts The R-Car GPIO driver handles Runtime PM for requested GPIOs only. When using a GPIO purely as an interrupt source, no Runtime PM handling is done, and the GPIO module's clock may not be enabled. To fix this: - Add .irq_request_resources() and .irq_release_resources() callbacks to handle Runtime PM when an interrupt is requested, - Add irq_bus_lock() and sync_unlock() callbacks to handle Runtime PM when e.g. disabling/enabling an interrupt, or configuring the interrupt type. Fixes: d5c3d84657db57bd "net: phy: Avoid polling PHY with PHY_IGNORE_INTERRUPTS" Signed-off-by: Geert Uytterhoeven Signed-off-by: Linus Walleij diff --git a/drivers/gpio/gpio-rcar.c b/drivers/gpio/gpio-rcar.c index cf41440..d9ab0cd 100644 --- a/drivers/gpio/gpio-rcar.c +++ b/drivers/gpio/gpio-rcar.c @@ -196,6 +196,44 @@ static int gpio_rcar_irq_set_wake(struct irq_data *d, unsigned int on) return 0; } +static void gpio_rcar_irq_bus_lock(struct irq_data *d) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct gpio_rcar_priv *p = gpiochip_get_data(gc); + + pm_runtime_get_sync(&p->pdev->dev); +} + +static void gpio_rcar_irq_bus_sync_unlock(struct irq_data *d) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct gpio_rcar_priv *p = gpiochip_get_data(gc); + + pm_runtime_put(&p->pdev->dev); +} + + +static int gpio_rcar_irq_request_resources(struct irq_data *d) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct gpio_rcar_priv *p = gpiochip_get_data(gc); + int error; + + error = pm_runtime_get_sync(&p->pdev->dev); + if (error < 0) + return error; + + return 0; +} + +static void gpio_rcar_irq_release_resources(struct irq_data *d) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct gpio_rcar_priv *p = gpiochip_get_data(gc); + + pm_runtime_put(&p->pdev->dev); +} + static irqreturn_t gpio_rcar_irq_handler(int irq, void *dev_id) { struct gpio_rcar_priv *p = dev_id; @@ -450,6 +488,10 @@ static int gpio_rcar_probe(struct platform_device *pdev) irq_chip->irq_unmask = gpio_rcar_irq_enable; irq_chip->irq_set_type = gpio_rcar_irq_set_type; irq_chip->irq_set_wake = gpio_rcar_irq_set_wake; + irq_chip->irq_bus_lock = gpio_rcar_irq_bus_lock; + irq_chip->irq_bus_sync_unlock = gpio_rcar_irq_bus_sync_unlock; + irq_chip->irq_request_resources = gpio_rcar_irq_request_resources; + irq_chip->irq_release_resources = gpio_rcar_irq_release_resources; irq_chip->flags = IRQCHIP_SET_TYPE_MASKED | IRQCHIP_MASK_ON_SUSPEND; ret = gpiochip_add_data(gpio_chip, p); -- cgit v0.10.2 From 
38e45d02ea9f194b89d6bf41e52ccafc8e2c2b47 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 23 Feb 2016 13:03:30 +0100 Subject: iommu/amd: Fix boot warning when device 00:00.0 is not iommu covered The setup code for the performance counters in the AMD IOMMU driver tests whether the counters can be written. It tests to setup a counter for device 00:00.0, which fails on systems where this particular device is not covered by the IOMMU. Fix this by not relying on device 00:00.0 but only on the IOMMU being present. Cc: stable@vger.kernel.org Signed-off-by: Suravee Suthikulpanit Signed-off-by: Joerg Roedel diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 013bdff..d06a6d9 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -228,6 +228,10 @@ static int amd_iommu_enable_interrupts(void); static int __init iommu_go_to_state(enum iommu_init_state state); static void init_device_table_dma(void); +static int iommu_pc_get_set_reg_val(struct amd_iommu *iommu, + u8 bank, u8 cntr, u8 fxn, + u64 *value, bool is_write); + static inline void update_last_devid(u16 devid) { if (devid > amd_iommu_last_bdf) @@ -1142,8 +1146,8 @@ static void init_iommu_perf_ctr(struct amd_iommu *iommu) amd_iommu_pc_present = true; /* Check if the performance counters can be written to */ - if ((0 != amd_iommu_pc_get_set_reg_val(0, 0, 0, 0, &val, true)) || - (0 != amd_iommu_pc_get_set_reg_val(0, 0, 0, 0, &val2, false)) || + if ((0 != iommu_pc_get_set_reg_val(iommu, 0, 0, 0, &val, true)) || + (0 != iommu_pc_get_set_reg_val(iommu, 0, 0, 0, &val2, false)) || (val != val2)) { pr_err("AMD-Vi: Unable to write to IOMMU perf counter.\n"); amd_iommu_pc_present = false; @@ -2283,22 +2287,15 @@ u8 amd_iommu_pc_get_max_counters(u16 devid) } EXPORT_SYMBOL(amd_iommu_pc_get_max_counters); -int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn, +static int iommu_pc_get_set_reg_val(struct amd_iommu *iommu, + u8 bank, u8 cntr, u8 fxn, u64 *value, bool is_write) { - struct amd_iommu *iommu; u32 offset; u32 max_offset_lim; - /* Make sure the IOMMU PC resource is available */ - if (!amd_iommu_pc_present) - return -ENODEV; - - /* Locate the iommu associated with the device ID */ - iommu = amd_iommu_rlookup_table[devid]; - /* Check for valid iommu and pc register indexing */ - if (WARN_ON((iommu == NULL) || (fxn > 0x28) || (fxn & 7))) + if (WARN_ON((fxn > 0x28) || (fxn & 7))) return -ENODEV; offset = (u32)(((0x40|bank) << 12) | (cntr << 8) | fxn); @@ -2322,3 +2319,16 @@ int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn, return 0; } EXPORT_SYMBOL(amd_iommu_pc_get_set_reg_val); + +int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, u8 fxn, + u64 *value, bool is_write) +{ + struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; + + /* Make sure the IOMMU PC resource is available */ + if (!amd_iommu_pc_present || iommu == NULL) + return -ENODEV; + + return iommu_pc_get_set_reg_val(iommu, bank, cntr, fxn, + value, is_write); +} -- cgit v0.10.2 From 358875fd52ab8f00f66328cbf1a1d2486f265829 Mon Sep 17 00:00:00 2001 From: Jay Cornwall Date: Wed, 10 Feb 2016 15:48:01 -0600 Subject: iommu/amd: Apply workaround for ATS write permission check The AMD Family 15h Models 30h-3Fh (Kaveri) BIOS and Kernel Developer's Guide omitted part of the BIOS IOMMU L2 register setup specification. Without this setup the IOMMU L2 does not fully respect write permissions when handling an ATS translation request. 
The IOMMU L2 will set PTE dirty bit when handling an ATS translation with write permission request, even when PTE RW bit is clear. This may occur by direct translation (which would cause a PPR) or by prefetch request from the ATC. This is observed in practice when the IOMMU L2 modifies a PTE which maps a pagecache page. The ext4 filesystem driver BUGs when asked to writeback these (non-modified) pages. Enable ATS write permission check in the Kaveri IOMMU L2 if BIOS has not. Signed-off-by: Jay Cornwall Cc: # v3.19+ Signed-off-by: Joerg Roedel diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index d06a6d9..bf4959f 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -1020,6 +1020,34 @@ static void amd_iommu_erratum_746_workaround(struct amd_iommu *iommu) } /* + * Family15h Model 30h-3fh (IOMMU Mishandles ATS Write Permission) + * Workaround: + * BIOS should enable ATS write permission check by setting + * L2_DEBUG_3[AtsIgnoreIWDis](D0F2xF4_x47[0]) = 1b + */ +static void amd_iommu_ats_write_check_workaround(struct amd_iommu *iommu) +{ + u32 value; + + if ((boot_cpu_data.x86 != 0x15) || + (boot_cpu_data.x86_model < 0x30) || + (boot_cpu_data.x86_model > 0x3f)) + return; + + /* Test L2_DEBUG_3[AtsIgnoreIWDis] == 1 */ + value = iommu_read_l2(iommu, 0x47); + + if (value & BIT(0)) + return; + + /* Set L2_DEBUG_3[AtsIgnoreIWDis] = 1 */ + iommu_write_l2(iommu, 0x47, value | BIT(0)); + + pr_info("AMD-Vi: Applying ATS write check workaround for IOMMU at %s\n", + dev_name(&iommu->dev->dev)); +} + +/* * This function clues the initialization function for one IOMMU * together and also allocates the command buffer and programs the * hardware. It does NOT enable the IOMMU. This is done afterwards. @@ -1288,6 +1316,7 @@ static int iommu_init_pci(struct amd_iommu *iommu) } amd_iommu_erratum_746_workaround(iommu); + amd_iommu_ats_write_check_workaround(iommu); iommu->iommu_dev = iommu_device_create(&iommu->dev->dev, iommu, amd_iommu_groups, "ivhd%d", -- cgit v0.10.2 From 8579aca3f9a8f890d6d94ccaed7cf5fd54a0c3bd Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Fri, 26 Feb 2016 00:12:59 +0900 Subject: perf script: Exception handling when the print fmt is empty After collecting samples for events 'syscalls:', perf-script with python script doesn't occasionally work generating a segmentation fault. The reason is that the print fmt is empty and a value of event->print_fmt.args is NULL, so dereferencing the null pointer results in a segmentation fault i.e.: # perf record -e syscalls:* # perf script -g python # perf script -s perf-script.py in trace_begin syscalls__sys_enter_brk 3 79841.832099154 3777 test.sh syscall_nr=12, brk=0 ... (omitted) ... Segmentation fault (core dumped) For example, a format of sys_enter_getuid() hasn't print fmt as below. # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_getuid/format name: sys_enter_getuid ID: 188 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:int syscall_nr; offset:8; size:4; signed:1; print fmt: "" So add exception handling to avoid this problem. 
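The crash is the classic missing base case in a recursive walker: with an empty print fmt there is nothing to parse, event->print_fmt.args is NULL, and the first dereference faults. A schematic of the guarded walk, using simplified types rather than the real libtraceevent structures:

#include <stdio.h>

/* Simplified stand-in for libtraceevent's struct print_arg list. */
struct print_arg {
	int type;
	struct print_arg *next;
};

static void define_symbols(const char *ev_name, struct print_arg *args)
{
	if (args == NULL)		/* empty print fmt: nothing to define */
		return;

	/* ... handle args->type here (PRINT_NULL and friends) ... */

	define_symbols(ev_name, args->next);	/* walk the rest of the list */
}

int main(void)
{
	struct print_arg leaf = { 0, NULL };

	define_symbols("sys_enter_getuid", NULL);	/* print fmt "" is now safe */
	define_symbols("sys_enter_brk", &leaf);		/* one-element list */
	return 0;
}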
Signed-off-by: Taeung Song Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/r/1456413179-12331-1-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index 544509c..b3aabc0 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -187,6 +187,9 @@ static void define_event_symbols(struct event_format *event, const char *ev_name, struct print_arg *args) { + if (args == NULL) + return; + switch (args->type) { case PRINT_NULL: break; diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index d72fafc..309d90f 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -205,6 +205,9 @@ static void define_event_symbols(struct event_format *event, const char *ev_name, struct print_arg *args) { + if (args == NULL) + return; + switch (args->type) { case PRINT_NULL: break; -- cgit v0.10.2 From 8560bae02a948876b26d1d86423cf5e0bb04a815 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Fri, 26 Feb 2016 00:13:10 +0900 Subject: perf script: Remove duplicated code and needless script_spec__findnew() script_spec_register() called two functions: script_spec__find() and script_spec__findnew(). But this way script_spec__find() gets called two times, directly and via script_spec__findnew(). So remove script_spec__findnew() and make script_spec_register() only call once script_spec__find(). Signed-off-by: Taeung Song Acked-by: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/1456413190-12378-1-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index ec4fbd4..57f9a7e 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -1212,23 +1212,6 @@ static struct script_spec *script_spec__find(const char *spec) return NULL; } -static struct script_spec *script_spec__findnew(const char *spec, - struct scripting_ops *ops) -{ - struct script_spec *s = script_spec__find(spec); - - if (s) - return s; - - s = script_spec__new(spec, ops); - if (!s) - return NULL; - - script_spec__add(s); - - return s; -} - int script_spec_register(const char *spec, struct scripting_ops *ops) { struct script_spec *s; @@ -1237,9 +1220,11 @@ int script_spec_register(const char *spec, struct scripting_ops *ops) if (s) return -1; - s = script_spec__findnew(spec, ops); + s = script_spec__new(spec, ops); if (!s) return -1; + else + script_spec__add(s); return 0; } -- cgit v0.10.2 From 44248affb5d82d32cdb58a92a94ce191d4ddee60 Mon Sep 17 00:00:00 2001 From: Han Xu Date: Thu, 18 Feb 2016 03:55:15 +0000 Subject: MAINTAINERS: add maintainer entry for FREESCALE GPMI NAND driver Add a maintainer entry for FREESCALE GPMI NAND driver and add myself as a maintainer. 
Signed-off-by: Han Xu Acked-by: Huang Shijie Signed-off-by: Brian Norris diff --git a/MAINTAINERS b/MAINTAINERS index ce4a0d6..0db5eb9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4522,6 +4522,12 @@ L: linuxppc-dev@lists.ozlabs.org S: Maintained F: drivers/dma/fsldma.* +FREESCALE GPMI NAND DRIVER +M: Han Xu +L: linux-mtd@lists.infradead.org +S: Maintained +F: drivers/mtd/nand/gpmi-nand/* + FREESCALE I2C CPM DRIVER M: Jochen Friedrich L: linuxppc-dev@lists.ozlabs.org -- cgit v0.10.2 From 65c38aa653c14df49e19faad74bd375f36e61c57 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 23 Feb 2016 10:10:26 -0800 Subject: net: vrf: Remove direct access to skb->data Nik pointed that the VRF driver should be using skb_header_pointer instead of accessing skb->data and bits beyond directly which can be garbage. Fixes: 35402e313663 ("net: Add IPv6 support to VRF device") Cc: Nikolay Aleksandrov Signed-off-by: David Ahern Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 66addb7..bdcf617 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -104,20 +104,23 @@ static struct dst_ops vrf_dst_ops = { #if IS_ENABLED(CONFIG_IPV6) static bool check_ipv6_frame(const struct sk_buff *skb) { - const struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb->data; - size_t hlen = sizeof(*ipv6h); + const struct ipv6hdr *ipv6h; + struct ipv6hdr _ipv6h; bool rc = true; - if (skb->len < hlen) + ipv6h = skb_header_pointer(skb, 0, sizeof(_ipv6h), &_ipv6h); + if (!ipv6h) goto out; if (ipv6h->nexthdr == NEXTHDR_ICMP) { const struct icmp6hdr *icmph; + struct icmp6hdr _icmph; - if (skb->len < hlen + sizeof(*icmph)) + icmph = skb_header_pointer(skb, sizeof(_ipv6h), + sizeof(_icmph), &_icmph); + if (!icmph) goto out; - icmph = (struct icmp6hdr *)(skb->data + sizeof(*ipv6h)); switch (icmph->icmp6_type) { case NDISC_ROUTER_SOLICITATION: case NDISC_ROUTER_ADVERTISEMENT: -- cgit v0.10.2 From 2b70bad23c89b121a3e4a00f8968d14ebb78887d Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Tue, 23 Feb 2016 19:23:23 +0000 Subject: net: qca_spi: Don't clear IFF_BROADCAST Currently qcaspi_netdev_setup accidentally clears IFF_BROADCAST. So fix this by keeping the flags from ether_setup. Reported-by: Michael Heimpold Signed-off-by: Stefan Wahren Fixes: 291ab06ecf67 (net: qualcomm: new Ethernet over SPI driver for QCA7000) Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/qualcomm/qca_spi.c b/drivers/net/ethernet/qualcomm/qca_spi.c index 689a4a5..f2ee3e5f 100644 --- a/drivers/net/ethernet/qualcomm/qca_spi.c +++ b/drivers/net/ethernet/qualcomm/qca_spi.c @@ -811,7 +811,6 @@ qcaspi_netdev_setup(struct net_device *dev) dev->netdev_ops = &qcaspi_netdev_ops; qcaspi_set_ethtool_ops(dev); dev->watchdog_timeo = QCASPI_TX_TIMEOUT; - dev->flags = IFF_MULTICAST; dev->tx_queue_len = 100; qca = netdev_priv(dev); -- cgit v0.10.2 From a4690afeb0d2d7ba4d60dfa98a89f3bb1ce60ecd Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Tue, 23 Feb 2016 19:23:24 +0000 Subject: net: qca_spi: clear IFF_TX_SKB_SHARING ether_setup sets IFF_TX_SKB_SHARING but this is not supported by qca_spi as it modifies the skb on xmit. Signed-off-by: Stefan Wahren Fixes: 291ab06ecf67 (net: qualcomm: new Ethernet over SPI driver for QCA7000) Signed-off-by: David S. 
Miller diff --git a/drivers/net/ethernet/qualcomm/qca_spi.c b/drivers/net/ethernet/qualcomm/qca_spi.c index f2ee3e5f..1ef0393 100644 --- a/drivers/net/ethernet/qualcomm/qca_spi.c +++ b/drivers/net/ethernet/qualcomm/qca_spi.c @@ -811,6 +811,7 @@ qcaspi_netdev_setup(struct net_device *dev) dev->netdev_ops = &qcaspi_netdev_ops; qcaspi_set_ethtool_ops(dev); dev->watchdog_timeo = QCASPI_TX_TIMEOUT; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->tx_queue_len = 100; qca = netdev_priv(dev); -- cgit v0.10.2 From 9b368814b336b0a1a479135eb2815edbc00efd3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Wed, 24 Feb 2016 04:21:42 +0100 Subject: net: fix bridge multicast packet checksum validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to update the skb->csum after pulling the skb, otherwise an unnecessary checksum (re)computation can ocure for IGMP/MLD packets in the bridge code. Additionally this fixes the following splats for network devices / bridge ports with support for and enabled RX checksum offloading: [...] [ 43.986968] eth0: hw csum failure [ 43.990344] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 4.4.0 #2 [ 43.996193] Hardware name: BCM2709 [ 43.999647] [<800204e0>] (unwind_backtrace) from [<8001cf14>] (show_stack+0x10/0x14) [ 44.007432] [<8001cf14>] (show_stack) from [<801ab614>] (dump_stack+0x80/0x90) [ 44.014695] [<801ab614>] (dump_stack) from [<802e4548>] (__skb_checksum_complete+0x6c/0xac) [ 44.023090] [<802e4548>] (__skb_checksum_complete) from [<803a055c>] (ipv6_mc_validate_checksum+0x104/0x178) [ 44.032959] [<803a055c>] (ipv6_mc_validate_checksum) from [<802e111c>] (skb_checksum_trimmed+0x130/0x188) [ 44.042565] [<802e111c>] (skb_checksum_trimmed) from [<803a06e8>] (ipv6_mc_check_mld+0x118/0x338) [ 44.051501] [<803a06e8>] (ipv6_mc_check_mld) from [<803b2c98>] (br_multicast_rcv+0x5dc/0xd00) [ 44.060077] [<803b2c98>] (br_multicast_rcv) from [<803aa510>] (br_handle_frame_finish+0xac/0x51c) [...] Fixes: 9afd85c9e455 ("net: Export IGMP/MLD message validation code") Reported-by: Álvaro Fernández Rojas Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5bf88f5..8616d11 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2948,6 +2948,24 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page, EXPORT_SYMBOL_GPL(skb_append_pagefrags); /** + * skb_push_rcsum - push skb and update receive checksum + * @skb: buffer to update + * @len: length of data pulled + * + * This function performs an skb_push on the packet and updates + * the CHECKSUM_COMPLETE checksum. It should be used on + * receive path processing instead of skb_push unless you know + * that the checksum difference is zero (e.g., a valid IP header) + * or you are setting ip_summed to CHECKSUM_NONE. 
+ */ +static unsigned char *skb_push_rcsum(struct sk_buff *skb, unsigned len) +{ + skb_push(skb, len); + skb_postpush_rcsum(skb, skb->data, len); + return skb->data; +} + +/** * skb_pull_rcsum - pull skb and update receive checksum * @skb: buffer to update * @len: length of data pulled @@ -4084,9 +4102,9 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, if (!pskb_may_pull(skb_chk, offset)) goto err; - __skb_pull(skb_chk, offset); + skb_pull_rcsum(skb_chk, offset); ret = skb_chkf(skb_chk); - __skb_push(skb_chk, offset); + skb_push_rcsum(skb_chk, offset); if (ret) goto err; -- cgit v0.10.2 From f09cf4b7832d029fb22d8f476eac12fc27dde61f Mon Sep 17 00:00:00 2001 From: Chun-Hao Lin Date: Wed, 24 Feb 2016 14:18:42 +0800 Subject: r8169:fix "rtl_counters_cond == 1 (loop: 1000, delay: 10)" log spam. There will be a log spam when there is no cable plugged. Please refer to following links. https://bugzilla.kernel.org/show_bug.cgi?id=104351 https://bugzilla.kernel.org/show_bug.cgi?id=107421 This issue is caused by runtime power management. When there is no cable plugged, the driver will be suspend (runtime suspend) by OS and NIC will be put into the D3 state. During this time, if OS call rtl8169_get_stats64() to dump tally counter, because NIC is in D3 state, the register value read by driver will return all 0xff. This will let driver think tally counter flag is not toggled and then sends the warning message "rtl_counters_cond == 1 (loop: 1000, delay: 10)" to kernel log. For fixing this issue, 1.add checking driver's pm runtime status in rtl8169_get_stats64(). 2.dump tally counter before going runtime suspend for counter accuracy in runtime suspend. Signed-off-by: Chunhao Lin Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 537974c..4caeacb 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -7730,10 +7730,13 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct rtl8169_private *tp = netdev_priv(dev); void __iomem *ioaddr = tp->mmio_addr; + struct pci_dev *pdev = tp->pci_dev; struct rtl8169_counters *counters = tp->counters; unsigned int start; - if (netif_running(dev)) + pm_runtime_get_noresume(&pdev->dev); + + if (netif_running(dev) && pm_runtime_active(&pdev->dev)) rtl8169_rx_missed(dev, ioaddr); do { @@ -7761,7 +7764,8 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) * Fetch additonal counter values missing in stats collected by driver * from tally counters. */ - rtl8169_update_counters(dev); + if (pm_runtime_active(&pdev->dev)) + rtl8169_update_counters(dev); /* * Subtract values fetched during initalization. 
@@ -7774,6 +7778,8 @@ rtl8169_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) stats->tx_aborted_errors = le16_to_cpu(counters->tx_aborted) - le16_to_cpu(tp->tc_offset.tx_aborted); + pm_runtime_put_noidle(&pdev->dev); + return stats; } @@ -7853,6 +7859,10 @@ static int rtl8169_runtime_suspend(struct device *device) rtl8169_net_suspend(dev); + /* Update counters before going runtime suspend */ + rtl8169_rx_missed(dev, tp->mmio_addr); + rtl8169_update_counters(dev); + return 0; } -- cgit v0.10.2 From 4c0b6eaf373a5323f03a3a20c42fc435715b073d Mon Sep 17 00:00:00 2001 From: Sunil Goutham Date: Wed, 24 Feb 2016 16:40:50 +0530 Subject: net: thunderx: Fix for Qset error due to CQ full On Thunderx pass 1.x and pass2 due to a HW errata default CQ DROP_LEVEL of 0x80 is not sufficient to avoid CQ_WR_FULL Qset error when packets are being received at >20Mpps resulting in complete stall of packet reception. This patch will configure it to 0x100 which is what is expected by HW on Thunderx. On future passes of thunderx and other chips HW default/reset value will be 0x100 or higher hence not overwritten. Signed-off-by: Jerin Jacob Signed-off-by: Sunil Goutham Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h index 6888288..34e9ace 100644 --- a/drivers/net/ethernet/cavium/thunder/nic.h +++ b/drivers/net/ethernet/cavium/thunder/nic.h @@ -116,6 +116,15 @@ #define NIC_PF_INTR_ID_MBOX0 8 #define NIC_PF_INTR_ID_MBOX1 9 +/* Minimum FIFO level before all packets for the CQ are dropped + * + * This value ensures that once a packet has been "accepted" + * for reception it will not get dropped due to non-availability + * of CQ descriptor. An errata in HW mandates this value to be + * atleast 0x100. 
+ */ +#define NICPF_CQM_MIN_DROP_LEVEL 0x100 + /* Global timer for CQ timer thresh interrupts * Calculated for SCLK of 700Mhz * value written should be a 1/16th of what is expected diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c b/drivers/net/ethernet/cavium/thunder/nic_main.c index 4dded90..95f17f8 100644 --- a/drivers/net/ethernet/cavium/thunder/nic_main.c +++ b/drivers/net/ethernet/cavium/thunder/nic_main.c @@ -304,6 +304,7 @@ static void nic_set_lmac_vf_mapping(struct nicpf *nic) static void nic_init_hw(struct nicpf *nic) { int i; + u64 cqm_cfg; /* Enable NIC HW block */ nic_reg_write(nic, NIC_PF_CFG, 0x3); @@ -340,6 +341,11 @@ static void nic_init_hw(struct nicpf *nic) /* Enable VLAN ethertype matching and stripping */ nic_reg_write(nic, NIC_PF_RX_ETYPE_0_7, (2 << 19) | (ETYPE_ALG_VLAN_STRIP << 16) | ETH_P_8021Q); + + /* Check if HW expected value is higher (could be in future chips) */ + cqm_cfg = nic_reg_read(nic, NIC_PF_CQM_CFG); + if (cqm_cfg < NICPF_CQM_MIN_DROP_LEVEL) + nic_reg_write(nic, NIC_PF_CQM_CFG, NICPF_CQM_MIN_DROP_LEVEL); } /* Channel parse index configuration */ diff --git a/drivers/net/ethernet/cavium/thunder/nic_reg.h b/drivers/net/ethernet/cavium/thunder/nic_reg.h index dd536be..afb10e3 100644 --- a/drivers/net/ethernet/cavium/thunder/nic_reg.h +++ b/drivers/net/ethernet/cavium/thunder/nic_reg.h @@ -21,7 +21,7 @@ #define NIC_PF_TCP_TIMER (0x0060) #define NIC_PF_BP_CFG (0x0080) #define NIC_PF_RRM_CFG (0x0088) -#define NIC_PF_CQM_CF (0x00A0) +#define NIC_PF_CQM_CFG (0x00A0) #define NIC_PF_CNM_CF (0x00A8) #define NIC_PF_CNM_STATUS (0x00B0) #define NIC_PF_CQ_AVG_CFG (0x00C0) -- cgit v0.10.2 From 4ee34ea3a12396f35b26d90a094c75db95080baa Mon Sep 17 00:00:00 2001 From: Harvey Hunt Date: Wed, 24 Feb 2016 15:16:43 +0000 Subject: libata: Align ata_device's id on a cacheline The id buffer in ata_device is a DMA target, but it isn't explicitly cacheline aligned. Due to this, adjacent fields can be overwritten with stale data from memory on non coherent architectures. As a result, the kernel is sometimes unable to communicate with an ATA device. Fix this by ensuring that the id buffer is cacheline aligned. This issue is similar to that fixed by Commit 84bda12af31f ("libata: align ap->sector_buf"). 
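For context, a minimal sketch of the pattern being applied (the structure and field names below are invented for illustration and are not libata code): putting the DMA target on its own cacheline keeps neighbouring fields out of the cacheline that gets invalidated when the DMA completes, so they cannot be rolled back to stale values on non-coherent architectures.

	/* Illustrative sketch only -- made-up structure, not libata code. */
	struct example_dev {
		u16 dma_buf[256] ____cacheline_aligned;	/* DMA target gets its own cacheline(s) */
		u8  flags;				/* no longer shares a cacheline with dma_buf */
	};
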
Signed-off-by: Harvey Hunt Cc: linux-kernel@vger.kernel.org Cc: # 2.6.18 Signed-off-by: Tejun Heo diff --git a/include/linux/libata.h b/include/linux/libata.h index bec2abb..2c4ebef 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -720,7 +720,7 @@ struct ata_device { union { u16 id[ATA_ID_WORDS]; /* IDENTIFY xxx DEVICE data */ u32 gscr[SATA_PMP_GSCR_DWORDS]; /* PMP GSCR block */ - }; + } ____cacheline_aligned; /* DEVSLP Timing Variables from Identify Device Data Log */ u8 devslp_timing[ATA_LOG_DEVSLP_SIZE]; -- cgit v0.10.2 From f4833a519aec793cf8349bf479589d37473ef6a7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 24 Feb 2016 17:38:14 +0100 Subject: ASoC: trace: fix printing jack name After a change to the snd_jack structure, the 'name' member is no longer available in all configurations, which results in a build failure in the tracing code: include/trace/events/asoc.h: In function 'trace_event_raw_event_snd_soc_jack_report': include/trace/events/asoc.h:240:32: error: 'struct snd_jack' has no member named 'name' The name field is normally initialized from the card shortname and the jack "id" field: snprintf(jack->name, sizeof(jack->name), "%s %s", card->shortname, jack->id); This changes the tracing output to just contain the 'id' by itself, which slightly changes the output format but avoids the link error and is hopefully still enough to see what is going on. Signed-off-by: Arnd Bergmann Fixes: fe0d128c57bf ("ALSA: jack: Allow building the jack layer without input device") Signed-off-by: Mark Brown diff --git a/include/trace/events/asoc.h b/include/trace/events/asoc.h index 317a1ed..9130dd5 100644 --- a/include/trace/events/asoc.h +++ b/include/trace/events/asoc.h @@ -231,13 +231,13 @@ TRACE_EVENT(snd_soc_jack_report, TP_ARGS(jack, mask, val), TP_STRUCT__entry( - __string( name, jack->jack->name ) + __string( name, jack->jack->id ) __field( int, mask ) __field( int, val ) ), TP_fast_assign( - __assign_str(name, jack->jack->name); + __assign_str(name, jack->jack->id); __entry->mask = mask; __entry->val = val; ), @@ -253,12 +253,12 @@ TRACE_EVENT(snd_soc_jack_notify, TP_ARGS(jack, val), TP_STRUCT__entry( - __string( name, jack->jack->name ) + __string( name, jack->jack->id ) __field( int, val ) ), TP_fast_assign( - __assign_str(name, jack->jack->name); + __assign_str(name, jack->jack->id); __entry->val = val; ), -- cgit v0.10.2 From c8560b7c917f8738f5d80dd516930edc1d05e4e4 Mon Sep 17 00:00:00 2001 From: Carlo Caione Date: Tue, 23 Feb 2016 09:50:20 +0100 Subject: ASoC: cht_bsw_rt5645: Fix writing to string literal We cannot use strcpy() to write to a const char * location. This is causing a 'BUG: unable to handle kernel paging request' error at boot when using the cht-bsw-rt5645 driver. With this patch we also fix a wrong indexing in the driver where the codec_name of the wrong dai_link is being overwritten. 
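For context, a minimal sketch of the failure mode and of the fix pattern (simplified and not taken from the driver): a string literal lives in read-only memory, so writing through a pointer that still refers to one faults, while pointing the field at a freshly allocated copy is safe.

	/* Illustrative sketch only -- not the cht_bsw_rt5645 code. */
	const char *codec_name = "i2c-10EC5645:00";		/* points at rodata */

	strcpy((char *)codec_name, "i2c-ABCD1234:00");		/* faults: read-only page */
	codec_name = kstrdup("i2c-ABCD1234:00", GFP_KERNEL);	/* safe: writable copy (NULL check omitted) */
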
Signed-off-by: Carlo Caione Signed-off-by: Mark Brown diff --git a/sound/soc/intel/boards/cht_bsw_rt5645.c b/sound/soc/intel/boards/cht_bsw_rt5645.c index 2d3afdd..a7b96a9 100644 --- a/sound/soc/intel/boards/cht_bsw_rt5645.c +++ b/sound/soc/intel/boards/cht_bsw_rt5645.c @@ -367,8 +367,12 @@ static int snd_cht_mc_probe(struct platform_device *pdev) } card->dev = &pdev->dev; sprintf(codec_name, "i2c-%s:00", drv->acpi_card->codec_id); + /* set correct codec name */ - strcpy((char *)card->dai_link[2].codec_name, codec_name); + for (i = 0; i < ARRAY_SIZE(cht_dailink); i++) + if (!strcmp(card->dai_link[i].codec_name, "i2c-10EC5645:00")) + card->dai_link[i].codec_name = kstrdup(codec_name, GFP_KERNEL); + snd_soc_card_set_drvdata(card, drv); ret_val = devm_snd_soc_register_card(&pdev->dev, card); if (ret_val) { -- cgit v0.10.2 From e9a2d81b1761093386a0bb8a4f51642ac785ef63 Mon Sep 17 00:00:00 2001 From: Maximilain Schneider Date: Tue, 23 Feb 2016 01:17:28 +0000 Subject: can: gs_usb: fixed disconnect bug by removing erroneous use of kfree() gs_destroy_candev() erroneously calls kfree() on a struct gs_can *, which is allocated through alloc_candev() and should instead be freed using free_candev() alone. The inappropriate use of kfree() causes the kernel to hang when gs_destroy_candev() is called. Only the struct gs_usb * which is allocated through kzalloc() should be freed using kfree() when the device is disconnected. Signed-off-by: Maximilian Schneider Cc: linux-stable Signed-off-by: Marc Kleine-Budde diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index 5eee62b..cbc99d5 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -826,9 +826,8 @@ static struct gs_can *gs_make_candev(unsigned int channel, struct usb_interface static void gs_destroy_candev(struct gs_can *dev) { unregister_candev(dev->netdev); - free_candev(dev->netdev); usb_kill_anchored_urbs(&dev->tx_submitted); - kfree(dev); + free_candev(dev->netdev); } static int gs_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) @@ -913,12 +912,15 @@ static int gs_usb_probe(struct usb_interface *intf, const struct usb_device_id * for (i = 0; i < icount; i++) { dev->canch[i] = gs_make_candev(i, intf); if (IS_ERR_OR_NULL(dev->canch[i])) { + /* save error code to return later */ + rc = PTR_ERR(dev->canch[i]); + /* on failure destroy previously created candevs */ icount = i; - for (i = 0; i < icount; i++) { + for (i = 0; i < icount; i++) gs_destroy_candev(dev->canch[i]); - dev->canch[i] = NULL; - } + + usb_kill_anchored_urbs(&dev->rx_submitted); kfree(dev); return rc; } @@ -939,16 +941,12 @@ static void gs_usb_disconnect(struct usb_interface *intf) return; } - for (i = 0; i < GS_MAX_INTF; i++) { - struct gs_can *can = dev->canch[i]; - - if (!can) - continue; - - gs_destroy_candev(can); - } + for (i = 0; i < GS_MAX_INTF; i++) + if (dev->canch[i]) + gs_destroy_candev(dev->canch[i]); usb_kill_anchored_urbs(&dev->rx_submitted); + kfree(dev); } static const struct usb_device_id gs_usb_table[] = { -- cgit v0.10.2 From 10da848f67a7e7152bf7cbe332e4c92d71a990d2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 25 Feb 2016 21:57:43 +0100 Subject: ssb: host_soc depends on sprom Drivers that use the SSB sprom functionality typically 'select SSB_SPROM' from Kconfig, but CONFIG_SSB_HOST_SOC misses this, which results in a build failure unless at least one of the other drivers that selects it is enabled: drivers/built-in.o: In function `ssb_host_soc_get_invariants': (.text+0x459494): 
undefined reference to `ssb_fill_sprom_with_fallback' This adds the same select statement that is used elsewhere. Signed-off-by: Arnd Bergmann Fixes: 541c9a84cd85 ("ssb: pick SoC invariants code from MIPS BCM47xx arch") Signed-off-by: Kalle Valo diff --git a/drivers/ssb/Kconfig b/drivers/ssb/Kconfig index 0c67586..d8e4219 100644 --- a/drivers/ssb/Kconfig +++ b/drivers/ssb/Kconfig @@ -83,6 +83,7 @@ config SSB_SDIOHOST config SSB_HOST_SOC bool "Support for SSB bus on SoC" depends on SSB && BCM47XX_NVRAM + select SSB_SPROM help Host interface for a SSB directly mapped into memory. This is for some Broadcom SoCs from the BCM47xx and BCM53xx lines. -- cgit v0.10.2 From a1e533ec07d583d01349ef13c0c965b8633e1b91 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Mon, 15 Feb 2016 18:41:33 +0000 Subject: fbcon: set a default value to blink interval Since commit 27a4c827c34ac4256a190cc9d24607f953c1c459 fbcon: use the cursor blink interval provided by vt two attempts have been made at fixing a possible hang caused by cursor_timer_handler. That function registers a timer to be triggered at "jiffies + fbcon_ops.cur_blink_jiffies". A new case had been encountered during initialisation of clcd-pl11x: fbcon_fb_registered do_fbcon_takeover -> do_register_con_driver fbcon_startup (A) add_cursor_timer (with cur_blink_jiffies = 0) -> do_bind_con_driver visual_init fbcon_init (B) cur_blink_jiffies = msecs_to_jiffies(vc->vc_cur_blink_ms); If we take an softirq anywhere between A and B (and we do), cursor_timer_handler executes indefinitely. Instead of patching all possible paths that lead to this case one at a time, fix the issue at the source and initialise cur_blink_jiffies to 200ms when allocating fbcon_ops. This was its default value before aforesaid commit. fbcon_cursor or fbcon_init will refine this value downstream. Signed-off-by: Jean-Philippe Brucker Cc: # v4.2 Tested-by: Scot Doyle Signed-off-by: Tomi Valkeinen diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c index 92f3949..6e92917 100644 --- a/drivers/video/console/fbcon.c +++ b/drivers/video/console/fbcon.c @@ -709,6 +709,7 @@ static int con2fb_acquire_newinfo(struct vc_data *vc, struct fb_info *info, } if (!err) { + ops->cur_blink_jiffies = HZ / 5; info->fbcon_par = ops; if (vc) @@ -956,6 +957,7 @@ static const char *fbcon_startup(void) ops->currcon = -1; ops->graphics = 1; ops->cur_rotate = -1; + ops->cur_blink_jiffies = HZ / 5; info->fbcon_par = ops; p->con_rotate = initial_rotation; set_blitting_type(vc, info); -- cgit v0.10.2 From 70e4da7a8ff62f2775337b705f45c804bb450454 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 26 Feb 2016 12:28:40 +0100 Subject: KVM: x86: fix root cause for missed hardware breakpoints Commit 172b2386ed16 ("KVM: x86: fix missed hardware breakpoints", 2016-02-10) worked around a case where the debug registers are not loaded correctly on preemption and on the first entry to KVM_RUN. However, Xiao Guangrong pointed out that the root cause must be that KVM_DEBUGREG_BP_ENABLED is not being set correctly. This can indeed happen due to the lazy debug exit mechanism, which does not call kvm_update_dr7. Fix it by replacing the existing loop (more or less equivalent to kvm_update_dr0123) with calls to all the kvm_update_dr* functions. 
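For context, a hedged sketch of the bookkeeping involved (simplified, not the actual x86.c code): kvm_update_dr7() is the helper that recomputes KVM_DEBUGREG_BP_ENABLED from the guest's DR7, which is why any path that reloads the debug registers without going through it can leave the flag stale.

	/* Simplified sketch, not the real kvm_update_dr7() body. */
	static void update_bp_enabled_flag(struct kvm_vcpu *vcpu)
	{
		vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
		if (vcpu->arch.dr7 & 0xff)	/* any of DR0-DR3 enabled in DR7? */
			vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
	}
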
Cc: stable@vger.kernel.org # 4.1+ Fixes: 172b2386ed16a9143d9a456aae5ec87275c61489 Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f4891f2..eaf6ee8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2752,7 +2752,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); - vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -6619,12 +6618,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * KVM_DEBUGREG_WONT_EXIT again. */ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { - int i; - WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); kvm_x86_ops->sync_dirty_debug_regs(vcpu); - for (i = 0; i < KVM_NR_DB_REGS; i++) - vcpu->arch.eff_db[i] = vcpu->arch.db[i]; + kvm_update_dr0123(vcpu); + kvm_update_dr6(vcpu); + kvm_update_dr7(vcpu); + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } /* -- cgit v0.10.2 From a7b5895b91fb97f2b0dcc2e3ce47413c18d19ca5 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 26 Feb 2016 21:13:16 +0900 Subject: perf hists: Add more helper functions for the hierarchy mode The hists__overhead_width() is to calculate width occupied by the overhead (and others) columns before the sort columns. The hist_entry__has_hiearchy_children() is to check whether an entry has lower entries (children) in the hierarchy to be shown in the output. This means the children should not be filtered out and above the percent limit. These two functions will be used to show information when all children of an entry is omitted by the percent limit (or filter). Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456488800-28124-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index edbf854..7c0585c 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -643,6 +643,28 @@ unsigned int hists__sort_list_width(struct hists *hists) return ret; } +unsigned int hists__overhead_width(struct hists *hists) +{ + struct perf_hpp_fmt *fmt; + int ret = 0; + bool first = true; + struct perf_hpp dummy_hpp; + + hists__for_each_format(hists, fmt) { + if (perf_hpp__is_sort_entry(fmt) || perf_hpp__is_dynamic_entry(fmt)) + break; + + if (first) + first = false; + else + ret += 2; + + ret += fmt->width(fmt, &dummy_hpp, hists_to_evsel(hists)); + } + + return ret; +} + void perf_hpp__reset_width(struct perf_hpp_fmt *fmt, struct hists *hists) { if (perf_hpp__is_sort_entry(fmt)) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 1c53042..e716919 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1582,6 +1582,31 @@ struct rb_node *rb_hierarchy_prev(struct rb_node *node) return &he->rb_node; } +bool hist_entry__has_hierarchy_children(struct hist_entry *he, float limit) +{ + struct rb_node *node; + struct hist_entry *child; + float percent; + + if (he->leaf) + return false; + + node = rb_first(&he->hroot_out); + child = rb_entry(node, struct hist_entry, rb_node); + + while (node && child->filtered) { + node = rb_next(node); + child = rb_entry(node, struct hist_entry, rb_node); + } + + if (node) + percent = hist_entry__get_percent_limit(child); + else + percent = 0; + + return node && percent >= limit; +} + static void hists__remove_entry_filter(struct hists *hists, struct hist_entry *h, enum 
hist_filter filter) { diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 722aa44..da3e7b6 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -410,6 +410,7 @@ static inline int script_browse(const char *script_opt __maybe_unused) #endif unsigned int hists__sort_list_width(struct hists *hists); +unsigned int hists__overhead_width(struct hists *hists); void hist__account_cycles(struct branch_stack *bs, struct addr_location *al, struct perf_sample *sample, bool nonany_branch_mode); @@ -439,4 +440,6 @@ static inline struct rb_node *rb_hierarchy_next(struct rb_node *node) #define HIERARCHY_INDENT 3 +bool hist_entry__has_hierarchy_children(struct hist_entry *he, float limit); + #endif /* __PERF_HIST_H */ -- cgit v0.10.2 From bd4abd39db92225dde8335c37d6f4efb319f9cf2 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 26 Feb 2016 21:13:17 +0900 Subject: perf report: Show message for percent limit on stdio When the hierarchy mode is used, some entries might be omiited due to a percent limit or filter. In this case the output hierarchy is different than other entries. Add an informative message to users about this. For example, when 4% of percent limit is applied: Before: # Overhead Command / Shared Object / Symbol # .............. .......................................... # 49.09% swapper 48.67% [kernel.vmlinux] 34.42% [k] intel_idle 11.51% firefox 8.87% libpthread-2.22.so 6.60% [.] __GI___libc_recvmsg 10.49% gnome-shell 4.74% libc-2.22.so 10.08% Xorg 6.11% libc-2.22.so 5.27% [.] __memcpy_sse2_unaligned 6.15% perf Note that, gnome-shell/libc has no symbols and perf has no dso/symbols. With that patch the output will look like below: After: # Overhead Command / Shared Object / Symbol # .............. .......................................... # 49.09% swapper 48.67% [kernel.vmlinux] 34.42% [k] intel_idle 11.51% firefox 8.87% libpthread-2.22.so 6.60% [.] __GI___libc_recvmsg 10.49% gnome-shell 4.74% libc-2.22.so no entry >= 4.00% 10.08% Xorg 6.11% libc-2.22.so 5.27% [.] __memcpy_sse2_unaligned 6.15% perf no entry >= 4.00% Suggested-and-Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456488800-28124-2-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 435eaaa..b3bdfcb 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -628,6 +628,7 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, bool first = true; size_t linesz; char *line = NULL; + unsigned indent; init_rem_hits(); @@ -704,6 +705,8 @@ print_entries: goto out; } + indent = hists__overhead_width(hists) + 4; + for (nd = rb_first(&hists->entries); nd; nd = __rb_hierarchy_next(nd, HMD_FORCE_CHILD)) { struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); float percent; @@ -720,6 +723,20 @@ print_entries: if (max_rows && ++nr_rows >= max_rows) break; + /* + * If all children are filtered out or percent-limited, + * display "no entry >= x.xx%" message. 
+ */ + if (!h->leaf && !hist_entry__has_hierarchy_children(h, min_pcnt)) { + int nr_sort = hists->hpp_list->nr_sort_keys; + + print_hierarchy_indent(sep, nr_sort + h->depth + 1, spaces, fp); + fprintf(fp, "%*sno entry >= %.2f%%\n", indent, "", min_pcnt); + + if (max_rows && ++nr_rows >= max_rows) + break; + } + if (h->ms.map == NULL && verbose > 1) { __map_groups__fprintf_maps(h->thread->mg, MAP__FUNCTION, fp); -- cgit v0.10.2 From 201fde73b111e7c31fdc0e9fa6bc4b73dfef699d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 26 Feb 2016 21:13:18 +0900 Subject: perf hists browser: Cleanup hist_browser__update_percent_limit() The previous patch introduced __rb_hierarchy_next() function with various move direction like HMD_FORCE_CHILD but missed to change using it some place. Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456488800-28124-3-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 6bcd767..904eaa7 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -2477,12 +2477,7 @@ static void hist_browser__update_percent_limit(struct hist_browser *hb, min_callchain_hits, &callchain_param); next: - /* - * Tentatively set unfolded so that the rb_hierarchy_next() - * can toggle children of folded entries too. - */ - he->unfolded = he->has_children; - nd = rb_hierarchy_next(nd); + nd = __rb_hierarchy_next(nd, HMD_FORCE_CHILD); /* force to re-evaluate folding state of callchains */ he->init_have_children = false; -- cgit v0.10.2 From 79dded8776c2dc4d6e1229de69f4027e84d63673 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 26 Feb 2016 21:13:19 +0900 Subject: perf hists browser: Show message for percent limit Like the stdio, it should show messages about omitted hierarchy entries. Please refer the previous commit for more details. As it needs to check an entry is omitted or not multiple times, add the has_no_entry field in the hist entry. 
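For context, a simplified sketch of the bookkeeping this introduces (the real change is in the diff below): an unfolded hierarchy entry whose children are all filtered out or below the percent limit is flagged and accounted as a single synthetic row.

	/* Simplified sketch of the accounting; see the hunk in the diff below. */
	if (!he->leaf && he->nr_rows == 0) {
		he->has_no_entry = true;	/* browser prints "no entry >= x.xx%" */
		he->nr_rows = 1;		/* the message occupies one row */
	}
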
Suggested-and-Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456488800-28124-4-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 904eaa7..71c6d51 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -260,6 +260,9 @@ static int hierarchy_count_rows(struct hist_browser *hb, struct hist_entry *he, if (he->leaf) return callchain__count_rows(&he->sorted_chain); + if (he->has_no_entry) + return 1; + node = rb_first(&he->hroot_out); while (node) { float percent; @@ -409,10 +412,18 @@ static bool hist_browser__toggle_fold(struct hist_browser *browser) /* account grand children */ if (symbol_conf.report_hierarchy) browser->b.nr_entries += child_rows - he->nr_rows; + + if (!he->leaf && he->nr_rows == 0) { + he->has_no_entry = true; + he->nr_rows = 1; + } } else { if (symbol_conf.report_hierarchy) browser->b.nr_entries -= child_rows - he->nr_rows; + if (he->has_no_entry) + he->has_no_entry = false; + he->nr_rows = 0; } @@ -545,6 +556,12 @@ __hist_browser__set_folding(struct hist_browser *browser, bool unfold) browser->nr_hierarchy_entries++; if (he->leaf) browser->nr_callchain_rows += he->nr_rows; + else if (unfold && !hist_entry__has_hierarchy_children(he, browser->min_pcnt)) { + browser->nr_hierarchy_entries++; + he->has_no_entry = true; + he->nr_rows = 1; + } else + he->has_no_entry = false; } } @@ -1412,6 +1429,75 @@ show_callchain: return printed; } +static int hist_browser__show_no_entry(struct hist_browser *browser, + unsigned short row, + int level, int nr_sort_keys) +{ + int width = browser->b.width; + bool current_entry = ui_browser__is_current_entry(&browser->b, row); + bool first = true; + int column = 0; + int ret; + struct perf_hpp_fmt *fmt; + + if (current_entry) { + browser->he_selection = NULL; + browser->selection = NULL; + } + + hist_browser__gotorc(browser, row, 0); + + if (current_entry && browser->b.navkeypressed) + ui_browser__set_color(&browser->b, HE_COLORSET_SELECTED); + else + ui_browser__set_color(&browser->b, HE_COLORSET_NORMAL); + + ui_browser__write_nstring(&browser->b, "", level * HIERARCHY_INDENT); + width -= level * HIERARCHY_INDENT; + + hists__for_each_format(browser->hists, fmt) { + if (perf_hpp__should_skip(fmt, browser->hists) || + column++ < browser->b.horiz_scroll) + continue; + + if (perf_hpp__is_sort_entry(fmt) || + perf_hpp__is_dynamic_entry(fmt)) + break; + + ret = fmt->width(fmt, NULL, hists_to_evsel(browser->hists)); + + if (first) { + /* for folded sign */ + first = false; + ret++; + } else { + /* space between columns */ + ret += 2; + } + + ui_browser__write_nstring(&browser->b, "", ret); + width -= ret; + } + + ui_browser__write_nstring(&browser->b, "", nr_sort_keys * HIERARCHY_INDENT); + width -= nr_sort_keys * HIERARCHY_INDENT; + + if (column >= browser->b.horiz_scroll) { + char buf[32]; + + ret = snprintf(buf, sizeof(buf), "no entry >= %.2f%%", browser->min_pcnt); + ui_browser__printf(&browser->b, " %s", buf); + width -= ret + 2; + } + + /* The scroll bar isn't being used */ + if (!browser->b.navkeypressed) + width += 1; + + ui_browser__write_nstring(&browser->b, "", width); + return 1; +} + static int advance_hpp_check(struct perf_hpp *hpp, int inc) { advance_hpp(hpp, inc); @@ -1575,6 +1661,14 @@ static unsigned int hist_browser__refresh(struct ui_browser *browser) 
row += hist_browser__show_hierarchy_entry(hb, h, row, h->depth, nr_sort); + if (row == browser->rows) + break; + + if (h->has_no_entry) { + hist_browser__show_no_entry(hb, row, h->depth, + nr_sort); + row++; + } } else { row += hist_browser__show_entry(hb, h, row); } @@ -2461,6 +2555,11 @@ static void hist_browser__update_percent_limit(struct hist_browser *hb, while ((nd = hists__filter_entries(nd, hb->min_pcnt)) != NULL) { he = rb_entry(nd, struct hist_entry, rb_node); + if (he->has_no_entry) { + he->has_no_entry = false; + he->nr_rows = 0; + } + if (!he->leaf || !symbol_conf.use_callchain) goto next; diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index e716919..75dc41d 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1625,6 +1625,7 @@ static void hists__remove_entry_filter(struct hists *hists, struct hist_entry *h /* force fold unfiltered entry for simplicity */ parent->unfolded = false; + parent->has_no_entry = false; parent->row_offset = 0; parent->nr_rows = 0; next: @@ -1637,6 +1638,7 @@ next: /* force fold unfiltered entry for simplicity */ h->unfolded = false; + h->has_no_entry = false; h->row_offset = 0; h->nr_rows = 0; diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index a8d53ff..25a5529 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -117,6 +117,7 @@ struct hist_entry { bool init_have_children; bool unfolded; bool has_children; + bool has_no_entry; }; }; char *srcline; -- cgit v0.10.2 From 2ddda792373065577eb9207370cb7a28395caffa Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 26 Feb 2016 21:13:20 +0900 Subject: perf report: Show message for percent limit on gtk Like the stdio, it should show messages about omitted hierarchy entries. Please refer the previous commit for more details. Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456488800-28124-5-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c index 7f343339..a5758fd 100644 --- a/tools/perf/ui/gtk/hists.c +++ b/tools/perf/ui/gtk/hists.c @@ -449,6 +449,17 @@ static void perf_gtk__add_hierarchy_entries(struct hists *hists, perf_gtk__add_hierarchy_entries(hists, &he->hroot_out, store, &iter, hpp, min_pcnt); + + if (!hist_entry__has_hierarchy_children(he, min_pcnt)) { + char buf[32]; + GtkTreeIter child; + + snprintf(buf, sizeof(buf), "no entry >= %.2f%%", + min_pcnt); + + gtk_tree_store_append(store, &child, &iter); + gtk_tree_store_set(store, &child, col_idx, buf, -1); + } } if (symbol_conf.use_callchain && he->leaf) { -- cgit v0.10.2 From dfd55ad85e4a7fbaa82df12467515ac3c81e8a3e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 26 Feb 2016 17:57:13 +0100 Subject: arm64: vmemmap: use virtual projection of linear region Commit dd006da21646 ("arm64: mm: increase VA range of identity map") made some changes to the memory mapping code to allow physical memory to reside at an offset that exceeds the size of the virtual mapping. However, since the size of the vmemmap area is proportional to the size of the VA area, but it is populated relative to the physical space, we may end up with the struct page array being mapped outside of the vmemmap region. For instance, on my Seattle A0 box, I can see the following output in the dmesg log. 
vmemmap : 0xffffffbdc0000000 - 0xffffffbfc0000000 ( 8 GB maximum) 0xffffffbfc0000000 - 0xffffffbfd0000000 ( 256 MB actual) We can fix this by deciding that the vmemmap region is not a projection of the physical space, but of the virtual space above PAGE_OFFSET, i.e., the linear region. This way, we are guaranteed that the vmemmap region is of sufficient size, and we can even reduce the size by half. Cc: Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index bf464de..f506086 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -34,13 +34,13 @@ /* * VMALLOC and SPARSEMEM_VMEMMAP ranges. * - * VMEMAP_SIZE: allows the whole VA space to be covered by a struct page array + * VMEMAP_SIZE: allows the whole linear region to be covered by a struct page array * (rounded up to PUD_SIZE). * VMALLOC_START: beginning of the kernel VA space * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space, * fixed mappings and modules */ -#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE) +#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT - 1)) * sizeof(struct page), PUD_SIZE) #ifndef CONFIG_KASAN #define VMALLOC_START (VA_START) @@ -51,7 +51,8 @@ #define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K) -#define vmemmap ((struct page *)(VMALLOC_END + SZ_64K)) +#define VMEMMAP_START (VMALLOC_END + SZ_64K) +#define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) #define FIRST_USER_ADDRESS 0UL diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index f3b061e..7802f21 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -319,8 +319,8 @@ void __init mem_init(void) #endif MLG(VMALLOC_START, VMALLOC_END), #ifdef CONFIG_SPARSEMEM_VMEMMAP - MLG((unsigned long)vmemmap, - (unsigned long)vmemmap + VMEMMAP_SIZE), + MLG(VMEMMAP_START, + VMEMMAP_START + VMEMMAP_SIZE), MLM((unsigned long)virt_to_page(PAGE_OFFSET), (unsigned long)virt_to_page(high_memory)), #endif -- cgit v0.10.2 From 472681d57a5dde7c6d16b05469be57f1c4ed9d99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?MINOURA=20Makoto=20/=20=E7=AE=95=E6=B5=A6=20=E7=9C=9F?= Date: Thu, 25 Feb 2016 14:20:48 +0900 Subject: net: ndo_fdb_dump should report -EMSGSIZE to rtnl_fdb_dump. When the send skbuff reaches the end, nlmsg_put and friends returns -EMSGSIZE but it is silently thrown away in ndo_fdb_dump. It is called within a for_each_netdev loop and the first fdb entry of a following netdev could fit in the remaining skbuff. This breaks the mechanism of cb->args[0] and idx to keep track of the entries that are already dumped, which results missing entries in bridge fdb show command. Signed-off-by: Minoura Makoto Signed-off-by: David S. 
Miller diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index e6944b2..b601139 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -931,8 +931,10 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI, rd); - if (err < 0) + if (err < 0) { + cb->args[1] = err; goto out; + } skip: ++idx; } diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 82e3e97..dcea4f4 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -723,6 +723,8 @@ int br_fdb_dump(struct sk_buff *skb, struct net_bridge_fdb_entry *f; hlist_for_each_entry_rcu(f, &br->hash[i], hlist) { + int err; + if (idx < cb->args[0]) goto skip; @@ -741,12 +743,15 @@ int br_fdb_dump(struct sk_buff *skb, if (!filter_dev && f->dst) goto skip; - if (fdb_fill_info(skb, br, f, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - NLM_F_MULTI) < 0) + err = fdb_fill_info(skb, br, f, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, + NLM_F_MULTI); + if (err < 0) { + cb->args[1] = err; break; + } skip: ++idx; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d735e85..8261d95 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2911,6 +2911,7 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb, nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc); out: netif_addr_unlock_bh(dev); + cb->args[1] = err; return idx; } EXPORT_SYMBOL(ndo_dflt_fdb_dump); @@ -2944,6 +2945,7 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) ops = br_dev->netdev_ops; } + cb->args[1] = 0; for_each_netdev(net, dev) { if (brport_idx && (dev->ifindex != brport_idx)) continue; @@ -2971,12 +2973,16 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev, idx); } + if (cb->args[1] == -EMSGSIZE) + break; if (dev->netdev_ops->ndo_fdb_dump) idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, idx); else idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx); + if (cb->args[1] == -EMSGSIZE) + break; cops = NULL; } diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 47f7da5..8b5833c 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -1093,8 +1093,11 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, .cb = cb, .idx = idx, }; + int err; - switchdev_port_obj_dump(dev, &dump.fdb.obj, switchdev_port_fdb_dump_cb); + err = switchdev_port_obj_dump(dev, &dump.fdb.obj, + switchdev_port_fdb_dump_cb); + cb->args[1] = err; return dump.idx; } EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump); -- cgit v0.10.2 From 84b6ee8ea36ff797afa13c297a86ed0144482bee Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 27 Feb 2016 03:52:43 +0900 Subject: perf hists: Fix comparing of dynamic entries When hist_entry__cmp() and hist_entry__collapse() are called, they should check if the dynamic entry is comparing matching hists only. Otherwise it might access different hists resulting in incorrect output. 
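For context, the shape of the guard being added (the full change is in the diff below): the comparison callbacks now skip any dynamic sort key that was not defined for the hists being compared, so a key tied to one event can never dereference another event's data.

	/* Sketch of the guard; the complete change is in the diff below. */
	hists__for_each_sort_list(hists, fmt) {
		if (perf_hpp__is_dynamic_entry(fmt) &&
		    !perf_hpp__defined_dynamic_entry(fmt, hists))
			continue;	/* key belongs to a different event's hists */

		cmp = fmt->cmp(fmt, left, right);
		if (cmp)
			break;
	}
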
Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456512767-1164-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 75dc41d..cc849d3 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1002,6 +1002,10 @@ hist_entry__cmp(struct hist_entry *left, struct hist_entry *right) int64_t cmp = 0; hists__for_each_sort_list(hists, fmt) { + if (perf_hpp__is_dynamic_entry(fmt) && + !perf_hpp__defined_dynamic_entry(fmt, hists)) + continue; + cmp = fmt->cmp(fmt, left, right); if (cmp) break; @@ -1018,6 +1022,10 @@ hist_entry__collapse(struct hist_entry *left, struct hist_entry *right) int64_t cmp = 0; hists__for_each_sort_list(hists, fmt) { + if (perf_hpp__is_dynamic_entry(fmt) && + !perf_hpp__defined_dynamic_entry(fmt, hists)) + continue; + cmp = fmt->collapse(fmt, left, right); if (cmp) break; -- cgit v0.10.2 From d3a72fd8187b7fa0014394c9dec95ba349b3301e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 27 Feb 2016 03:52:44 +0900 Subject: perf report: Fix indentation of dynamic entries in hierarchy When dynamic entries are used in the hierarchy mode with multiple events, the output might not be aligned properly. In the hierarchy mode, the each sort column is indented using total number of sort keys. So it keeps track of number of sort keys when adding them. However a dynamic sort key can be added more than once when multiple events have same field names. This results in unnecessarily long indentation in the output. For example perf kmem records following events: $ perf evlist --trace-fields -i perf.data.kmem kmem:kmalloc: trace_fields: call_site,ptr,bytes_req,bytes_alloc,gfp_flags kmem:kmalloc_node: trace_fields: call_site,ptr,bytes_req,bytes_alloc,gfp_flags,node kmem:kfree: trace_fields: call_site,ptr kmem:kmem_cache_alloc: trace_fields: call_site,ptr,bytes_req,bytes_alloc,gfp_flags kmem:kmem_cache_alloc_node: trace_fields: call_site,ptr,bytes_req,bytes_alloc,gfp_flags,node kmem:kmem_cache_free: trace_fields: call_site,ptr kmem:mm_page_alloc: trace_fields: page,order,gfp_flags,migratetype kmem:mm_page_free: trace_fields: page,order As you can see, many field names shared between kmem events. So adding 'ptr' dynamic sort key alone will set nr_sort_keys to 6. And this adds many unnecessary spaces between columns. Before: $ perf report -i perf.data.kmem --hierarchy -s ptr -g none --stdio ... # Overhead ptr # ....................... ................................... # 99.89% 0xffff8803ffb79720 0.06% 0xffff8803d228a000 0.03% 0xffff8803f7678f00 0.00% 0xffff880401dc5280 0.00% 0xffff880406172380 0.00% 0xffff8803ffac3a00 0.00% 0xffff8803ffac1600 After: # Overhead ptr # ........ .................... 
# 99.89% 0xffff8803ffb79720 0.06% 0xffff8803d228a000 0.03% 0xffff8803f7678f00 0.00% 0xffff880401dc5280 0.00% 0xffff880406172380 0.00% 0xffff8803ffac3a00 0.00% 0xffff8803ffac1600 Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456512767-1164-2-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 71c6d51..5f74c67 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1547,7 +1547,7 @@ static int hists_browser__scnprintf_hierarchy_headers(struct hist_browser *brows struct perf_hpp_fmt *fmt; size_t ret = 0; int column = 0; - int nr_sort_keys = hists->hpp_list->nr_sort_keys; + int nr_sort_keys = hists->nr_sort_keys; bool first = true; ret = scnprintf(buf, size, " "); @@ -1632,7 +1632,7 @@ static unsigned int hist_browser__refresh(struct ui_browser *browser) u16 header_offset = 0; struct rb_node *nd; struct hist_browser *hb = container_of(browser, struct hist_browser, b); - int nr_sort = hb->hists->hpp_list->nr_sort_keys; + int nr_sort = hb->hists->nr_sort_keys; if (hb->show_headers) { hist_browser__show_headers(hb); @@ -1969,7 +1969,7 @@ static int hist_browser__fprintf(struct hist_browser *browser, FILE *fp) struct rb_node *nd = hists__filter_entries(rb_first(browser->b.entries), browser->min_pcnt); int printed = 0; - int nr_sort = browser->hists->hpp_list->nr_sort_keys; + int nr_sort = browser->hists->nr_sort_keys; while (nd) { struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index b3bdfcb..5733d6c 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -495,7 +495,7 @@ static int hist_entry__fprintf(struct hist_entry *he, size_t size, size = hpp.size = bfsz; if (symbol_conf.report_hierarchy) { - int nr_sort = hists->hpp_list->nr_sort_keys; + int nr_sort = hists->nr_sort_keys; return hist_entry__hierarchy_fprintf(he, &hpp, nr_sort, hists, fp); @@ -529,7 +529,7 @@ static int print_hierarchy_header(struct hists *hists, struct perf_hpp *hpp, unsigned header_width = 0; struct perf_hpp_fmt *fmt; - nr_sort = hists->hpp_list->nr_sort_keys; + nr_sort = hists->nr_sort_keys; /* preserve max indent depth for column headers */ print_hierarchy_indent(sep, nr_sort, spaces, fp); @@ -728,7 +728,7 @@ print_entries: * display "no entry >= x.xx%" message. 
*/ if (!h->leaf && !hist_entry__has_hierarchy_children(h, min_pcnt)) { - int nr_sort = hists->hpp_list->nr_sort_keys; + int nr_sort = hists->nr_sort_keys; print_hierarchy_indent(sep, nr_sort + h->depth + 1, spaces, fp); fprintf(fp, "%*sno entry >= %.2f%%\n", indent, "", min_pcnt); diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index da3e7b6..da5e505 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -78,6 +78,7 @@ struct hists { u16 col_len[HISTC_NR_COLS]; int socket_filter; struct perf_hpp_list *hpp_list; + int nr_sort_keys; }; struct hist_entry_iter; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 6bee8bd..2beb7a6 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2633,6 +2633,9 @@ out: int setup_sorting(struct perf_evlist *evlist) { int err; + struct hists *hists; + struct perf_evsel *evsel; + struct perf_hpp_fmt *fmt; err = __setup_sorting(evlist); if (err < 0) @@ -2644,6 +2647,22 @@ int setup_sorting(struct perf_evlist *evlist) return err; } + evlist__for_each(evlist, evsel) { + hists = evsel__hists(evsel); + hists->nr_sort_keys = perf_hpp_list.nr_sort_keys; + + /* + * If dynamic entries were used, it might add multiple + * entries to each evsel for a single field name. Set + * actual number of sort keys for each hists. + */ + perf_hpp_list__for_each_sort_list(&perf_hpp_list, fmt) { + if (perf_hpp__is_dynamic_entry(fmt) && + !perf_hpp__defined_dynamic_entry(fmt, hists)) + hists->nr_sort_keys--; + } + } + reset_dimensions(); /* -- cgit v0.10.2 From cb1fab917206f822d1f905cbc45971eefdef361d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 27 Feb 2016 03:52:45 +0900 Subject: perf report: Left align dynamic entries in hierarchy The dynamic entries are right-aligned unlike other entries since it usually has numeric value. But for the hierarchy mode, left alignment is more appropriate IMHO. Also trim spaces on the left so that we can easily identify the hierarchy. Before: $ perf report --hierarchy -i perf.data.kmem -s gfp_flags,ptr,bytes_req --stdio -g none ... # # Overhead gfp_flags / ptr / bytes_req # .............. ................................................................................................. # 91.67% GFP_ATOMIC|GFP_NOWARN|GFP_NOMEMALLOC 37.50% 0xffff8803f7669400 37.50% 448 8.33% 0xffff8803f766be00 8.33% 96 4.17% 0xffff8800d156dc00 4.17% 704 After: # Overhead gfp_flags / ptr / bytes_req # .............. .................................... 
# 91.67% GFP_ATOMIC|GFP_NOWARN|GFP_NOMEMALLOC 37.50% 0xffff8803f7669400 37.50% 448 8.33% 0xffff8803f766be00 8.33% 96 4.17% 0xffff8800d156dc00 4.17% 704 Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456512767-1164-3-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 5f74c67..5ffffcb 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1400,8 +1400,13 @@ static int hist_browser__show_hierarchy_entry(struct hist_browser *browser, if (fmt->color) { width -= fmt->color(fmt, &hpp, entry); } else { + int i = 0; + width -= fmt->entry(fmt, &hpp, entry); - ui_browser__printf(&browser->b, "%s", s); + ui_browser__printf(&browser->b, "%s", ltrim(s)); + + while (isspace(s[i++])) + width++; } } @@ -1576,6 +1581,8 @@ static int hists_browser__scnprintf_hierarchy_headers(struct hist_browser *brows return ret; hists__for_each_format(hists, fmt) { + char *start; + if (!perf_hpp__is_sort_entry(fmt) && !perf_hpp__is_dynamic_entry(fmt)) continue; if (perf_hpp__should_skip(fmt, hists)) @@ -1593,7 +1600,12 @@ static int hists_browser__scnprintf_hierarchy_headers(struct hist_browser *brows dummy_hpp.buf[ret] = '\0'; rtrim(dummy_hpp.buf); - ret = strlen(dummy_hpp.buf); + start = ltrim(dummy_hpp.buf); + ret = strlen(start); + + if (start != dummy_hpp.buf) + memmove(dummy_hpp.buf, start, ret + 1); + if (advance_hpp_check(&dummy_hpp, ret)) break; } diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 5733d6c..6d06fbb 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -418,6 +418,7 @@ static int hist_entry__hierarchy_fprintf(struct hist_entry *he, const char *sep = symbol_conf.field_sep; struct perf_hpp_fmt *fmt; char *buf = hpp->buf; + size_t size = hpp->size; int ret, printed = 0; bool first = true; @@ -457,6 +458,11 @@ static int hist_entry__hierarchy_fprintf(struct hist_entry *he, (nr_sort_key - 1) * HIERARCHY_INDENT + 2, ""); advance_hpp(hpp, ret); + printed += fprintf(fp, "%s", buf); + + hpp->buf = buf; + hpp->size = size; + /* * No need to call hist_entry__snprintf_alignment() since this * fmt is always the last column in the hierarchy mode. 
@@ -467,7 +473,11 @@ static int hist_entry__hierarchy_fprintf(struct hist_entry *he, else fmt->entry(fmt, hpp, he); - printed += fprintf(fp, "%s\n", buf); + /* + * dynamic entries are right-aligned but we want left-aligned + * in the hierarchy mode + */ + printed += fprintf(fp, "%s\n", ltrim(buf)); if (symbol_conf.use_callchain && he->leaf) { u64 total = hists__total_period(hists); @@ -525,6 +535,7 @@ static int print_hierarchy_header(struct hists *hists, struct perf_hpp *hpp, { bool first = true; int nr_sort; + int depth; unsigned width = 0; unsigned header_width = 0; struct perf_hpp_fmt *fmt; @@ -558,19 +569,16 @@ static int print_hierarchy_header(struct hists *hists, struct perf_hpp *hpp, if (!first) header_width += fprintf(fp, " / "); else { - header_width += fprintf(fp, "%s", sep ?: " "); + fprintf(fp, "%s", sep ?: " "); first = false; } fmt->header(fmt, hpp, hists_to_evsel(hists)); rtrim(hpp->buf); - header_width += fprintf(fp, "%s", hpp->buf); + header_width += fprintf(fp, "%s", ltrim(hpp->buf)); } - /* preserve max indent depth for combined sort headers */ - print_hierarchy_indent(sep, nr_sort, spaces, fp); - fprintf(fp, "\n# "); /* preserve max indent depth for initial dots */ @@ -590,6 +598,7 @@ static int print_hierarchy_header(struct hists *hists, struct perf_hpp *hpp, fprintf(fp, "%.*s", width, dots); } + depth = 0; hists__for_each_format(hists, fmt) { if (!perf_hpp__is_sort_entry(fmt) && !perf_hpp__is_dynamic_entry(fmt)) continue; @@ -597,15 +606,16 @@ static int print_hierarchy_header(struct hists *hists, struct perf_hpp *hpp, continue; width = fmt->width(fmt, hpp, hists_to_evsel(hists)); + width += depth * HIERARCHY_INDENT; + if (width > header_width) header_width = width; + + depth++; } fprintf(fp, "%s%-.*s", sep ?: " ", header_width, dots); - /* preserve max indent depth for dots under sort headers */ - print_hierarchy_indent(sep, nr_sort, dots, fp); - fprintf(fp, "\n#\n"); return 2; -- cgit v0.10.2 From e1cd3911170eda37fd9501e81ee1c2eb63803fd0 Mon Sep 17 00:00:00 2001 From: jiangyiwen Date: Tue, 16 Feb 2016 20:14:13 +0800 Subject: SCSI: Free resources when we return BLKPREP_INVALID When called scsi_prep_fn return BLKPREP_INVALID, we should use the same code with BLKPREP_KILL in scsi_prep_return. Signed-off-by: Yiwen Jiang Signed-off-by: Martin K. Petersen diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index fa6b2c4..8c6e318 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1344,6 +1344,7 @@ scsi_prep_return(struct request_queue *q, struct request *req, int ret) switch (ret) { case BLKPREP_KILL: + case BLKPREP_INVALID: req->errors = DID_NO_CONNECT << 16; /* release the command and kill it */ if (req->special) { -- cgit v0.10.2 From 21b81716c6bff24cda52dc75588455f879ddbfe9 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Thu, 25 Feb 2016 13:54:20 -0300 Subject: ipr: Fix regression when loading firmware Commit d63c7dd5bcb9 ("ipr: Fix out-of-bounds null overwrite") removed the end of line handling when storing the update_fw sysfs attribute. This changed the userpace API because it started refusing writes terminated by a line feed, which broke the update tools we already have. This patch re-adds that handling, so both a write terminated by a line feed or not can make it through with the update. Fixes: d63c7dd5bcb9 ("ipr: Fix out-of-bounds null overwrite") Signed-off-by: Gabriel Krisman Bertazi Cc: Insu Yun Acked-by: Brian King Signed-off-by: Martin K. 
Petersen diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index 3b3e099..d6a691e 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -4002,6 +4002,7 @@ static ssize_t ipr_store_update_fw(struct device *dev, struct ipr_sglist *sglist; char fname[100]; char *src; + char *endline; int result, dnld_size; if (!capable(CAP_SYS_ADMIN)) @@ -4009,6 +4010,10 @@ static ssize_t ipr_store_update_fw(struct device *dev, snprintf(fname, sizeof(fname), "%s", buf); + endline = strchr(fname, '\n'); + if (endline) + *endline = '\0'; + if (request_firmware(&fw_entry, fname, &ioa_cfg->pdev->dev)) { dev_err(&ioa_cfg->pdev->dev, "Firmware file %s not found\n", fname); return -EIO; -- cgit v0.10.2 From e049d4a3fa194c8aa0d3ca29a9b11b32387ca6e3 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 27 Feb 2016 03:52:46 +0900 Subject: perf hists: Fix dynamic entry display in hierarchy When dynamic sort key is used it might not show pretty printed output. This is because the trace output was not set only for the first dynamic sort key. During hierarchy_insert_entry() it missed to pass the trace_output to dynamic entries. Also even if it did, only first entry will have it. Subsequent entries might set it during collapsing stage but it's not guaranteed. Before: $ perf report --hierarchy --stdio -s ptr,bytes_req,gfp_flags -g none # # Overhead ptr / bytes_req / gfp_flags # .............. .......................................... # 37.50% 0xffff8803f7669400 37.50% 448 37.50% 66080 10.42% 0xffff8803f766be00 8.33% 96 8.33% 66080 2.08% 512 2.08% 67280 After: # # Overhead ptr / bytes_req / gfp_flags # .............. .......................................... # 37.50% 0xffff8803f7669400 37.50% 448 37.50% GFP_ATOMIC|GFP_NOWARN|GFP_NOMEMALLOC 10.42% 0xffff8803f766be00 8.33% 96 8.33% GFP_ATOMIC|GFP_NOWARN|GFP_NOMEMALLOC 2.08% 512 2.08% GFP_KERNEL|GFP_NOWARN|GFP_REPEAT|GFP Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456512767-1164-4-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index cc849d3..9b3f582 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1125,7 +1125,7 @@ static struct hist_entry *hierarchy_insert_entry(struct hists *hists, new->fmt = fmt; /* some fields are now passed to 'new' */ - if (perf_hpp__is_trace_entry(fmt)) + if (perf_hpp__is_trace_entry(fmt) || perf_hpp__is_dynamic_entry(fmt)) he->trace_output = NULL; else new->trace_output = NULL; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 2beb7a6..d26c6b9 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1764,6 +1764,9 @@ static int __sort__hde_entry(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp, if (hde->raw_trace) goto raw_field; + if (!he->trace_output) + he->trace_output = get_trace_output(he); + field = hde->field; namelen = strlen(field->name); str = he->trace_output; -- cgit v0.10.2 From abab5e7fcec16e526968f8a5448cd81c635705ce Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 27 Feb 2016 03:52:47 +0900 Subject: perf report: Update column width of dynamic entries The column width of dynamic entries is updated when comparing hist entries. However some unique entries can miss the chance to update. So move the update to output resort stage to make sure every entry will get called before display. 
To do that, abuse ->sort callback to update the width when the third argument is NULL. When resorting entries in normal path, it never be NULL so it should be fine IMHO. Before: # Overhead ptr / bytes_req / gfp_flags # .............. .......................................... # 37.50% 0xffff8803f7669400 37.50% 448 37.50% GFP_ATOMIC|GFP_NOWARN|GFP_NOMEMALLOC 10.42% 0xffff8803f766be00 8.33% 96 8.33% GFP_ATOMIC|GFP_NOWARN|GFP_NOMEMALLOC 2.08% 512 2.08% GFP_KERNEL|GFP_NOWARN|GFP_REPEAT|GFP <-- here After: # Overhead ptr / bytes_req / gfp_flags # .............. ..................................................... # 37.50% 0xffff8803f7669400 37.50% 448 37.50% GFP_ATOMIC|GFP_NOWARN|GFP_NOMEMALLOC 10.42% 0xffff8803f766be00 8.33% 96 8.33% GFP_ATOMIC|GFP_NOWARN|GFP_NOMEMALLOC 2.08% 512 2.08% GFP_KERNEL|GFP_NOWARN|GFP_REPEAT|GFP_NOMEMALLOC Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1456512767-1164-5-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 9b3f582..4b8b67b 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1371,6 +1371,10 @@ static void hierarchy_insert_output_entry(struct rb_root *root, rb_link_node(&he->rb_node, parent, p); rb_insert_color(&he->rb_node, root); + + /* update column width of dynamic entry */ + if (perf_hpp__is_dynamic_entry(he->fmt)) + he->fmt->sort(he->fmt, he, NULL); } static void hists__hierarchy_output_resort(struct hists *hists, @@ -1440,6 +1444,7 @@ static void __hists__insert_output_entry(struct rb_root *entries, struct rb_node **p = &entries->rb_node; struct rb_node *parent = NULL; struct hist_entry *iter; + struct perf_hpp_fmt *fmt; if (use_callchain) { if (callchain_param.mode == CHAIN_GRAPH_REL) { @@ -1466,6 +1471,12 @@ static void __hists__insert_output_entry(struct rb_root *entries, rb_link_node(&he->rb_node, parent, p); rb_insert_color(&he->rb_node, entries); + + perf_hpp_list__for_each_sort_list(&perf_hpp_list, fmt) { + if (perf_hpp__is_dynamic_entry(fmt) && + perf_hpp__defined_dynamic_entry(fmt, he->hists)) + fmt->sort(fmt, he, NULL); /* update column width */ + } } static void output_resort(struct hists *hists, struct ui_progress *prog, diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index d26c6b9..5888bfe 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1816,6 +1816,11 @@ static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt, hde = container_of(fmt, struct hpp_dynamic_entry, hpp); + if (b == NULL) { + update_dynamic_len(hde, a); + return 0; + } + field = hde->field; if (field->flags & FIELD_IS_DYNAMIC) { unsigned long long dyn; @@ -1830,9 +1835,6 @@ static int64_t __sort__hde_cmp(struct perf_hpp_fmt *fmt, } else { offset = field->offset; size = field->size; - - update_dynamic_len(hde, a); - update_dynamic_len(hde, b); } return memcmp(a->raw_data + offset, b->raw_data + offset, size); -- cgit v0.10.2 From b8cbb349061edda648463b086cfa869a7ab583af Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 26 Feb 2016 09:31:51 +0000 Subject: perf config: Bring perf_default_config to the very beginning at main() Before this patch each subcommand calls perf_config() by themself, reading the default configuration together with subcommand specific options. 
If a subcommand doesn't have it own options, it needs to call 'perf_config(perf_default_config, NULL)' to ensure .perfconfig is loaded. This patch brings perf_config(perf_default_config, NULL) to the very start of main(), so subcommands don't need to do it. After this patch, 'llvm.clang-path' works for 'perf trace'. Signed-off-by: Wang Nan Suggested-and-Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456479154-136027-4-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 36ccc2b..4d72359 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -1264,8 +1264,6 @@ int cmd_diff(int argc, const char **argv, const char *prefix __maybe_unused) if (ret < 0) return ret; - perf_config(perf_default_config, NULL); - argc = parse_options(argc, argv, options, diff_usage, 0); if (symbol__init(NULL) < 0) diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c index f4dd2b4..49d55e2 100644 --- a/tools/perf/builtin-help.c +++ b/tools/perf/builtin-help.c @@ -272,7 +272,7 @@ static int perf_help_config(const char *var, const char *value, void *cb) if (!prefixcmp(var, "man.")) return add_man_viewer_info(var, value); - return perf_default_config(var, value, cb); + return 0; } static struct cmdnames main_cmds, other_cmds; diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 1180105..4d3340c 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -1834,7 +1834,7 @@ static int __cmd_record(int argc, const char **argv) return cmd_record(i, rec_argv, NULL); } -static int kmem_config(const char *var, const char *value, void *cb) +static int kmem_config(const char *var, const char *value, void *cb __maybe_unused) { if (!strcmp(var, "kmem.default")) { if (!strcmp(value, "slab")) @@ -1847,7 +1847,7 @@ static int kmem_config(const char *var, const char *value, void *cb) return 0; } - return perf_default_config(var, value, cb); + return 0; } int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index f4d8244..7eea49f 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -90,7 +90,7 @@ static int report__config(const char *var, const char *value, void *cb) return 0; } - return perf_default_config(var, value, cb); + return 0; } static int hist_iter__report_callback(struct hist_entry_iter *iter, diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index b86b623..94af190 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -1065,7 +1065,7 @@ parse_callchain_opt(const struct option *opt, const char *arg, int unset) return parse_callchain_top_opt(arg); } -static int perf_top_config(const char *var, const char *value, void *cb) +static int perf_top_config(const char *var, const char *value, void *cb __maybe_unused) { if (!strcmp(var, "top.call-graph")) var = "call-graph.record-mode"; /* fall-through */ @@ -1074,7 +1074,7 @@ static int perf_top_config(const char *var, const char *value, void *cb) return 0; } - return perf_default_config(var, value, cb); + return 0; } static int diff --git a/tools/perf/perf.c b/tools/perf/perf.c index f632119..aaee0a7 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -548,6 +548,8 @@ int main(int argc, const char **argv) srandom(time(NULL)); + 
perf_config(perf_default_config, NULL); + /* get debugfs/tracefs mount point from /proc/mounts */ tracing_path_mount(); diff --git a/tools/perf/tests/llvm.c b/tools/perf/tests/llvm.c index 70edcdf..cff564f 100644 --- a/tools/perf/tests/llvm.c +++ b/tools/perf/tests/llvm.c @@ -6,12 +6,6 @@ #include "tests.h" #include "debug.h" -static int perf_config_cb(const char *var, const char *val, - void *arg __maybe_unused) -{ - return perf_default_config(var, val, arg); -} - #ifdef HAVE_LIBBPF_SUPPORT static int test__bpf_parsing(void *obj_buf, size_t obj_buf_sz) { @@ -77,8 +71,6 @@ test_llvm__fetch_bpf_obj(void **p_obj_buf, if (should_load_fail) *should_load_fail = bpf_source_table[idx].should_load_fail; - perf_config(perf_config_cb, NULL); - /* * Skip this test if user's .perfconfig doesn't set [llvm] section * and clang is not found in $PATH, and this is not perf test -v diff --git a/tools/perf/util/color.c b/tools/perf/util/color.c index e5fb88b..43e84aa 100644 --- a/tools/perf/util/color.c +++ b/tools/perf/util/color.c @@ -32,14 +32,15 @@ int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty) return 0; } -int perf_color_default_config(const char *var, const char *value, void *cb) +int perf_color_default_config(const char *var, const char *value, + void *cb __maybe_unused) { if (!strcmp(var, "color.ui")) { perf_use_color_default = perf_config_colorbool(var, value, -1); return 0; } - return perf_default_config(var, value, cb); + return 0; } static int __color_vsnprintf(char *bf, size_t size, const char *color, diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index b722e57..6729f4d 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -1117,7 +1117,7 @@ static int convert__config(const char *var, const char *value, void *cb) return 0; } - return perf_default_config(var, value, cb); + return 0; } int bt_convert__perf2ctf(const char *input, const char *path, bool force) diff --git a/tools/perf/util/help-unknown-cmd.c b/tools/perf/util/help-unknown-cmd.c index dc1e41c..43a98a4 100644 --- a/tools/perf/util/help-unknown-cmd.c +++ b/tools/perf/util/help-unknown-cmd.c @@ -6,7 +6,8 @@ static int autocorrect; static struct cmdnames aliases; -static int perf_unknown_cmd_config(const char *var, const char *value, void *cb) +static int perf_unknown_cmd_config(const char *var, const char *value, + void *cb __maybe_unused) { if (!strcmp(var, "help.autocorrect")) autocorrect = perf_config_int(var,value); @@ -14,7 +15,7 @@ static int perf_unknown_cmd_config(const char *var, const char *value, void *cb) if (!prefixcmp(var, "alias.")) add_cmdname(&aliases, var + 6, strlen(var + 6)); - return perf_default_config(var, value, cb); + return 0; } static int levenshtein_compare(const void *p1, const void *p2) -- cgit v0.10.2 From fdf14720fbd02d406ac2c1c50444774b4c7eed9a Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 26 Feb 2016 09:31:53 +0000 Subject: perf tools: Only set filter for tracepoints events perf_evlist__set_filter() tries to set filter to every evsel linked in the evlist. However, since filters can only be applied to tracepoints, checking type of evsel before calling perf_evsel__set_filter() would be better. 
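As an illustration (not part of this patch), the rule can be sketched in stand-alone C: walk the event list and only attach the filter string to tracepoint events, skipping everything else. The types and names below are simplified stand-ins, not the actual perf-tools structures.

#include <stdio.h>

enum ev_type { EV_HARDWARE, EV_SOFTWARE, EV_TRACEPOINT };

struct evsel {
        const char *name;
        enum ev_type type;
        const char *filter;
};

/* Attach @filter to every tracepoint event and silently skip the rest,
 * mirroring the check added to perf_evlist__set_filter(). */
static void evlist_set_filter(struct evsel *evsels, int nr, const char *filter)
{
        for (int i = 0; i < nr; i++) {
                if (evsels[i].type != EV_TRACEPOINT)
                        continue;       /* filters only apply to tracepoints */
                evsels[i].filter = filter;
        }
}

int main(void)
{
        struct evsel evs[] = {
                { "cycles",             EV_HARDWARE,   NULL },
                { "sched:sched_switch", EV_TRACEPOINT, NULL },
        };

        evlist_set_filter(evs, 2, "prev_pid != 0");
        for (int i = 0; i < 2; i++)
                printf("%-20s filter=%s\n", evs[i].name,
                       evs[i].filter ? evs[i].filter : "(none)");
        return 0;
}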
Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Li Zefan Cc: Peter Zijlstra Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456479154-136027-6-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index c42e196..86a0383 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -1223,6 +1223,9 @@ int perf_evlist__set_filter(struct perf_evlist *evlist, const char *filter) int err = 0; evlist__for_each(evlist, evsel) { + if (evsel->attr.type != PERF_TYPE_TRACEPOINT) + continue; + err = perf_evsel__set_filter(evsel, filter); if (err) break; -- cgit v0.10.2 From ba50423530200659d4deb703a8f72d3b69bc13ce Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 26 Feb 2016 09:31:54 +0000 Subject: perf trace: Call bpf__apply_obj_config in 'perf trace' Without this patch BPF map configuration is not applied. Command like this: # ./perf trace --ev bpf-output/no-inherit,name=evt/ \ --ev ./test_bpf_trace.c/map:channel.event=evt/ \ usleep 100000 Load BPF files without error, but since map:channel.event=evt is not applied, bpf-output event not work. This patch allows 'perf trace' load and run BPF scripts. Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Li Zefan Cc: Peter Zijlstra Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456479154-136027-7-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 20916dd..254149c 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -33,6 +33,7 @@ #include "util/stat.h" #include "trace-event.h" #include "util/parse-events.h" +#include "util/bpf-loader.h" #include #include @@ -2586,6 +2587,16 @@ static int trace__run(struct trace *trace, int argc, const char **argv) if (err < 0) goto out_error_open; + err = bpf__apply_obj_config(); + if (err) { + char errbuf[BUFSIZ]; + + bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf)); + pr_err("ERROR: Apply config to BPF failed: %s\n", + errbuf); + goto out_error_open; + } + /* * Better not use !target__has_task() here because we need to cover the * case where no threads were specified in the command line, but a -- cgit v0.10.2 From 1d6c9407d45dd622b277ca9f725da3cc9e95b5de Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 26 Feb 2016 09:31:55 +0000 Subject: perf trace: Print content of bpf-output event With this patch the contend of BPF output event is printed by 'perf trace'. For example: # ./perf trace -a --ev bpf-output/no-inherit,name=evt/ \ --ev ./test_bpf_trace.c/map:channel.event=evt/ \ usleep 100000 ... 1.787 ( 0.004 ms): usleep/3832 nanosleep(rqtp: 0x7ffc78b18980 ) ... 1.787 ( ): evt:Raise a BPF event!..) 1.788 ( ): perf_bpf_probe:func_begin:(ffffffff810e97d0)) ... 101.866 (87.038 ms): gmain/1654 poll(ufds: 0x7f57a80008c0, nfds: 2, timeout_msecs: 1000 ) ... 101.866 ( ): evt:Raise a BPF event!..) 101.867 ( ): perf_bpf_probe:func_end:(ffffffff810e97d0 <- ffffffff81796173)) 101.869 (100.087 ms): usleep/3832 ... [continued]: nanosleep()) = 0 ... (There is an extra ')' at the end of several lines. However, it is another problem, unrelated to this commit.) 
Where test_bpf_trace.c is: /************************ BEGIN **************************/ #include struct bpf_map_def { unsigned int type; unsigned int key_size; unsigned int value_size; unsigned int max_entries; }; #define SEC(NAME) __attribute__((section(NAME), used)) static u64 (*ktime_get_ns)(void) = (void *)BPF_FUNC_ktime_get_ns; static int (*trace_printk)(const char *fmt, int fmt_size, ...) = (void *)BPF_FUNC_trace_printk; static int (*get_smp_processor_id)(void) = (void *)BPF_FUNC_get_smp_processor_id; static int (*perf_event_output)(void *, struct bpf_map_def *, int, void *, unsigned long) = (void *)BPF_FUNC_perf_event_output; struct bpf_map_def SEC("maps") channel = { .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, .key_size = sizeof(int), .value_size = sizeof(u32), .max_entries = __NR_CPUS__, }; static inline int __attribute__((always_inline)) func(void *ctx, int type) { char output_str[] = "Raise a BPF event!"; char err_str[] = "BAD %d\n"; int err; err = perf_event_output(ctx, &channel, get_smp_processor_id(), &output_str, sizeof(output_str)); if (err) trace_printk(err_str, sizeof(err_str), err); return 1; } SEC("func_begin=sys_nanosleep") int func_begin(void *ctx) {return func(ctx, 1);} SEC("func_end=sys_nanosleep%return") int func_end(void *ctx) { return func(ctx, 2);} char _license[] SEC("license") = "GPL"; int _version SEC("version") = LINUX_VERSION_CODE; /************************* END ***************************/ Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: Jiri Olsa Cc: Li Zefan Cc: Peter Zijlstra Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456479154-136027-8-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 254149c..26a337f 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -2178,6 +2178,37 @@ out_dump: return 0; } +static void bpf_output__printer(enum binary_printer_ops op, + unsigned int val, void *extra) +{ + FILE *output = extra; + unsigned char ch = (unsigned char)val; + + switch (op) { + case BINARY_PRINT_CHAR_DATA: + fprintf(output, "%c", isprint(ch) ? ch : '.'); + break; + case BINARY_PRINT_DATA_BEGIN: + case BINARY_PRINT_LINE_BEGIN: + case BINARY_PRINT_ADDR: + case BINARY_PRINT_NUM_DATA: + case BINARY_PRINT_NUM_PAD: + case BINARY_PRINT_SEP: + case BINARY_PRINT_CHAR_PAD: + case BINARY_PRINT_LINE_END: + case BINARY_PRINT_DATA_END: + default: + break; + } +} + +static void bpf_output__fprintf(struct trace *trace, + struct perf_sample *sample) +{ + print_binary(sample->raw_data, sample->raw_size, 8, + bpf_output__printer, trace->output); +} + static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, union perf_event *event __maybe_unused, struct perf_sample *sample) @@ -2190,7 +2221,9 @@ static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, fprintf(trace->output, "%s:", evsel->name); - if (evsel->tp_format) { + if (perf_evsel__is_bpf_output(evsel)) { + bpf_output__fprintf(trace, sample); + } else if (evsel->tp_format) { event_format__fprintf(evsel->tp_format, sample->cpu, sample->raw_data, sample->raw_size, trace->output); -- cgit v0.10.2 From 7aca0c07207385cca76025cc85231519935722b9 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Fri, 26 Feb 2016 19:14:13 -0800 Subject: clocksource: Introduce clocksource_freq2mult() The clocksource_khz2mult() and clocksource_hz2mult() share similar code wihch calculates a mult from the given frequency. 
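As an illustration (not part of this patch), the arithmetic the two helpers share can be sketched in stand-alone C (rounding omitted); 'from' is the length of the reference interval in nanoseconds, which is the only input that differs between the Hz and kHz variants.

#include <stdint.h>
#include <stdio.h>

/* Pick mult so that ns = (cycles * mult) >> shift, i.e.
 *   mult = (from << shift) / freq */
static uint32_t freq2mult(uint32_t freq, uint32_t shift, uint64_t from)
{
        return (uint32_t)((from << shift) / freq);
}

int main(void)
{
        /* e.g. a 19.2 MHz counter with shift = 24: one cycle is ~52.08 ns */
        uint32_t mult = freq2mult(19200000u, 24, 1000000000ull);

        printf("mult = %u (%.2f ns/cycle)\n",
               (unsigned)mult, mult / (double)(1u << 24));
        return 0;
}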
Both implementations in differ only in value of a frequency. This patch introduces the clocksource_freq2mult() helper with generic implementation of mult calculation to prevent code duplication. Signed-off-by: Alexander Kuleshov Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Richard Cochran Link: http://lkml.kernel.org/r/1456542854-22104-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 6013021..a307bf6 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -118,6 +118,23 @@ struct clocksource { /* simplify initialization of mask field */ #define CLOCKSOURCE_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1) +static inline u32 clocksource_freq2mult(u32 freq, u32 shift_constant, u64 from) +{ + /* freq = cyc/from + * mult/2^shift = ns/cyc + * mult = ns/cyc * 2^shift + * mult = from/freq * 2^shift + * mult = from * 2^shift / freq + * mult = (from< Date: Fri, 26 Feb 2016 19:14:14 -0800 Subject: jiffies: Use CLOCKSOURCE_MASK instead of constant The CLOCKSOURCE_MASK(32) macro expands to the same value, but makes code more readable. Signed-off-by: Alexander Kuleshov Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Richard Cochran Link: http://lkml.kernel.org/r/1456542854-22104-3-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 347fecf..555e21f 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -68,7 +68,7 @@ static struct clocksource clocksource_jiffies = { .name = "jiffies", .rating = 1, /* lowest valid rating*/ .read = jiffies_read, - .mask = 0xffffffff, /*32bits*/ + .mask = CLOCKSOURCE_MASK(32), .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ .shift = JIFFIES_SHIFT, .max_cycles = 10, -- cgit v0.10.2 From b009b096c473669311612a05b1dc2057e0ef256c Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 24 Feb 2016 16:24:54 +0100 Subject: dt-bindings: interrupt-controller: Add SoC-specific compatible string to Marvell ODMI As requested by Rob Herring, this commit adds a SoC-specific compatible string to the Marvell ODMI DT binding. Signed-off-by: Thomas Petazzoni Link: https://lkml.kernel.org/r/1456327494-31358-1-git-send-email-thomas.petazzoni@free-electrons.com Signed-off-by: Jason Cooper diff --git a/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt b/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt index 252d5c9..8af0a8e 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/marvell,odmi-controller.txt @@ -6,7 +6,9 @@ which can be used by on-board peripheral for MSI interrupts. Required properties: -- compatible : The value here should contain "marvell,odmi-controller". +- compatible : The value here should contain: + + "marvell,ap806-odmi-controller", "marvell,odmi-controller". - interrupt,controller : Identifies the node as an interrupt controller. @@ -29,7 +31,8 @@ Required properties: Example: odmi: odmi@300000 { - compatible = "marvell,odmi-controller"; + compatible = "marvell,ap806-odm-controller", + "marvell,odmi-controller"; interrupt-controller; msi-controller; marvell,odmi-frames = <4>; -- cgit v0.10.2 From 8160c4e455820d5008a1116d2dca35f0363bb062 Mon Sep 17 00:00:00 2001 From: "Michael S. 
Tsirkin" Date: Sun, 28 Feb 2016 16:31:39 +0200 Subject: vfio: fix ioctl error handling Calling return copy_to_user(...) in an ioctl will not do the right thing if there's a pagefault: copy_to_user returns the number of bytes not copied in this case. Fix up vfio to do return copy_to_user(...)) ? -EFAULT : 0; everywhere. Cc: stable@vger.kernel.org Signed-off-by: Michael S. Tsirkin Signed-off-by: Alex Williamson diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 2760a7b..8c80a48 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -446,7 +446,8 @@ static long vfio_pci_ioctl(void *device_data, info.num_regions = VFIO_PCI_NUM_REGIONS; info.num_irqs = VFIO_PCI_NUM_IRQS; - return copy_to_user((void __user *)arg, &info, minsz); + return copy_to_user((void __user *)arg, &info, minsz) ? + -EFAULT : 0; } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { struct pci_dev *pdev = vdev->pdev; @@ -520,7 +521,8 @@ static long vfio_pci_ioctl(void *device_data, return -EINVAL; } - return copy_to_user((void __user *)arg, &info, minsz); + return copy_to_user((void __user *)arg, &info, minsz) ? + -EFAULT : 0; } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { struct vfio_irq_info info; @@ -555,7 +557,8 @@ static long vfio_pci_ioctl(void *device_data, else info.flags |= VFIO_IRQ_INFO_NORESIZE; - return copy_to_user((void __user *)arg, &info, minsz); + return copy_to_user((void __user *)arg, &info, minsz) ? + -EFAULT : 0; } else if (cmd == VFIO_DEVICE_SET_IRQS) { struct vfio_irq_set hdr; diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 418cdd9..e65b142 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -219,7 +219,8 @@ static long vfio_platform_ioctl(void *device_data, info.num_regions = vdev->num_regions; info.num_irqs = vdev->num_irqs; - return copy_to_user((void __user *)arg, &info, minsz); + return copy_to_user((void __user *)arg, &info, minsz) ? + -EFAULT : 0; } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { struct vfio_region_info info; @@ -240,7 +241,8 @@ static long vfio_platform_ioctl(void *device_data, info.size = vdev->regions[info.index].size; info.flags = vdev->regions[info.index].flags; - return copy_to_user((void __user *)arg, &info, minsz); + return copy_to_user((void __user *)arg, &info, minsz) ? + -EFAULT : 0; } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { struct vfio_irq_info info; @@ -259,7 +261,8 @@ static long vfio_platform_ioctl(void *device_data, info.flags = vdev->irqs[info.index].flags; info.count = vdev->irqs[info.index].count; - return copy_to_user((void __user *)arg, &info, minsz); + return copy_to_user((void __user *)arg, &info, minsz) ? + -EFAULT : 0; } else if (cmd == VFIO_DEVICE_SET_IRQS) { struct vfio_irq_set hdr; diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 6f1ea3d..75b24e9 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -999,7 +999,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, info.iova_pgsizes = vfio_pgsize_bitmap(iommu); - return copy_to_user((void __user *)arg, &info, minsz); + return copy_to_user((void __user *)arg, &info, minsz) ? 
+ -EFAULT : 0; } else if (cmd == VFIO_IOMMU_MAP_DMA) { struct vfio_iommu_type1_dma_map map; @@ -1032,7 +1033,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, if (ret) return ret; - return copy_to_user((void __user *)arg, &unmap, minsz); + return copy_to_user((void __user *)arg, &unmap, minsz) ? + -EFAULT : 0; } return -ENOTTY; -- cgit v0.10.2 From 6236d8bb2afcfe71b88ecea554e0dc638090a45f Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sat, 27 Feb 2016 17:52:42 +0100 Subject: ALSA: ctl: Fix ioctls for X32 ABI The X32 ABI takes the same alignment like x86-64, and this may result in the incompatible struct size from ia32. Unfortunately, we hit this in some control ABI: struct snd_ctl_elem_value differs between them due to the position of 64bit variable array. This ends up with the unknown ioctl (ENOTTY) error. The fix is to add the compat entries for the new aligned struct. Reported-and-tested-by: Steven Newbury Cc: # v3.4+ Signed-off-by: Takashi Iwai diff --git a/sound/core/control_compat.c b/sound/core/control_compat.c index b9c0910..0608f21 100644 --- a/sound/core/control_compat.c +++ b/sound/core/control_compat.c @@ -170,6 +170,19 @@ struct snd_ctl_elem_value32 { unsigned char reserved[128]; }; +#ifdef CONFIG_X86_X32 +/* x32 has a different alignment for 64bit values from ia32 */ +struct snd_ctl_elem_value_x32 { + struct snd_ctl_elem_id id; + unsigned int indirect; /* bit-field causes misalignment */ + union { + s32 integer[128]; + unsigned char data[512]; + s64 integer64[64]; + } value; + unsigned char reserved[128]; +}; +#endif /* CONFIG_X86_X32 */ /* get the value type and count of the control */ static int get_ctl_type(struct snd_card *card, struct snd_ctl_elem_id *id, @@ -219,9 +232,11 @@ static int get_elem_size(int type, int count) static int copy_ctl_value_from_user(struct snd_card *card, struct snd_ctl_elem_value *data, - struct snd_ctl_elem_value32 __user *data32, + void __user *userdata, + void __user *valuep, int *typep, int *countp) { + struct snd_ctl_elem_value32 __user *data32 = userdata; int i, type, size; int uninitialized_var(count); unsigned int indirect; @@ -239,8 +254,9 @@ static int copy_ctl_value_from_user(struct snd_card *card, if (type == SNDRV_CTL_ELEM_TYPE_BOOLEAN || type == SNDRV_CTL_ELEM_TYPE_INTEGER) { for (i = 0; i < count; i++) { + s32 __user *intp = valuep; int val; - if (get_user(val, &data32->value.integer[i])) + if (get_user(val, &intp[i])) return -EFAULT; data->value.integer.value[i] = val; } @@ -250,8 +266,7 @@ static int copy_ctl_value_from_user(struct snd_card *card, dev_err(card->dev, "snd_ioctl32_ctl_elem_value: unknown type %d\n", type); return -EINVAL; } - if (copy_from_user(data->value.bytes.data, - data32->value.data, size)) + if (copy_from_user(data->value.bytes.data, valuep, size)) return -EFAULT; } @@ -261,7 +276,8 @@ static int copy_ctl_value_from_user(struct snd_card *card, } /* restore the value to 32bit */ -static int copy_ctl_value_to_user(struct snd_ctl_elem_value32 __user *data32, +static int copy_ctl_value_to_user(void __user *userdata, + void __user *valuep, struct snd_ctl_elem_value *data, int type, int count) { @@ -270,22 +286,22 @@ static int copy_ctl_value_to_user(struct snd_ctl_elem_value32 __user *data32, if (type == SNDRV_CTL_ELEM_TYPE_BOOLEAN || type == SNDRV_CTL_ELEM_TYPE_INTEGER) { for (i = 0; i < count; i++) { + s32 __user *intp = valuep; int val; val = data->value.integer.value[i]; - if (put_user(val, &data32->value.integer[i])) + if (put_user(val, &intp[i])) return -EFAULT; } } else { size = 
get_elem_size(type, count); - if (copy_to_user(data32->value.data, - data->value.bytes.data, size)) + if (copy_to_user(valuep, data->value.bytes.data, size)) return -EFAULT; } return 0; } -static int snd_ctl_elem_read_user_compat(struct snd_card *card, - struct snd_ctl_elem_value32 __user *data32) +static int ctl_elem_read_user(struct snd_card *card, + void __user *userdata, void __user *valuep) { struct snd_ctl_elem_value *data; int err, type, count; @@ -294,7 +310,9 @@ static int snd_ctl_elem_read_user_compat(struct snd_card *card, if (data == NULL) return -ENOMEM; - if ((err = copy_ctl_value_from_user(card, data, data32, &type, &count)) < 0) + err = copy_ctl_value_from_user(card, data, userdata, valuep, + &type, &count); + if (err < 0) goto error; snd_power_lock(card); @@ -303,14 +321,15 @@ static int snd_ctl_elem_read_user_compat(struct snd_card *card, err = snd_ctl_elem_read(card, data); snd_power_unlock(card); if (err >= 0) - err = copy_ctl_value_to_user(data32, data, type, count); + err = copy_ctl_value_to_user(userdata, valuep, data, + type, count); error: kfree(data); return err; } -static int snd_ctl_elem_write_user_compat(struct snd_ctl_file *file, - struct snd_ctl_elem_value32 __user *data32) +static int ctl_elem_write_user(struct snd_ctl_file *file, + void __user *userdata, void __user *valuep) { struct snd_ctl_elem_value *data; struct snd_card *card = file->card; @@ -320,7 +339,9 @@ static int snd_ctl_elem_write_user_compat(struct snd_ctl_file *file, if (data == NULL) return -ENOMEM; - if ((err = copy_ctl_value_from_user(card, data, data32, &type, &count)) < 0) + err = copy_ctl_value_from_user(card, data, userdata, valuep, + &type, &count); + if (err < 0) goto error; snd_power_lock(card); @@ -329,12 +350,39 @@ static int snd_ctl_elem_write_user_compat(struct snd_ctl_file *file, err = snd_ctl_elem_write(card, file, data); snd_power_unlock(card); if (err >= 0) - err = copy_ctl_value_to_user(data32, data, type, count); + err = copy_ctl_value_to_user(userdata, valuep, data, + type, count); error: kfree(data); return err; } +static int snd_ctl_elem_read_user_compat(struct snd_card *card, + struct snd_ctl_elem_value32 __user *data32) +{ + return ctl_elem_read_user(card, data32, &data32->value); +} + +static int snd_ctl_elem_write_user_compat(struct snd_ctl_file *file, + struct snd_ctl_elem_value32 __user *data32) +{ + return ctl_elem_write_user(file, data32, &data32->value); +} + +#ifdef CONFIG_X86_X32 +static int snd_ctl_elem_read_user_x32(struct snd_card *card, + struct snd_ctl_elem_value_x32 __user *data32) +{ + return ctl_elem_read_user(card, data32, &data32->value); +} + +static int snd_ctl_elem_write_user_x32(struct snd_ctl_file *file, + struct snd_ctl_elem_value_x32 __user *data32) +{ + return ctl_elem_write_user(file, data32, &data32->value); +} +#endif /* CONFIG_X86_X32 */ + /* add or replace a user control */ static int snd_ctl_elem_add_compat(struct snd_ctl_file *file, struct snd_ctl_elem_info32 __user *data32, @@ -393,6 +441,10 @@ enum { SNDRV_CTL_IOCTL_ELEM_WRITE32 = _IOWR('U', 0x13, struct snd_ctl_elem_value32), SNDRV_CTL_IOCTL_ELEM_ADD32 = _IOWR('U', 0x17, struct snd_ctl_elem_info32), SNDRV_CTL_IOCTL_ELEM_REPLACE32 = _IOWR('U', 0x18, struct snd_ctl_elem_info32), +#ifdef CONFIG_X86_X32 + SNDRV_CTL_IOCTL_ELEM_READ_X32 = _IOWR('U', 0x12, struct snd_ctl_elem_value_x32), + SNDRV_CTL_IOCTL_ELEM_WRITE_X32 = _IOWR('U', 0x13, struct snd_ctl_elem_value_x32), +#endif /* CONFIG_X86_X32 */ }; static inline long snd_ctl_ioctl_compat(struct file *file, unsigned int cmd, unsigned 
long arg) @@ -431,6 +483,12 @@ static inline long snd_ctl_ioctl_compat(struct file *file, unsigned int cmd, uns return snd_ctl_elem_add_compat(ctl, argp, 0); case SNDRV_CTL_IOCTL_ELEM_REPLACE32: return snd_ctl_elem_add_compat(ctl, argp, 1); +#ifdef CONFIG_X86_X32 + case SNDRV_CTL_IOCTL_ELEM_READ_X32: + return snd_ctl_elem_read_user_x32(ctl->card, argp); + case SNDRV_CTL_IOCTL_ELEM_WRITE_X32: + return snd_ctl_elem_write_user_x32(ctl, argp); +#endif /* CONFIG_X86_X32 */ } down_read(&snd_ioctl_rwsem); -- cgit v0.10.2 From 513ace79b657e2022a592e77f24074e088681ecc Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 28 Feb 2016 11:23:09 +0100 Subject: ALSA: pcm: Fix ioctls for X32 ABI X32 ABI uses the 64bit timespec in addition to 64bit alignment of 64bit values. This leads to incompatibilities in some PCM ioctls involved with snd_pcm_channel_info, snd_pcm_status and snd_pcm_sync_ptr structs. Fix the PCM compat ABI for these ioctls like the previous commit for ctl API. Reported-by: Steven Newbury Cc: # v3.4+ Signed-off-by: Takashi Iwai diff --git a/sound/core/pcm_compat.c b/sound/core/pcm_compat.c index 9630e9f..1f64ab0 100644 --- a/sound/core/pcm_compat.c +++ b/sound/core/pcm_compat.c @@ -183,6 +183,14 @@ static int snd_pcm_ioctl_channel_info_compat(struct snd_pcm_substream *substream return err; } +#ifdef CONFIG_X86_X32 +/* X32 ABI has the same struct as x86-64 for snd_pcm_channel_info */ +static int snd_pcm_channel_info_user(struct snd_pcm_substream *substream, + struct snd_pcm_channel_info __user *src); +#define snd_pcm_ioctl_channel_info_x32(s, p) \ + snd_pcm_channel_info_user(s, p) +#endif /* CONFIG_X86_X32 */ + struct snd_pcm_status32 { s32 state; struct compat_timespec trigger_tstamp; @@ -243,6 +251,71 @@ static int snd_pcm_status_user_compat(struct snd_pcm_substream *substream, return err; } +#ifdef CONFIG_X86_X32 +/* X32 ABI has 64bit timespec and 64bit alignment */ +struct snd_pcm_status_x32 { + s32 state; + u32 rsvd; /* alignment */ + struct timespec trigger_tstamp; + struct timespec tstamp; + u32 appl_ptr; + u32 hw_ptr; + s32 delay; + u32 avail; + u32 avail_max; + u32 overrange; + s32 suspended_state; + u32 audio_tstamp_data; + struct timespec audio_tstamp; + struct timespec driver_tstamp; + u32 audio_tstamp_accuracy; + unsigned char reserved[52-2*sizeof(struct timespec)]; +} __packed; + +#define put_timespec(src, dst) copy_to_user(dst, src, sizeof(*dst)) + +static int snd_pcm_status_user_x32(struct snd_pcm_substream *substream, + struct snd_pcm_status_x32 __user *src, + bool ext) +{ + struct snd_pcm_status status; + int err; + + memset(&status, 0, sizeof(status)); + /* + * with extension, parameters are read/write, + * get audio_tstamp_data from user, + * ignore rest of status structure + */ + if (ext && get_user(status.audio_tstamp_data, + (u32 __user *)(&src->audio_tstamp_data))) + return -EFAULT; + err = snd_pcm_status(substream, &status); + if (err < 0) + return err; + + if (clear_user(src, sizeof(*src))) + return -EFAULT; + if (put_user(status.state, &src->state) || + put_timespec(&status.trigger_tstamp, &src->trigger_tstamp) || + put_timespec(&status.tstamp, &src->tstamp) || + put_user(status.appl_ptr, &src->appl_ptr) || + put_user(status.hw_ptr, &src->hw_ptr) || + put_user(status.delay, &src->delay) || + put_user(status.avail, &src->avail) || + put_user(status.avail_max, &src->avail_max) || + put_user(status.overrange, &src->overrange) || + put_user(status.suspended_state, &src->suspended_state) || + put_user(status.audio_tstamp_data, &src->audio_tstamp_data) || + 
put_timespec(&status.audio_tstamp, &src->audio_tstamp) || + put_timespec(&status.driver_tstamp, &src->driver_tstamp) || + put_user(status.audio_tstamp_accuracy, &src->audio_tstamp_accuracy)) + return -EFAULT; + + return err; +} +#endif /* CONFIG_X86_X32 */ + /* both for HW_PARAMS and HW_REFINE */ static int snd_pcm_ioctl_hw_params_compat(struct snd_pcm_substream *substream, int refine, @@ -469,6 +542,93 @@ static int snd_pcm_ioctl_sync_ptr_compat(struct snd_pcm_substream *substream, return 0; } +#ifdef CONFIG_X86_X32 +/* X32 ABI has 64bit timespec and 64bit alignment */ +struct snd_pcm_mmap_status_x32 { + s32 state; + s32 pad1; + u32 hw_ptr; + u32 pad2; /* alignment */ + struct timespec tstamp; + s32 suspended_state; + struct timespec audio_tstamp; +} __packed; + +struct snd_pcm_mmap_control_x32 { + u32 appl_ptr; + u32 avail_min; +}; + +struct snd_pcm_sync_ptr_x32 { + u32 flags; + u32 rsvd; /* alignment */ + union { + struct snd_pcm_mmap_status_x32 status; + unsigned char reserved[64]; + } s; + union { + struct snd_pcm_mmap_control_x32 control; + unsigned char reserved[64]; + } c; +} __packed; + +static int snd_pcm_ioctl_sync_ptr_x32(struct snd_pcm_substream *substream, + struct snd_pcm_sync_ptr_x32 __user *src) +{ + struct snd_pcm_runtime *runtime = substream->runtime; + volatile struct snd_pcm_mmap_status *status; + volatile struct snd_pcm_mmap_control *control; + u32 sflags; + struct snd_pcm_mmap_control scontrol; + struct snd_pcm_mmap_status sstatus; + snd_pcm_uframes_t boundary; + int err; + + if (snd_BUG_ON(!runtime)) + return -EINVAL; + + if (get_user(sflags, &src->flags) || + get_user(scontrol.appl_ptr, &src->c.control.appl_ptr) || + get_user(scontrol.avail_min, &src->c.control.avail_min)) + return -EFAULT; + if (sflags & SNDRV_PCM_SYNC_PTR_HWSYNC) { + err = snd_pcm_hwsync(substream); + if (err < 0) + return err; + } + status = runtime->status; + control = runtime->control; + boundary = recalculate_boundary(runtime); + if (!boundary) + boundary = 0x7fffffff; + snd_pcm_stream_lock_irq(substream); + /* FIXME: we should consider the boundary for the sync from app */ + if (!(sflags & SNDRV_PCM_SYNC_PTR_APPL)) + control->appl_ptr = scontrol.appl_ptr; + else + scontrol.appl_ptr = control->appl_ptr % boundary; + if (!(sflags & SNDRV_PCM_SYNC_PTR_AVAIL_MIN)) + control->avail_min = scontrol.avail_min; + else + scontrol.avail_min = control->avail_min; + sstatus.state = status->state; + sstatus.hw_ptr = status->hw_ptr % boundary; + sstatus.tstamp = status->tstamp; + sstatus.suspended_state = status->suspended_state; + sstatus.audio_tstamp = status->audio_tstamp; + snd_pcm_stream_unlock_irq(substream); + if (put_user(sstatus.state, &src->s.status.state) || + put_user(sstatus.hw_ptr, &src->s.status.hw_ptr) || + put_timespec(&sstatus.tstamp, &src->s.status.tstamp) || + put_user(sstatus.suspended_state, &src->s.status.suspended_state) || + put_timespec(&sstatus.audio_tstamp, &src->s.status.audio_tstamp) || + put_user(scontrol.appl_ptr, &src->c.control.appl_ptr) || + put_user(scontrol.avail_min, &src->c.control.avail_min)) + return -EFAULT; + + return 0; +} +#endif /* CONFIG_X86_X32 */ /* */ @@ -487,7 +647,12 @@ enum { SNDRV_PCM_IOCTL_WRITEN_FRAMES32 = _IOW('A', 0x52, struct snd_xfern32), SNDRV_PCM_IOCTL_READN_FRAMES32 = _IOR('A', 0x53, struct snd_xfern32), SNDRV_PCM_IOCTL_SYNC_PTR32 = _IOWR('A', 0x23, struct snd_pcm_sync_ptr32), - +#ifdef CONFIG_X86_X32 + SNDRV_PCM_IOCTL_CHANNEL_INFO_X32 = _IOR('A', 0x32, struct snd_pcm_channel_info), + SNDRV_PCM_IOCTL_STATUS_X32 = _IOR('A', 0x20, struct 
snd_pcm_status_x32), + SNDRV_PCM_IOCTL_STATUS_EXT_X32 = _IOWR('A', 0x24, struct snd_pcm_status_x32), + SNDRV_PCM_IOCTL_SYNC_PTR_X32 = _IOWR('A', 0x23, struct snd_pcm_sync_ptr_x32), +#endif /* CONFIG_X86_X32 */ }; static long snd_pcm_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) @@ -559,6 +724,16 @@ static long snd_pcm_ioctl_compat(struct file *file, unsigned int cmd, unsigned l return snd_pcm_ioctl_rewind_compat(substream, argp); case SNDRV_PCM_IOCTL_FORWARD32: return snd_pcm_ioctl_forward_compat(substream, argp); +#ifdef CONFIG_X86_X32 + case SNDRV_PCM_IOCTL_STATUS_X32: + return snd_pcm_status_user_x32(substream, argp, false); + case SNDRV_PCM_IOCTL_STATUS_EXT_X32: + return snd_pcm_status_user_x32(substream, argp, true); + case SNDRV_PCM_IOCTL_SYNC_PTR_X32: + return snd_pcm_ioctl_sync_ptr_x32(substream, argp); + case SNDRV_PCM_IOCTL_CHANNEL_INFO_X32: + return snd_pcm_ioctl_channel_info_x32(substream, argp); +#endif /* CONFIG_X86_X32 */ } return -ENOIOCTLCMD; -- cgit v0.10.2 From dd7e3f805231cf83317c39bd346d5e4839f5cbb0 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 28 Feb 2016 11:30:53 +0100 Subject: ALSA: rawmidi: Use comapt_put_timespec() Instead of open-coding, use the existing helper to copy a 32bit timespec from/to 64bit. Signed-off-by: Takashi Iwai diff --git a/sound/core/rawmidi_compat.c b/sound/core/rawmidi_compat.c index 5268c1f..836d1c9 100644 --- a/sound/core/rawmidi_compat.c +++ b/sound/core/rawmidi_compat.c @@ -85,8 +85,7 @@ static int snd_rawmidi_ioctl_status_compat(struct snd_rawmidi_file *rfile, if (err < 0) return err; - if (put_user(status.tstamp.tv_sec, &src->tstamp.tv_sec) || - put_user(status.tstamp.tv_nsec, &src->tstamp.tv_nsec) || + if (compat_put_timespec(&status.tstamp, &src->tstamp) || put_user(status.avail, &src->avail) || put_user(status.xruns, &src->xruns)) return -EFAULT; -- cgit v0.10.2 From 2251fbbc1539f05b0b206b37a602d5776be37252 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 28 Feb 2016 11:28:08 +0100 Subject: ALSA: rawmidi: Fix ioctls X32 ABI Like the previous fixes for ctl and PCM, we need a fix for incompatible X32 ABI regarding the rawmidi: namely, struct snd_rawmidi_status has the timespec, and the size and the alignment on X32 differ from IA32. This patch fixes the incompatible ioctl for X32. 
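As an illustration (not part of this patch), the layout difference can be reproduced in stand-alone C: the same logical status structure, once with 32-bit time fields and 4-byte alignment (ia32) and once with 64-bit time fields and 8-byte alignment (x32), ends up with a different size, and since the ioctl number encodes sizeof() the existing 32-bit compat entry no longer matches. The field set below is a simplified stand-in for struct snd_rawmidi_status.

#include <stdio.h>
#include <stdint.h>

struct status_ia32 {                    /* 32-bit time, 4-byte alignment */
        int32_t stream;
        struct { int32_t tv_sec, tv_nsec; } tstamp;
        uint32_t avail;
        uint32_t xruns;
        unsigned char reserved[16];
};

struct status_x32 {                     /* 64-bit time, 8-byte alignment */
        int32_t stream;
        uint32_t rsvd;                  /* explicit alignment pad */
        struct { int64_t tv_sec, tv_nsec; } tstamp;
        uint32_t avail;
        uint32_t xruns;
        unsigned char reserved[16];
};

int main(void)
{
        printf("ia32-style layout: %zu bytes\n", sizeof(struct status_ia32));
        printf("x32-style  layout: %zu bytes\n", sizeof(struct status_x32));
        return 0;                       /* typically prints 36 vs. 48 */
}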
Cc: # v3.4+ Signed-off-by: Takashi Iwai diff --git a/sound/core/rawmidi_compat.c b/sound/core/rawmidi_compat.c index 836d1c9..f69764d 100644 --- a/sound/core/rawmidi_compat.c +++ b/sound/core/rawmidi_compat.c @@ -93,9 +93,58 @@ static int snd_rawmidi_ioctl_status_compat(struct snd_rawmidi_file *rfile, return 0; } +#ifdef CONFIG_X86_X32 +/* X32 ABI has 64bit timespec and 64bit alignment */ +struct snd_rawmidi_status_x32 { + s32 stream; + u32 rsvd; /* alignment */ + struct timespec tstamp; + u32 avail; + u32 xruns; + unsigned char reserved[16]; +} __attribute__((packed)); + +#define put_timespec(src, dst) copy_to_user(dst, src, sizeof(*dst)) + +static int snd_rawmidi_ioctl_status_x32(struct snd_rawmidi_file *rfile, + struct snd_rawmidi_status_x32 __user *src) +{ + int err; + struct snd_rawmidi_status status; + + if (rfile->output == NULL) + return -EINVAL; + if (get_user(status.stream, &src->stream)) + return -EFAULT; + + switch (status.stream) { + case SNDRV_RAWMIDI_STREAM_OUTPUT: + err = snd_rawmidi_output_status(rfile->output, &status); + break; + case SNDRV_RAWMIDI_STREAM_INPUT: + err = snd_rawmidi_input_status(rfile->input, &status); + break; + default: + return -EINVAL; + } + if (err < 0) + return err; + + if (put_timespec(&status.tstamp, &src->tstamp) || + put_user(status.avail, &src->avail) || + put_user(status.xruns, &src->xruns)) + return -EFAULT; + + return 0; +} +#endif /* CONFIG_X86_X32 */ + enum { SNDRV_RAWMIDI_IOCTL_PARAMS32 = _IOWR('W', 0x10, struct snd_rawmidi_params32), SNDRV_RAWMIDI_IOCTL_STATUS32 = _IOWR('W', 0x20, struct snd_rawmidi_status32), +#ifdef CONFIG_X86_X32 + SNDRV_RAWMIDI_IOCTL_STATUS_X32 = _IOWR('W', 0x20, struct snd_rawmidi_status_x32), +#endif /* CONFIG_X86_X32 */ }; static long snd_rawmidi_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) @@ -114,6 +163,10 @@ static long snd_rawmidi_ioctl_compat(struct file *file, unsigned int cmd, unsign return snd_rawmidi_ioctl_params_compat(rfile, argp); case SNDRV_RAWMIDI_IOCTL_STATUS32: return snd_rawmidi_ioctl_status_compat(rfile, argp); +#ifdef CONFIG_X86_X32 + case SNDRV_RAWMIDI_IOCTL_STATUS_X32: + return snd_rawmidi_ioctl_status_x32(rfile, argp); +#endif /* CONFIG_X86_X32 */ } return -ENOIOCTLCMD; } -- cgit v0.10.2 From 3a72494ac2a3bd229db941d51e7efe2f6ccd947b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 28 Feb 2016 11:36:14 +0100 Subject: ALSA: timer: Fix broken compat timer user status ioctl The timer user status compat ioctl returned the bogus struct used for 64bit architectures instead of the 32bit one. This patch addresses it to return the proper struct. 
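As an illustration (not part of this patch), the pattern the fix restores can be sketched in stand-alone C: when a 32-bit caller asks for status, fill the 32-bit layout field by field from the native 64-bit values instead of handing back the native struct. The structures below are simplified stand-ins, not the ALSA ones.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct timespec64 { int64_t tv_sec, tv_nsec; };
struct status_native { struct timespec64 tstamp; uint32_t lost; };
struct status_compat32 { struct { int32_t tv_sec, tv_nsec; } tstamp; uint32_t lost; };

static void fill_compat32(struct status_compat32 *dst, const struct status_native *src)
{
        memset(dst, 0, sizeof(*dst));
        dst->tstamp.tv_sec  = (int32_t)src->tstamp.tv_sec;      /* field-by-field copy */
        dst->tstamp.tv_nsec = (int32_t)src->tstamp.tv_nsec;
        dst->lost = src->lost;
}

int main(void)
{
        struct status_native st = { { 1456657907, 123456789 }, 2 };
        struct status_compat32 st32;

        fill_compat32(&st32, &st);
        printf("sec=%d nsec=%d lost=%u (compat %zu bytes, native %zu bytes)\n",
               st32.tstamp.tv_sec, st32.tstamp.tv_nsec, (unsigned)st32.lost,
               sizeof(st32), sizeof(st));
        return 0;
}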
Cc: Signed-off-by: Takashi Iwai diff --git a/sound/core/timer_compat.c b/sound/core/timer_compat.c index e05802a..8e7eddf 100644 --- a/sound/core/timer_compat.c +++ b/sound/core/timer_compat.c @@ -70,13 +70,14 @@ static int snd_timer_user_status_compat(struct file *file, struct snd_timer_status32 __user *_status) { struct snd_timer_user *tu; - struct snd_timer_status status; + struct snd_timer_status32 status; tu = file->private_data; if (snd_BUG_ON(!tu->timeri)) return -ENXIO; memset(&status, 0, sizeof(status)); - status.tstamp = tu->tstamp; + status.tstamp.tv_sec = tu->tstamp.tv_sec; + status.tstamp.tv_nsec = tu->tstamp.tv_nsec; status.resolution = snd_timer_resolution(tu->timeri); status.lost = tu->timeri->lost; status.overrun = tu->overrun; -- cgit v0.10.2 From b24e7ad1fdc22177eb3e51584e1cfcb45d818488 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 28 Feb 2016 11:41:47 +0100 Subject: ALSA: timer: Fix ioctls for X32 ABI X32 ABI takes the 64bit timespec, thus the timer user status ioctl becomes incompatible with IA32. This results in NOTTY error when the ioctl is issued. Meanwhile, this struct in X32 is essentially identical with the one in X86-64, so we can just bypassing to the existing code for this specific compat ioctl. Cc: # v3.4+ Signed-off-by: Takashi Iwai diff --git a/sound/core/timer_compat.c b/sound/core/timer_compat.c index 8e7eddf..2e90822 100644 --- a/sound/core/timer_compat.c +++ b/sound/core/timer_compat.c @@ -89,12 +89,21 @@ static int snd_timer_user_status_compat(struct file *file, return 0; } +#ifdef CONFIG_X86_X32 +/* X32 ABI has the same struct as x86-64 */ +#define snd_timer_user_status_x32(file, s) \ + snd_timer_user_status(file, s) +#endif /* CONFIG_X86_X32 */ + /* */ enum { SNDRV_TIMER_IOCTL_INFO32 = _IOR('T', 0x11, struct snd_timer_info32), SNDRV_TIMER_IOCTL_STATUS32 = _IOW('T', 0x14, struct snd_timer_status32), +#ifdef CONFIG_X86_X32 + SNDRV_TIMER_IOCTL_STATUS_X32 = _IOW('T', 0x14, struct snd_timer_status), +#endif /* CONFIG_X86_X32 */ }; static long snd_timer_user_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) @@ -123,6 +132,10 @@ static long snd_timer_user_ioctl_compat(struct file *file, unsigned int cmd, uns return snd_timer_user_info_compat(file, argp); case SNDRV_TIMER_IOCTL_STATUS32: return snd_timer_user_status_compat(file, argp); +#ifdef CONFIG_X86_X32 + case SNDRV_TIMER_IOCTL_STATUS_X32: + return snd_timer_user_status_x32(file, argp); +#endif /* CONFIG_X86_X32 */ } return -ENOIOCTLCMD; } -- cgit v0.10.2 From deb7deff2f00bdbbcb3d560dad2a89ef37df837d Mon Sep 17 00:00:00 2001 From: Justin Maggard Date: Tue, 9 Feb 2016 15:52:08 -0800 Subject: cifs: fix out-of-bounds access in lease parsing When opening a file, SMB2_open() attempts to parse the lease state from the SMB2 CREATE Response. However, the parsing code was not careful to ensure that the create contexts are not empty or invalid, which can lead to out- of-bounds memory access. This can be seen easily by trying to read a file from a OSX 10.11 SMB3 server. 
Here is sample crash output: BUG: unable to handle kernel paging request at ffff8800a1a77cc6 IP: [] SMB2_open+0x804/0x960 PGD 8f77067 PUD 0 Oops: 0000 [#1] SMP Modules linked in: CPU: 3 PID: 2876 Comm: cp Not tainted 4.5.0-rc3.x86_64.1+ #14 Hardware name: NETGEAR ReadyNAS 314 /ReadyNAS 314 , BIOS 4.6.5 10/11/2012 task: ffff880073cdc080 ti: ffff88005b31c000 task.ti: ffff88005b31c000 RIP: 0010:[] [] SMB2_open+0x804/0x960 RSP: 0018:ffff88005b31fa08 EFLAGS: 00010282 RAX: 0000000000000015 RBX: 0000000000000000 RCX: 0000000000000006 RDX: 0000000000000000 RSI: 0000000000000246 RDI: ffff88007eb8c8b0 RBP: ffff88005b31fad8 R08: 666666203d206363 R09: 6131613030383866 R10: 3030383866666666 R11: 00000000000002b0 R12: ffff8800660fd800 R13: ffff8800a1a77cc2 R14: 00000000424d53fe R15: ffff88005f5a28c0 FS: 00007f7c8a2897c0(0000) GS:ffff88007eb80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: ffff8800a1a77cc6 CR3: 000000005b281000 CR4: 00000000000006e0 Stack: ffff88005b31fa70 ffffffff88278789 00000000000001d3 ffff88005f5a2a80 ffffffff00000003 ffff88005d029d00 ffff88006fde05a0 0000000000000000 ffff88005b31fc78 ffff88006fde0780 ffff88005b31fb2f 0000000100000fe0 Call Trace: [] ? cifsConvertToUTF16+0x159/0x2d0 [] smb2_open_file+0x98/0x210 [] ? __kmalloc+0x1c/0xe0 [] cifs_open+0x2a4/0x720 [] do_dentry_open+0x1ff/0x310 [] ? cifsFileInfo_get+0x30/0x30 [] vfs_open+0x52/0x60 [] path_openat+0x170/0xf70 [] ? remove_wait_queue+0x48/0x50 [] do_filp_open+0x79/0xd0 [] ? __alloc_fd+0x3a/0x170 [] do_sys_open+0x114/0x1e0 [] SyS_open+0x19/0x20 [] entry_SYSCALL_64_fastpath+0x12/0x6a Code: 4d 8d 6c 07 04 31 c0 4c 89 ee e8 47 6f e5 ff 31 c9 41 89 ce 44 89 f1 48 c7 c7 28 b1 bd 88 31 c0 49 01 cd 4c 89 ee e8 2b 6f e5 ff <45> 0f b7 75 04 48 c7 c7 31 b1 bd 88 31 c0 4d 01 ee 4c 89 f6 e8 RIP [] SMB2_open+0x804/0x960 RSP CR2: ffff8800a1a77cc6 ---[ end trace d9f69ba64feee469 ]--- Signed-off-by: Justin Maggard Signed-off-by: Steve French CC: Stable diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 10f8d5c..42e1f44 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1106,21 +1106,25 @@ parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, { char *data_offset; struct create_context *cc; - unsigned int next = 0; + unsigned int next; + unsigned int remaining; char *name; data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset); + remaining = le32_to_cpu(rsp->CreateContextsLength); cc = (struct create_context *)data_offset; - do { - cc = (struct create_context *)((char *)cc + next); + while (remaining >= sizeof(struct create_context)) { name = le16_to_cpu(cc->NameOffset) + (char *)cc; - if (le16_to_cpu(cc->NameLength) != 4 || - strncmp(name, "RqLs", 4)) { - next = le32_to_cpu(cc->Next); - continue; - } - return server->ops->parse_lease_buf(cc, epoch); - } while (next != 0); + if (le16_to_cpu(cc->NameLength) == 4 && + strncmp(name, "RqLs", 4) == 0) + return server->ops->parse_lease_buf(cc, epoch); + + next = le32_to_cpu(cc->Next); + if (!next) + break; + remaining -= next; + cc = (struct create_context *)((char *)cc + next); + } return 0; } -- cgit v0.10.2 From 6cc3b24235929b54acd5ecc987ef11a425bd209e Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 27 Feb 2016 11:58:18 +0300 Subject: CIFS: Fix SMB2+ interim response processing for read requests For interim responses we only need to parse a header and update a number credits. Now it is done for all SMB2+ command except SMB2_READ which is wrong. Fix this by adding such processing. 
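As an illustration (not part of this patch), the control flow the fix adds to the read receive path can be sketched in stand-alone C: once the header has been parsed and the credits updated, an interim STATUS_PENDING response carries nothing further worth parsing, so the remaining bytes are drained and the request stays queued for the real reply. The types and helpers below are toy stand-ins; the real code lives in cifs_readv_receive().

#include <stdio.h>
#include <stdbool.h>

struct server { bool status_pending; int remaining; };

static void discard_remaining_data(struct server *srv)
{
        printf("discarding %d leftover bytes\n", srv->remaining);
        srv->remaining = 0;
}

static int readv_receive(struct server *srv)
{
        /* header already read and credits updated at this point */
        if (srv->status_pending) {              /* interim response */
                discard_remaining_data(srv);    /* nothing else to parse */
                return -1;                      /* real response comes later */
        }
        printf("handling %d bytes of read data\n", srv->remaining);
        return 0;
}

int main(void)
{
        struct server interim = { .status_pending = true,  .remaining = 24 };
        struct server final   = { .status_pending = false, .remaining = 4096 };

        readv_receive(&interim);
        readv_receive(&final);
        return 0;
}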
Signed-off-by: Pavel Shilovsky Tested-by: Shirish Pargaonkar CC: Stable Signed-off-by: Steve French diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 90b4f9f..76fcb50 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1396,11 +1396,10 @@ openRetry: * current bigbuf. */ static int -cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +discard_remaining_data(struct TCP_Server_Info *server) { unsigned int rfclen = get_rfc1002_length(server->smallbuf); int remaining = rfclen + 4 - server->total_read; - struct cifs_readdata *rdata = mid->callback_data; while (remaining > 0) { int length; @@ -1414,10 +1413,20 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) remaining -= length; } - dequeue_mid(mid, rdata->result); return 0; } +static int +cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) +{ + int length; + struct cifs_readdata *rdata = mid->callback_data; + + length = discard_remaining_data(server); + dequeue_mid(mid, rdata->result); + return length; +} + int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) { @@ -1446,6 +1455,12 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return length; server->total_read += length; + if (server->ops->is_status_pending && + server->ops->is_status_pending(buf, server, 0)) { + discard_remaining_data(server); + return -1; + } + /* Was the SMB read successful? */ rdata->result = server->ops->map_error(buf, false); if (rdata->result != 0) { -- cgit v0.10.2 From 1ee9f4bd1a97026a7b2d7ae9f1f74b45680d0003 Mon Sep 17 00:00:00 2001 From: Yadan Fan Date: Mon, 29 Feb 2016 14:44:57 +0800 Subject: Fix cifs_uniqueid_to_ino_t() function for s390x This issue is caused by commit 02323db17e3a7 ("cifs: fix cifs_uniqueid_to_ino_t not to ever return 0"), when BITS_PER_LONG is 64 on s390x, the corresponding cifs_uniqueid_to_ino_t() function will cast 64-bit fileid to 32-bit by using (ino_t)fileid, because ino_t (typdefed __kernel_ino_t) is int type. It's defined in arch/s390/include/uapi/asm/posix_types.h #ifndef __s390x__ typedef unsigned long __kernel_ino_t; ... #else /* __s390x__ */ typedef unsigned int __kernel_ino_t; So the #ifdef condition is wrong for s390x, we can just still use one cifs_uniqueid_to_ino_t() function with comparing sizeof(ino_t) and sizeof(u64) to choose the correct execution accordingly. Signed-off-by: Yadan Fan CC: stable Signed-off-by: Steve French diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 68c4547..83aac8b 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -31,19 +31,15 @@ * so that it will fit. We use hash_64 to convert the value to 31 bits, and * then add 1, to ensure that we don't end up with a 0 as the value. 
*/ -#if BITS_PER_LONG == 64 static inline ino_t cifs_uniqueid_to_ino_t(u64 fileid) { + if ((sizeof(ino_t)) < (sizeof(u64))) + return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1; + return (ino_t)fileid; + } -#else -static inline ino_t -cifs_uniqueid_to_ino_t(u64 fileid) -{ - return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1; -} -#endif extern struct file_system_type cifs_fs_type; extern const struct address_space_operations cifs_addr_ops; -- cgit v0.10.2 From 869ae76147ffdf21ad24f0e599303cd58a2bb39f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 27 Feb 2016 23:11:28 +0100 Subject: uprobes: __create_xol_area() must nullify xol_mapping.fault As Jiri pointed out, this recent commit: f872f5400cc0 ("mm: Add a vm_special_mapping.fault() method") breaks uprobes: __create_xol_area() doesn't initialize the new ->fault() method and this obviously leads to kernel crash when the application tries to execute the probed insn after bp hit. We probably want to add uprobes_special_mapping_fault(), this allows to turn xol_area->xol_mapping into a single instance of vm_special_mapping. But we need a simple fix, so lets change __create_xol() to nullify the new member as Jiri suggests. Suggested-by: Jiri Olsa Reported-by: Jiri Olsa Signed-off-by: Oleg Nesterov Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Pratyush Anand Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160227221128.GA29565@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0167679..5f6ce93 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1178,6 +1178,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) goto free_area; area->xol_mapping.name = "[uprobes]"; + area->xol_mapping.fault = NULL; area->xol_mapping.pages = area->pages; area->pages[0] = alloc_page(GFP_HIGHUSER); if (!area->pages[0]) -- cgit v0.10.2 From 6cb2f1d9af5b0f0afdd4e689d969df4b5c76a4c2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 24 Feb 2016 05:07:43 -0500 Subject: perf/x86/intel/uncore: Remove SBOX support for BDX-DE BDX-DE and BDX-EP share the same uncore code path. But there is no sbox in BDX-DE. This patch remove SBOX support for BDX-DE. 
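As an illustration (not part of this patch), the trimming can be sketched in stand-alone C: the uncore types live in a NULL-terminated table, and because the SBOX entry is kept in the last used slot, writing NULL into it on BDX-DE (model 86) simply ends the list one entry earlier. The table below is a toy stand-in for the real descriptors.

#include <stdio.h>

#define UNCORE_SBOX_IDX 3               /* sbox sits last so NULLing it ends the list */

static const char *msr_uncores[] = { "ubox", "cbox", "pcu", "sbox", NULL };

static void uncore_cpu_init(unsigned int x86_model)
{
        if (x86_model == 86)            /* BDX-DE has no SBOX units */
                msr_uncores[UNCORE_SBOX_IDX] = NULL;
}

int main(void)
{
        uncore_cpu_init(86);
        for (int i = 0; msr_uncores[i]; i++)
                printf("registering %s PMU\n", msr_uncores[i]);
        return 0;
}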
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Tony Battersby Cc: Vince Weaver Link: http://lkml.kernel.org/r/37D7C6CF3E00A74B8858931C1DB2F0770589D336@SHSMSX103.ccr.corp.intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 0c801f7..d967fcc 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -2874,11 +2874,13 @@ static struct intel_uncore_type bdx_uncore_sbox = { .format_group = &hswep_uncore_sbox_format_group, }; +#define BDX_MSR_UNCORE_SBOX 3 + static struct intel_uncore_type *bdx_msr_uncores[] = { &bdx_uncore_ubox, &bdx_uncore_cbox, - &bdx_uncore_sbox, &hswep_uncore_pcu, + &bdx_uncore_sbox, NULL, }; @@ -2887,6 +2889,10 @@ void bdx_uncore_cpu_init(void) if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; uncore_msr_uncores = bdx_msr_uncores; + + /* BDX-DE doesn't have SBOX */ + if (boot_cpu_data.x86_model == 86) + uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; } static struct intel_uncore_type bdx_uncore_ha = { -- cgit v0.10.2 From a54d690e7ca24c8c8eea32856286ef298ed0608b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Feb 2016 10:35:45 +0100 Subject: perf: Add a reviewer Alexander volunteered to review perf (kernel) patches. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/MAINTAINERS b/MAINTAINERS index da3e4d8..90d370c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8448,6 +8448,7 @@ PERFORMANCE EVENTS SUBSYSTEM M: Peter Zijlstra M: Ingo Molnar M: Arnaldo Carvalho de Melo +R: Alexander Shishkin L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git perf/core S: Supported -- cgit v0.10.2 From 3ccca9eca67ec68461a5266832d02aee538599b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:08 +0000 Subject: perf/x86/intel/uncore: Remove pointless mask check uncore_cpumask_init() is only ever called from intel_uncore_init() where the mask is guaranteed to be empty. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221010.657326866@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 91a18d6..c422d52e 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1342,12 +1342,6 @@ static void __init uncore_cpumask_init(void) { int cpu; - /* - * ony invoke once from msr or pci init code - */ - if (!cpumask_empty(&uncore_cpu_mask)) - return; - cpu_notifier_register_begin(); for_each_online_cpu(cpu) { -- cgit v0.10.2 From ffeda003803213a8d0babefdd6a95fe424884c14 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:09 +0000 Subject: perf/x86/intel/uncore: Simplify error rollback No point in doing partial rollbacks. 
Robustify uncore_exit_type() so it does not dereference type->pmus unconditionally and remove all the partial rollback hackery. Preparatory patch for proper error handling. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221010.751077467@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index c422d52e..91facdc 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -767,10 +767,12 @@ static void __init uncore_type_exit(struct intel_uncore_type *type) { int i; - for (i = 0; i < type->num_boxes; i++) - free_percpu(type->pmus[i].box); - kfree(type->pmus); - type->pmus = NULL; + if (type->pmus) { + for (i = 0; i < type->num_boxes; i++) + free_percpu(type->pmus[i].box); + kfree(type->pmus); + type->pmus = NULL; + } kfree(type->events_group); type->events_group = NULL; } @@ -778,6 +780,7 @@ static void __init uncore_type_exit(struct intel_uncore_type *type) static void __init uncore_types_exit(struct intel_uncore_type **types) { int i; + for (i = 0; types[i]; i++) uncore_type_exit(types[i]); } @@ -806,7 +809,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type) INIT_LIST_HEAD(&pmus[i].box_list); pmus[i].box = alloc_percpu(struct intel_uncore_box *); if (!pmus[i].box) - goto fail; + return -ENOMEM; } if (type->event_descs) { @@ -817,7 +820,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type) attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) + sizeof(*attr_group), GFP_KERNEL); if (!attr_group) - goto fail; + return -ENOMEM; attrs = (struct attribute **)(attr_group + 1); attr_group->name = "events"; @@ -831,9 +834,6 @@ static int __init uncore_type_init(struct intel_uncore_type *type) type->pmu_group = &uncore_pmu_attr_group; return 0; -fail: - uncore_type_exit(type); - return -ENOMEM; } static int __init uncore_types_init(struct intel_uncore_type **types) @@ -843,13 +843,9 @@ static int __init uncore_types_init(struct intel_uncore_type **types) for (i = 0; types[i]; i++) { ret = uncore_type_init(types[i]); if (ret) - goto fail; + return ret; } return 0; -fail: - while (--i >= 0) - uncore_type_exit(types[i]); - return ret; } /* @@ -1007,17 +1003,21 @@ static int __init uncore_pci_init(void) ret = uncore_types_init(uncore_pci_uncores); if (ret) - return ret; + goto err; uncore_pci_driver->probe = uncore_pci_probe; uncore_pci_driver->remove = uncore_pci_remove; ret = pci_register_driver(uncore_pci_driver); - if (ret == 0) - pcidrv_registered = true; - else - uncore_types_exit(uncore_pci_uncores); + if (ret) + goto err; + + pcidrv_registered = true; + return 0; +err: + uncore_types_exit(uncore_pci_uncores); + uncore_pci_uncores = empty_uncore; return ret; } @@ -1316,9 +1316,12 @@ static int __init uncore_cpu_init(void) ret = uncore_types_init(uncore_msr_uncores); if (ret) - return ret; - + goto err; return 0; +err: + uncore_types_exit(uncore_msr_uncores); + uncore_msr_uncores = empty_uncore; + return ret; } static int __init uncore_pmus_register(void) -- cgit v0.10.2 From 4f089678d071781851c3b73c41e55a3765b6a5ee Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:09 +0000 Subject: perf/x86/intel/uncore: Fix error handling This driver lacks any form 
of proper error handling. If initialization fails or hotplug prepare fails, it lets the facility with half initialized stuff around. Fix the state and memory leaks in a first step. As a second step we need to undo the hardware state which is set via uncore_box_init() on some of the uncore implementations. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221010.848880559@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 91facdc..25e6203 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -38,6 +38,16 @@ int uncore_pcibus_to_physid(struct pci_bus *bus) return phys_id; } +static void uncore_free_pcibus_map(void) +{ + struct pci2phy_map *map, *tmp; + + list_for_each_entry_safe(map, tmp, &pci2phy_map_head, list) { + list_del(&map->list); + kfree(map); + } +} + struct pci2phy_map *__find_pci2phy_map(int segment) { struct pci2phy_map *map, *alloc = NULL; @@ -760,16 +770,28 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) } ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); + if (!ret) + pmu->registered = true; return ret; } +static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu) +{ + if (!pmu->registered) + return; + perf_pmu_unregister(&pmu->pmu); + pmu->registered = false; +} + static void __init uncore_type_exit(struct intel_uncore_type *type) { int i; if (type->pmus) { - for (i = 0; i < type->num_boxes; i++) + for (i = 0; i < type->num_boxes; i++) { + uncore_pmu_unregister(&type->pmus[i]); free_percpu(type->pmus[i].box); + } kfree(type->pmus); type->pmus = NULL; } @@ -856,8 +878,8 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; struct intel_uncore_type *type; - int phys_id; bool first_box = false; + int phys_id, ret; phys_id = uncore_pcibus_to_physid(pdev->bus); if (phys_id < 0) @@ -906,9 +928,18 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id list_add_tail(&box->list, &pmu->box_list); raw_spin_unlock(&uncore_box_lock); - if (first_box) - uncore_pmu_register(pmu); - return 0; + if (!first_box) + return 0; + + ret = uncore_pmu_register(pmu); + if (ret) { + pci_set_drvdata(pdev, NULL); + raw_spin_lock(&uncore_box_lock); + list_del(&box->list); + raw_spin_unlock(&uncore_box_lock); + kfree(box); + } + return ret; } static void uncore_pci_remove(struct pci_dev *pdev) @@ -954,7 +985,7 @@ static void uncore_pci_remove(struct pci_dev *pdev) kfree(box); if (last_box) - perf_pmu_unregister(&pmu->pmu); + uncore_pmu_unregister(pmu); } static int __init uncore_pci_init(void) @@ -1018,6 +1049,7 @@ static int __init uncore_pci_init(void) err: uncore_types_exit(uncore_pci_uncores); uncore_pci_uncores = empty_uncore; + uncore_free_pcibus_map(); return ret; } @@ -1027,6 +1059,7 @@ static void __init uncore_pci_exit(void) pcidrv_registered = false; pci_unregister_driver(uncore_pci_driver); uncore_types_exit(uncore_pci_uncores); + uncore_free_pcibus_map(); } } @@ -1223,8 +1256,7 @@ static int uncore_cpu_notifier(struct notifier_block *self, /* allocate/free data structure for uncore box */ switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - 
uncore_cpu_prepare(cpu, -1); - break; + return notifier_from_errno(uncore_cpu_prepare(cpu, -1)); case CPU_STARTING: uncore_cpu_starting(cpu); break; @@ -1265,9 +1297,29 @@ static struct notifier_block uncore_cpu_nb = { .priority = CPU_PRI_PERF + 1, }; -static void __init uncore_cpu_setup(void *dummy) +static int __init type_pmu_register(struct intel_uncore_type *type) { - uncore_cpu_starting(smp_processor_id()); + int i, ret; + + for (i = 0; i < type->num_boxes; i++) { + ret = uncore_pmu_register(&type->pmus[i]); + if (ret) + return ret; + } + return 0; +} + +static int __init uncore_msr_pmus_register(void) +{ + struct intel_uncore_type **types = uncore_msr_uncores; + int ret; + + while (*types) { + ret = type_pmu_register(*types++); + if (ret) + return ret; + } + return 0; } static int __init uncore_cpu_init(void) @@ -1317,6 +1369,10 @@ static int __init uncore_cpu_init(void) ret = uncore_types_init(uncore_msr_uncores); if (ret) goto err; + + ret = uncore_msr_pmus_register(); + if (ret) + goto err; return 0; err: uncore_types_exit(uncore_msr_uncores); @@ -1324,26 +1380,14 @@ err: return ret; } -static int __init uncore_pmus_register(void) +static void __init uncore_cpu_setup(void *dummy) { - struct intel_uncore_pmu *pmu; - struct intel_uncore_type *type; - int i, j; - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - uncore_pmu_register(pmu); - } - } - - return 0; + uncore_cpu_starting(smp_processor_id()); } -static void __init uncore_cpumask_init(void) +static int __init uncore_cpumask_init(void) { - int cpu; + int cpu, ret = 0; cpu_notifier_register_begin(); @@ -1359,17 +1403,20 @@ static void __init uncore_cpumask_init(void) if (phys_id < 0) continue; - uncore_cpu_prepare(cpu, phys_id); + ret = uncore_cpu_prepare(cpu, phys_id); + if (ret) + goto out; uncore_event_init_cpu(cpu); } on_each_cpu(uncore_cpu_setup, NULL, 1); __register_cpu_notifier(&uncore_cpu_nb); +out: cpu_notifier_register_done(); + return ret; } - static int __init intel_uncore_init(void) { int ret; @@ -1382,17 +1429,20 @@ static int __init intel_uncore_init(void) ret = uncore_pci_init(); if (ret) - goto fail; + return ret; ret = uncore_cpu_init(); - if (ret) { - uncore_pci_exit(); - goto fail; - } - uncore_cpumask_init(); + if (ret) + goto errpci; + ret = uncore_cpumask_init(); + if (ret) + goto errcpu; - uncore_pmus_register(); return 0; -fail: + +errcpu: + uncore_types_exit(uncore_msr_uncores); +errpci: + uncore_pci_exit(); return ret; } device_initcall(intel_uncore_init); diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 6a1340c..a18cc7f 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -73,13 +73,14 @@ struct intel_uncore_ops { }; struct intel_uncore_pmu { - struct pmu pmu; - char name[UNCORE_PMU_NAME_LEN]; - int pmu_idx; - int func_id; - struct intel_uncore_type *type; - struct intel_uncore_box ** __percpu box; - struct list_head box_list; + struct pmu pmu; + char name[UNCORE_PMU_NAME_LEN]; + int pmu_idx; + int func_id; + bool registered; + struct intel_uncore_type *type; + struct intel_uncore_box ** __percpu box; + struct list_head box_list; }; struct intel_uncore_extra_reg { -- cgit v0.10.2 From 83f8ebd2eb459b21e8e74d43c75e1be666ec8b97 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:10 +0000 Subject: perf/x86/intel/uncore: Add sanity checks for PCI dev package id The storage array is size limited, but misses a sanity check 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221010.929967806@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 25e6203..42ea435 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -882,7 +882,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id int phys_id, ret; phys_id = uncore_pcibus_to_physid(pdev->bus); - if (phys_id < 0) + if (phys_id < 0 || phys_id >= UNCORE_SOCKET_MAX) return -ENODEV; if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { -- cgit v0.10.2 From a46195f1782e94ad07e7485fb15b80db7e60e7aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:11 +0000 Subject: perf/x86/intel/uncore: Clean up hardware on exit When tearing down the boxes nothing undoes the hardware state which was setup by box->init_box(). Add a box->exit_box() callback and implement it for the uncores which have an init_box() callback. This misses the cleanup in the error exit pathes, but I cannot be bothered to implement it before cleaning up the rest of the driver, which makes that task way simpler. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221011.023930023@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 42ea435..c3d1cbb 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -937,6 +937,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id raw_spin_lock(&uncore_box_lock); list_del(&box->list); raw_spin_unlock(&uncore_box_lock); + uncore_box_exit(box); kfree(box); } return ret; @@ -982,6 +983,7 @@ static void uncore_pci_remove(struct pci_dev *pdev) } WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); + uncore_box_exit(box); kfree(box); if (last_box) @@ -1091,8 +1093,10 @@ static void uncore_cpu_dying(int cpu) pmu = &type->pmus[j]; box = *per_cpu_ptr(pmu->box, cpu); *per_cpu_ptr(pmu->box, cpu) = NULL; - if (box && atomic_dec_and_test(&box->refcnt)) + if (box && atomic_dec_and_test(&box->refcnt)) { list_add(&box->list, &boxes_to_free); + uncore_box_exit(box); + } } } } diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index a18cc7f..cf2cb1f 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -61,6 +61,7 @@ struct intel_uncore_type { struct intel_uncore_ops { void (*init_box)(struct intel_uncore_box *); + void (*exit_box)(struct intel_uncore_box *); void (*disable_box)(struct intel_uncore_box *); void (*enable_box)(struct intel_uncore_box *); void (*disable_event)(struct intel_uncore_box *, struct perf_event *); @@ -306,6 +307,14 @@ static inline void uncore_box_init(struct intel_uncore_box *box) } } +static inline void uncore_box_exit(struct intel_uncore_box *box) +{ + if (test_and_clear_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { + if 
(box->pmu->type->ops->exit_box) + box->pmu->type->ops->exit_box(box); + } +} + static inline bool uncore_box_is_fake(struct intel_uncore_box *box) { return (box->phys_id < 0); diff --git a/arch/x86/events/intel/uncore_nhmex.c b/arch/x86/events/intel/uncore_nhmex.c index e89bf5c..cda5693 100644 --- a/arch/x86/events/intel/uncore_nhmex.c +++ b/arch/x86/events/intel/uncore_nhmex.c @@ -201,6 +201,11 @@ static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box) wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL); } +static void nhmex_uncore_msr_exit_box(struct intel_uncore_box *box) +{ + wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, 0); +} + static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box) { unsigned msr = uncore_msr_box_ctl(box); @@ -250,6 +255,7 @@ static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct p #define NHMEX_UNCORE_OPS_COMMON_INIT() \ .init_box = nhmex_uncore_msr_init_box, \ + .exit_box = nhmex_uncore_msr_exit_box, \ .disable_box = nhmex_uncore_msr_disable_box, \ .enable_box = nhmex_uncore_msr_enable_box, \ .disable_event = nhmex_uncore_msr_disable_event, \ diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 2049d26..120e106 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -95,6 +95,12 @@ static void snb_uncore_msr_init_box(struct intel_uncore_box *box) } } +static void snb_uncore_msr_exit_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, 0); +} + static struct uncore_event_desc snb_uncore_events[] = { INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), { /* end: all zeroes */ }, @@ -116,6 +122,7 @@ static struct attribute_group snb_uncore_format_group = { static struct intel_uncore_ops snb_uncore_msr_ops = { .init_box = snb_uncore_msr_init_box, + .exit_box = snb_uncore_msr_exit_box, .disable_event = snb_uncore_msr_disable_event, .enable_event = snb_uncore_msr_enable_event, .read_counter = uncore_msr_read_counter, @@ -231,6 +238,11 @@ static void snb_uncore_imc_init_box(struct intel_uncore_box *box) box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; } +static void snb_uncore_imc_exit_box(struct intel_uncore_box *box) +{ + iounmap(box->io_addr); +} + static void snb_uncore_imc_enable_box(struct intel_uncore_box *box) {} @@ -458,6 +470,7 @@ static struct pmu snb_uncore_imc_pmu = { static struct intel_uncore_ops snb_uncore_imc_ops = { .init_box = snb_uncore_imc_init_box, + .exit_box = snb_uncore_imc_exit_box, .enable_box = snb_uncore_imc_enable_box, .disable_box = snb_uncore_imc_disable_box, .disable_event = snb_uncore_imc_disable_event, -- cgit v0.10.2 From 1229735b290d12003a1362ccfc442c690c93bc3a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:12 +0000 Subject: perf/x86/intel/uncore: Make code more readable Clean up the code a bit before reworking it completely. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221011.204771538@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index c3d1cbb..84055f2 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -217,7 +217,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx) return config; } -static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx) +static void uncore_assign_hw_event(struct intel_uncore_box *box, + struct perf_event *event, int idx) { struct hw_perf_event *hwc = &event->hw; @@ -312,18 +313,19 @@ static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) box->hrtimer.function = uncore_pmu_hrtimer; } -static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node) +static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, + int node) { + int i, size, numshared = type->num_shared_regs ; struct intel_uncore_box *box; - int i, size; - size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg); + size = sizeof(*box) + numshared * sizeof(struct intel_uncore_extra_reg); box = kzalloc_node(size, GFP_KERNEL, node); if (!box) return NULL; - for (i = 0; i < type->num_shared_regs; i++) + for (i = 0; i < numshared; i++) raw_spin_lock_init(&box->shared_regs[i].lock); uncore_pmu_init_hrtimer(box); @@ -351,7 +353,8 @@ static bool is_uncore_event(struct perf_event *event) } static int -uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp) +uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, + bool dogrp) { struct perf_event *event; int n, max_count; @@ -412,7 +415,8 @@ uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *eve return &type->unconstrainted; } -static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event) +static void uncore_put_event_constraint(struct intel_uncore_box *box, + struct perf_event *event) { if (box->pmu->type->ops->put_constraint) box->pmu->type->ops->put_constraint(box, event); @@ -592,7 +596,7 @@ static void uncore_pmu_event_del(struct perf_event *event, int flags) if (event == box->event_list[i]) { uncore_put_event_constraint(box, event); - while (++i < box->n_events) + for (++i; i < box->n_events; i++) box->event_list[i - 1] = box->event_list[i]; --box->n_events; @@ -801,10 +805,8 @@ static void __init uncore_type_exit(struct intel_uncore_type *type) static void __init uncore_types_exit(struct intel_uncore_type **types) { - int i; - - for (i = 0; types[i]; i++) - uncore_type_exit(types[i]); + for (; *types; types++) + uncore_type_exit(*types); } static int __init uncore_type_init(struct intel_uncore_type *type) @@ -908,9 +910,11 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id * some device types. Hence PCI device idx would be 0 for all devices. * So increment pmu pointer to point to an unused array element. 
*/ - if (boot_cpu_data.x86_model == 87) + if (boot_cpu_data.x86_model == 87) { while (pmu->func_id >= 0) pmu++; + } + if (pmu->func_id < 0) pmu->func_id = pdev->devfn; else @@ -1170,44 +1174,45 @@ static int uncore_cpu_prepare(int cpu, int phys_id) return 0; } -static void -uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu) +static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu, + int new_cpu) { - struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu; + struct intel_uncore_pmu *pmu = type->pmus; struct intel_uncore_box *box; - int i, j; - - for (i = 0; uncores[i]; i++) { - type = uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - if (old_cpu < 0) - box = uncore_pmu_to_box(pmu, new_cpu); - else - box = uncore_pmu_to_box(pmu, old_cpu); - if (!box) - continue; + int i; - if (old_cpu < 0) { - WARN_ON_ONCE(box->cpu != -1); - box->cpu = new_cpu; - continue; - } + for (i = 0; i < type->num_boxes; i++, pmu++) { + if (old_cpu < 0) + box = uncore_pmu_to_box(pmu, new_cpu); + else + box = uncore_pmu_to_box(pmu, old_cpu); + if (!box) + continue; - WARN_ON_ONCE(box->cpu != old_cpu); - if (new_cpu >= 0) { - uncore_pmu_cancel_hrtimer(box); - perf_pmu_migrate_context(&pmu->pmu, - old_cpu, new_cpu); - box->cpu = new_cpu; - } else { - box->cpu = -1; - } + if (old_cpu < 0) { + WARN_ON_ONCE(box->cpu != -1); + box->cpu = new_cpu; + continue; } + + WARN_ON_ONCE(box->cpu != old_cpu); + box->cpu = -1; + if (new_cpu < 0) + continue; + + uncore_pmu_cancel_hrtimer(box); + perf_pmu_migrate_context(&pmu->pmu, old_cpu, new_cpu); + box->cpu = new_cpu; } } +static void uncore_change_context(struct intel_uncore_type **uncores, + int old_cpu, int new_cpu) +{ + for (; *uncores; uncores++) + uncore_change_type_ctx(*uncores, old_cpu, new_cpu); +} + static void uncore_event_exit_cpu(int cpu) { int i, phys_id, target; @@ -1318,8 +1323,8 @@ static int __init uncore_msr_pmus_register(void) struct intel_uncore_type **types = uncore_msr_uncores; int ret; - while (*types) { - ret = type_pmu_register(*types++); + for (; *types; types++) { + ret = type_pmu_register(*types); if (ret) return ret; } -- cgit v0.10.2 From 1384c70442b71f882e7e7153df542794d468a15d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:13 +0000 Subject: perf/x86/uncore: Make uncore_pcibus_to_physid() static No users outside of this file. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221011.285504825@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 84055f2..80ee3d4 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -21,7 +21,7 @@ static struct event_constraint uncore_constraint_fixed = struct event_constraint uncore_constraint_empty = EVENT_CONSTRAINT(0, 0, 0); -int uncore_pcibus_to_physid(struct pci_bus *bus) +static int uncore_pcibus_to_physid(struct pci_bus *bus) { struct pci2phy_map *map; int phys_id = -1; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index cf2cb1f..f118c0d 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -125,7 +125,6 @@ struct pci2phy_map { int pbus_to_physid[256]; }; -int uncore_pcibus_to_physid(struct pci_bus *bus); struct pci2phy_map *__find_pci2phy_map(int segment); ssize_t uncore_event_show(struct kobject *kobj, -- cgit v0.10.2 From 54d751d4ad357c817907fe89db3222b97ff66db3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:14 +0000 Subject: perf: Allow storage of PMU private data in event For PMUs which are not per CPU, but e.g. per package/socket, we want to be able to store a reference to the underlying per package/socket facility in the event at init time so we can avoid magic storage constructs in the PMU driver. This allows us to get rid of the per CPU dance in the intel uncore and RAPL drivers and avoids a lookup of the per package data in the perf hotpath. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221011.364140369@linutronix.de Signed-off-by: Ingo Molnar diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f5c5a3f..a9d8cab 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -468,6 +468,7 @@ struct perf_event { int group_flags; struct perf_event *group_leader; struct pmu *pmu; + void *pmu_private; enum perf_event_active_state state; unsigned int attach_state; -- cgit v0.10.2 From 1f2569fac6c6dd5a6e3fba41c183c04273f05a58 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:14 +0000 Subject: perf/x86/intel/uncore: Store box in event->pmu_private Store the PMU pointer in event->pmu_private, so we can get rid of the per CPU data storage. We keep it after converting to per package data, because a CPU to package lookup will be 3 loads versus one and these usage sites are in the perf fast path. 
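The caching idea itself is generic: resolve the per-package structure once when the event is created, stash the pointer in a per-event private field, and the hot path becomes a single load instead of a cpu-to-package-to-box lookup. The following is only an illustrative user-space sketch of that pattern with simplified stand-in structs; it is not the uncore code.

#include <stdio.h>

struct box   { int pkg; long count; };         /* stand-in for the per-package box */
struct event { struct box *pmu_private; };     /* cached pointer, set once at init */

static struct box boxes[2] = { { 0, 0 }, { 1, 0 } };

/* Slow path: runs once per event, at initialization time. */
static void event_init(struct event *ev, int pkg)
{
	ev->pmu_private = &boxes[pkg];
}

/* Fast path: one load, no cpu -> package -> box lookup. */
static void event_add(struct event *ev)
{
	ev->pmu_private->count++;
}

int main(void)
{
	struct event ev;

	event_init(&ev, 1);
	event_add(&ev);
	printf("package 1 count: %ld\n", boxes[1].count);
	return 0;
}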
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221011.460851335@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 80ee3d4..220f81c 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -92,11 +92,6 @@ ssize_t uncore_event_show(struct kobject *kobj, return sprintf(buf, "%s", event->config); } -struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) -{ - return container_of(event->pmu, struct intel_uncore_pmu, pmu); -} - struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) { struct intel_uncore_box *box; @@ -122,15 +117,6 @@ out: return *per_cpu_ptr(pmu->box, cpu); } -struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) -{ - /* - * perf core schedules event on the basis of cpu, uncore events are - * collected by one of the cpus inside a physical package. - */ - return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); -} - u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) { u64 count; @@ -690,6 +676,7 @@ static int uncore_pmu_event_init(struct perf_event *event) if (!box || box->cpu < 0) return -EINVAL; event->cpu = box->cpu; + event->pmu_private = box; event->hw.idx = -1; event->hw.last_tag = ~0ULL; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index f118c0d..479c1e8 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -319,9 +319,17 @@ static inline bool uncore_box_is_fake(struct intel_uncore_box *box) return (box->phys_id < 0); } -struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event); +static inline struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct intel_uncore_pmu, pmu); +} + +static inline struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) +{ + return event->pmu_private; +} + struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu); -struct intel_uncore_box *uncore_event_to_box(struct perf_event *event); u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event); void uncore_pmu_start_hrtimer(struct intel_uncore_box *box); void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 120e106..96531d2 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -313,6 +313,7 @@ static int snb_uncore_imc_event_init(struct perf_event *event) return -EINVAL; event->cpu = box->cpu; + event->pmu_private = box; event->hw.idx = -1; event->hw.last_tag = ~0ULL; -- cgit v0.10.2 From 1f12e32f4cd5243ae46d8b933181be0d022c6793 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:15 +0000 Subject: x86/topology: Create logical package id For per package oriented services we must be able to rely on the number of CPU packages to be within bounds. Create a tracking facility, which - calculates the number of possible packages depending on nr_cpu_ids after boot - makes sure that the package id is within the number of possible packages. 
If the apic id is outside we map it to a logical package id if there is enough space available. Provide interfaces for drivers to query the mapping and do translations from physical to logical ids. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Toshi Kani Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221011.541071755@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 20c11d1..813384e 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -129,6 +129,8 @@ struct cpuinfo_x86 { u16 booted_cores; /* Physical processor id: */ u16 phys_proc_id; + /* Logical processor id: */ + u16 logical_proc_id; /* Core id: */ u16 cpu_core_id; /* Compute unit id */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 0fb4648..7f991bd5 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -119,12 +119,23 @@ static inline void setup_node_to_cpumask_map(void) { } extern const struct cpumask *cpu_coregroup_mask(int cpu); +#define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id) #define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) #ifdef ENABLE_TOPO_DEFINES #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) + +extern unsigned int __max_logical_packages; +#define topology_max_packages() (__max_logical_packages) +int topology_update_package_map(unsigned int apicid, unsigned int cpu); +extern int topology_phys_to_logical_pkg(unsigned int pkg); +#else +#define topology_max_packages() (1) +static inline int +topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; } +static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } #endif static inline void arch_fix_phys_package_id(int num, u32 slot) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8a5cdda..531b961 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2078,6 +2078,20 @@ int generic_processor_info(int apicid, int version) cpu = cpumask_next_zero(-1, cpu_present_mask); /* + * This can happen on physical hotplug. The sanity check at boot time + * is done from native_smp_prepare_cpus() after num_possible_cpus() is + * established. + */ + if (topology_update_package_map(apicid, cpu) < 0) { + int thiscpu = max + disabled_cpus; + + pr_warning("ACPI: Package limit reached. 
Processor %d/0x%x ignored.\n", + thiscpu, apicid); + disabled_cpus++; + return -ENOSPC; + } + + /* * Validate version */ if (version == 0x0) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 68a80e9..81cf716 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -975,6 +975,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif + /* The boot/hotplug time assigment got cleared, restore it */ + c->logical_proc_id = topology_phys_to_logical_pkg(c->phys_proc_id); } /* diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 05b9211..38766c2 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -160,6 +160,19 @@ static void early_init_intel(struct cpuinfo_x86 *c) pr_info("Disabling PGE capability bit\n"); setup_clear_cpu_cap(X86_FEATURE_PGE); } + + if (c->cpuid_level >= 0x00000001) { + u32 eax, ebx, ecx, edx; + + cpuid(0x00000001, &eax, &ebx, &ecx, &edx); + /* + * If HTT (EDX[28]) is set EBX[16:23] contain the number of + * apicids which are reserved per package. Store the resulting + * shift value for the package management code. + */ + if (edx & (1U << 28)) + c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); + } } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 24d57f7..3bf1e0b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -97,6 +97,14 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); +/* Logical package management. We might want to allocate that dynamically */ +static int *physical_to_logical_pkg __read_mostly; +static unsigned long *physical_package_map __read_mostly;; +static unsigned long *logical_package_map __read_mostly; +static unsigned int max_physical_pkg_id __read_mostly; +unsigned int __max_logical_packages __read_mostly; +EXPORT_SYMBOL(__max_logical_packages); + static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { unsigned long flags; @@ -251,6 +259,97 @@ static void notrace start_secondary(void *unused) cpu_startup_entry(CPUHP_ONLINE); } +int topology_update_package_map(unsigned int apicid, unsigned int cpu) +{ + unsigned int new, pkg = apicid >> boot_cpu_data.x86_coreid_bits; + + /* Called from early boot ? 
*/ + if (!physical_package_map) + return 0; + + if (pkg >= max_physical_pkg_id) + return -EINVAL; + + /* Set the logical package id */ + if (test_and_set_bit(pkg, physical_package_map)) + goto found; + + if (pkg < __max_logical_packages) { + set_bit(pkg, logical_package_map); + physical_to_logical_pkg[pkg] = pkg; + goto found; + } + new = find_first_zero_bit(logical_package_map, __max_logical_packages); + if (new >= __max_logical_packages) { + physical_to_logical_pkg[pkg] = -1; + pr_warn("APIC(%x) Package %u exceeds logical package map\n", + apicid, pkg); + return -ENOSPC; + } + set_bit(new, logical_package_map); + pr_info("APIC(%x) Converting physical %u to logical package %u\n", + apicid, pkg, new); + physical_to_logical_pkg[pkg] = new; + +found: + cpu_data(cpu).logical_proc_id = physical_to_logical_pkg[pkg]; + return 0; +} + +/** + * topology_phys_to_logical_pkg - Map a physical package id to a logical + * + * Returns logical package id or -1 if not found + */ +int topology_phys_to_logical_pkg(unsigned int phys_pkg) +{ + if (phys_pkg >= max_physical_pkg_id) + return -1; + return physical_to_logical_pkg[phys_pkg]; +} +EXPORT_SYMBOL(topology_phys_to_logical_pkg); + +static void __init smp_init_package_map(void) +{ + unsigned int ncpus, cpu; + size_t size; + + /* + * Today neither Intel nor AMD support heterogenous systems. That + * might change in the future.... + */ + ncpus = boot_cpu_data.x86_max_cores * smp_num_siblings; + __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus); + + /* + * Possibly larger than what we need as the number of apic ids per + * package can be smaller than the actual used apic ids. + */ + max_physical_pkg_id = DIV_ROUND_UP(MAX_LOCAL_APIC, ncpus); + size = max_physical_pkg_id * sizeof(unsigned int); + physical_to_logical_pkg = kmalloc(size, GFP_KERNEL); + memset(physical_to_logical_pkg, 0xff, size); + size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long); + physical_package_map = kzalloc(size, GFP_KERNEL); + size = BITS_TO_LONGS(__max_logical_packages) * sizeof(unsigned long); + logical_package_map = kzalloc(size, GFP_KERNEL); + + pr_info("Max logical packages: %u\n", __max_logical_packages); + + for_each_present_cpu(cpu) { + unsigned int apicid = apic->cpu_present_to_apicid(cpu); + + if (apicid == BAD_APICID || !apic->apic_id_valid(apicid)) + continue; + if (!topology_update_package_map(apicid, cpu)) + continue; + pr_warn("CPU %u APICId %x disabled\n", cpu, apicid); + per_cpu(x86_bios_cpu_apicid, cpu) = BAD_APICID; + set_cpu_possible(cpu, false); + set_cpu_present(cpu, false); + } +} + void __init smp_store_boot_cpu_info(void) { int id = 0; /* CPU 0 */ @@ -258,6 +357,7 @@ void __init smp_store_boot_cpu_info(void) *c = boot_cpu_data; c->cpu_index = id; + smp_init_package_map(); } /* -- cgit v0.10.2 From cf6d445f68974d0b15a14cf6021be38a91f2b5d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:16 +0000 Subject: perf/x86/uncore: Track packages, not per CPU data Uncore is a per package facility, but the code tries to mimick a per CPU facility with completely convoluted constructs. Simplify the whole machinery by tracking per package information. While at it, avoid the kfree/alloc dance when a CPU goes offline and online again. There is no point in freeing the box after it was allocated. We just keep proper refcounting and the first CPU which comes online in a package does the initialization/activation of the box. 
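The activation scheme reduces to a refcount per package: the first CPU of a package to come online initializes the shared state, later CPUs only bump the count, and the last CPU to go down tears it down. Below is a compilable sketch of that pattern using C11 atomics with hypothetical init/exit hooks; it illustrates the idea only and is not the actual uncore callbacks.

#include <stdatomic.h>
#include <stdio.h>

struct box {
	atomic_int refcnt;
	int initialized;
};

static void box_init(struct box *b) { b->initialized = 1; }	/* hardware setup would go here */
static void box_exit(struct box *b) { b->initialized = 0; }	/* hardware teardown would go here */

/* A CPU of this package comes online. */
static void cpu_online(struct box *b)
{
	if (atomic_fetch_add(&b->refcnt, 1) == 0)
		box_init(b);		/* first CPU in the package */
}

/* A CPU of this package goes offline. */
static void cpu_offline(struct box *b)
{
	if (atomic_fetch_sub(&b->refcnt, 1) == 1)
		box_exit(b);		/* last CPU in the package */
}

int main(void)
{
	struct box b = { 0 };

	cpu_online(&b);			/* initializes the box */
	cpu_online(&b);			/* only takes a reference */
	cpu_offline(&b);
	cpu_offline(&b);		/* tears the box down */
	printf("initialized: %d\n", b.initialized);
	return 0;
}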
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221011.622258933@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 220f81c..096fc0d 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -9,9 +9,9 @@ struct pci_driver *uncore_pci_driver; /* pci bus to socket mapping */ DEFINE_RAW_SPINLOCK(pci2phy_map_lock); struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); -struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; +struct pci_extra_dev *uncore_extra_pci_dev; +static int max_packages; -static DEFINE_RAW_SPINLOCK(uncore_box_lock); /* mask of cpus that collect uncore events */ static cpumask_t uncore_cpu_mask; @@ -94,27 +94,7 @@ ssize_t uncore_event_show(struct kobject *kobj, struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) { - struct intel_uncore_box *box; - - box = *per_cpu_ptr(pmu->box, cpu); - if (box) - return box; - - raw_spin_lock(&uncore_box_lock); - /* Recheck in lock to handle races. */ - if (*per_cpu_ptr(pmu->box, cpu)) - goto out; - list_for_each_entry(box, &pmu->box_list, list) { - if (box->phys_id == topology_physical_package_id(cpu)) { - atomic_inc(&box->refcnt); - *per_cpu_ptr(pmu->box, cpu) = box; - break; - } - } -out: - raw_spin_unlock(&uncore_box_lock); - - return *per_cpu_ptr(pmu->box, cpu); + return pmu->boxes[topology_logical_package_id(cpu)]; } u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) @@ -315,9 +295,9 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, raw_spin_lock_init(&box->shared_regs[i].lock); uncore_pmu_init_hrtimer(box); - atomic_set(&box->refcnt, 1); box->cpu = -1; - box->phys_id = -1; + box->pci_phys_id = -1; + box->pkgid = -1; /* set default hrtimer timeout */ box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL; @@ -774,14 +754,24 @@ static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu) pmu->registered = false; } +static void uncore_free_boxes(struct intel_uncore_pmu *pmu) +{ + int pkg; + + for (pkg = 0; pkg < max_packages; pkg++) + kfree(pmu->boxes[pkg]); + kfree(pmu->boxes); +} + static void __init uncore_type_exit(struct intel_uncore_type *type) { + struct intel_uncore_pmu *pmu = type->pmus; int i; - if (type->pmus) { - for (i = 0; i < type->num_boxes; i++) { - uncore_pmu_unregister(&type->pmus[i]); - free_percpu(type->pmus[i].box); + if (pmu) { + for (i = 0; i < type->num_boxes; i++, pmu++) { + uncore_pmu_unregister(pmu); + uncore_free_boxes(pmu); } kfree(type->pmus); type->pmus = NULL; @@ -796,37 +786,36 @@ static void __init uncore_types_exit(struct intel_uncore_type **types) uncore_type_exit(*types); } -static int __init uncore_type_init(struct intel_uncore_type *type) +static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) { struct intel_uncore_pmu *pmus; struct attribute_group *attr_group; struct attribute **attrs; + size_t size; int i, j; pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL); if (!pmus) return -ENOMEM; - type->pmus = pmus; - - type->unconstrainted = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, - 0, 
type->num_counters, 0, 0); + size = max_packages * sizeof(struct intel_uncore_box *); for (i = 0; i < type->num_boxes; i++) { - pmus[i].func_id = -1; - pmus[i].pmu_idx = i; - pmus[i].type = type; - INIT_LIST_HEAD(&pmus[i].box_list); - pmus[i].box = alloc_percpu(struct intel_uncore_box *); - if (!pmus[i].box) + pmus[i].func_id = setid ? i : -1; + pmus[i].pmu_idx = i; + pmus[i].type = type; + pmus[i].boxes = kzalloc(size, GFP_KERNEL); + if (!pmus[i].boxes) return -ENOMEM; } + type->pmus = pmus; + type->unconstrainted = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, + 0, type->num_counters, 0, 0); + if (type->event_descs) { - i = 0; - while (type->event_descs[i].attr.attr.name) - i++; + for (i = 0; type->event_descs[i].attr.attr.name; i++); attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) + sizeof(*attr_group), GFP_KERNEL); @@ -847,12 +836,13 @@ static int __init uncore_type_init(struct intel_uncore_type *type) return 0; } -static int __init uncore_types_init(struct intel_uncore_type **types) +static int __init +uncore_types_init(struct intel_uncore_type **types, bool setid) { - int i, ret; + int ret; - for (i = 0; types[i]; i++) { - ret = uncore_type_init(types[i]); + for (; *types; types++) { + ret = uncore_type_init(*types, setid); if (ret) return ret; } @@ -864,28 +854,28 @@ static int __init uncore_types_init(struct intel_uncore_type **types) */ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { + struct intel_uncore_type *type; struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; - struct intel_uncore_type *type; - bool first_box = false; - int phys_id, ret; + int phys_id, pkg, ret; phys_id = uncore_pcibus_to_physid(pdev->bus); - if (phys_id < 0 || phys_id >= UNCORE_SOCKET_MAX) + if (phys_id < 0) return -ENODEV; + pkg = topology_phys_to_logical_pkg(phys_id); + if (WARN_ON_ONCE(pkg < 0)) + return -EINVAL; + if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { int idx = UNCORE_PCI_DEV_IDX(id->driver_data); - uncore_extra_pci_dev[phys_id][idx] = pdev; + + uncore_extra_pci_dev[pkg].dev[idx] = pdev; pci_set_drvdata(pdev, NULL); return 0; } type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; - box = uncore_alloc_box(type, NUMA_NO_NODE); - if (!box) - return -ENOMEM; - /* * for performance monitoring unit with multiple boxes, * each box has a different function id. 
@@ -902,32 +892,35 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id pmu++; } + if (WARN_ON_ONCE(pmu->boxes[pkg] != NULL)) + return -EINVAL; + + box = uncore_alloc_box(type, NUMA_NO_NODE); + if (!box) + return -ENOMEM; + if (pmu->func_id < 0) pmu->func_id = pdev->devfn; else WARN_ON_ONCE(pmu->func_id != pdev->devfn); - box->phys_id = phys_id; + atomic_inc(&box->refcnt); + box->pci_phys_id = phys_id; + box->pkgid = pkg; box->pci_dev = pdev; box->pmu = pmu; uncore_box_init(box); pci_set_drvdata(pdev, box); - raw_spin_lock(&uncore_box_lock); - if (list_empty(&pmu->box_list)) - first_box = true; - list_add_tail(&box->list, &pmu->box_list); - raw_spin_unlock(&uncore_box_lock); - - if (!first_box) + pmu->boxes[pkg] = box; + if (atomic_inc_return(&pmu->activeboxes) > 1) return 0; + /* First active box registers the pmu */ ret = uncore_pmu_register(pmu); if (ret) { pci_set_drvdata(pdev, NULL); - raw_spin_lock(&uncore_box_lock); - list_del(&box->list); - raw_spin_unlock(&uncore_box_lock); + pmu->boxes[pkg] = NULL; uncore_box_exit(box); kfree(box); } @@ -938,15 +931,16 @@ static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box = pci_get_drvdata(pdev); struct intel_uncore_pmu *pmu; - int i, cpu, phys_id; - bool last_box = false; + int i, phys_id, pkg; phys_id = uncore_pcibus_to_physid(pdev->bus); + pkg = topology_phys_to_logical_pkg(phys_id); + box = pci_get_drvdata(pdev); if (!box) { for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { - if (uncore_extra_pci_dev[phys_id][i] == pdev) { - uncore_extra_pci_dev[phys_id][i] = NULL; + if (uncore_extra_pci_dev[pkg].dev[i] == pdev) { + uncore_extra_pci_dev[pkg].dev[i] = NULL; break; } } @@ -955,34 +949,20 @@ static void uncore_pci_remove(struct pci_dev *pdev) } pmu = box->pmu; - if (WARN_ON_ONCE(phys_id != box->phys_id)) + if (WARN_ON_ONCE(phys_id != box->pci_phys_id)) return; pci_set_drvdata(pdev, NULL); - - raw_spin_lock(&uncore_box_lock); - list_del(&box->list); - if (list_empty(&pmu->box_list)) - last_box = true; - raw_spin_unlock(&uncore_box_lock); - - for_each_possible_cpu(cpu) { - if (*per_cpu_ptr(pmu->box, cpu) == box) { - *per_cpu_ptr(pmu->box, cpu) = NULL; - atomic_dec(&box->refcnt); - } - } - - WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); + pmu->boxes[pkg] = NULL; + if (atomic_dec_return(&pmu->activeboxes) == 0) + uncore_pmu_unregister(pmu); uncore_box_exit(box); kfree(box); - - if (last_box) - uncore_pmu_unregister(pmu); } static int __init uncore_pci_init(void) { + size_t size; int ret; switch (boot_cpu_data.x86_model) { @@ -1025,24 +1005,34 @@ static int __init uncore_pci_init(void) if (ret) return ret; - ret = uncore_types_init(uncore_pci_uncores); - if (ret) + size = max_packages * sizeof(struct pci_extra_dev); + uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL); + if (!uncore_extra_pci_dev) { + ret = -ENOMEM; goto err; + } + + ret = uncore_types_init(uncore_pci_uncores, false); + if (ret) + goto errtype; uncore_pci_driver->probe = uncore_pci_probe; uncore_pci_driver->remove = uncore_pci_remove; ret = pci_register_driver(uncore_pci_driver); if (ret) - goto err; + goto errtype; pcidrv_registered = true; return 0; -err: +errtype: uncore_types_exit(uncore_pci_uncores); - uncore_pci_uncores = empty_uncore; + kfree(uncore_extra_pci_dev); + uncore_extra_pci_dev = NULL; uncore_free_pcibus_map(); +err: + uncore_pci_uncores = empty_uncore; return ret; } @@ -1052,110 +1042,81 @@ static void __init uncore_pci_exit(void) pcidrv_registered = false; pci_unregister_driver(uncore_pci_driver); 
uncore_types_exit(uncore_pci_uncores); + kfree(uncore_extra_pci_dev); uncore_free_pcibus_map(); } } -/* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */ -static LIST_HEAD(boxes_to_free); - -static void uncore_kfree_boxes(void) -{ - struct intel_uncore_box *box; - - while (!list_empty(&boxes_to_free)) { - box = list_entry(boxes_to_free.next, - struct intel_uncore_box, list); - list_del(&box->list); - kfree(box); - } -} - static void uncore_cpu_dying(int cpu) { - struct intel_uncore_type *type; + struct intel_uncore_type *type, **types = uncore_msr_uncores; struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; - int i, j; + int i, pkg; - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - box = *per_cpu_ptr(pmu->box, cpu); - *per_cpu_ptr(pmu->box, cpu) = NULL; - if (box && atomic_dec_and_test(&box->refcnt)) { - list_add(&box->list, &boxes_to_free); + pkg = topology_logical_package_id(cpu); + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (box && atomic_dec_return(&box->refcnt) == 0) uncore_box_exit(box); - } } } } -static int uncore_cpu_starting(int cpu) +static void uncore_cpu_starting(int cpu, bool init) { - struct intel_uncore_type *type; + struct intel_uncore_type *type, **types = uncore_msr_uncores; struct intel_uncore_pmu *pmu; - struct intel_uncore_box *box, *exist; - int i, j, k, phys_id; - - phys_id = topology_physical_package_id(cpu); - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - box = *per_cpu_ptr(pmu->box, cpu); - /* called by uncore_cpu_init? */ - if (box && box->phys_id >= 0) { - uncore_box_init(box); - continue; - } + struct intel_uncore_box *box; + int i, pkg, ncpus = 1; - for_each_online_cpu(k) { - exist = *per_cpu_ptr(pmu->box, k); - if (exist && exist->phys_id == phys_id) { - atomic_inc(&exist->refcnt); - *per_cpu_ptr(pmu->box, cpu) = exist; - if (box) { - list_add(&box->list, - &boxes_to_free); - box = NULL; - } - break; - } - } + if (init) { + /* + * On init we get the number of online cpus in the package + * and set refcount for all of them. 
+ */ + ncpus = cpumask_weight(topology_core_cpumask(cpu)); + } - if (box) { - box->phys_id = phys_id; + pkg = topology_logical_package_id(cpu); + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (!box) + continue; + /* The first cpu on a package activates the box */ + if (atomic_add_return(ncpus, &box->refcnt) == ncpus) uncore_box_init(box); - } } } - return 0; } -static int uncore_cpu_prepare(int cpu, int phys_id) +static int uncore_cpu_prepare(int cpu) { - struct intel_uncore_type *type; + struct intel_uncore_type *type, **types = uncore_msr_uncores; struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; - int i, j; - - for (i = 0; uncore_msr_uncores[i]; i++) { - type = uncore_msr_uncores[i]; - for (j = 0; j < type->num_boxes; j++) { - pmu = &type->pmus[j]; - if (pmu->func_id < 0) - pmu->func_id = j; + int i, pkg; + pkg = topology_logical_package_id(cpu); + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + if (pmu->boxes[pkg]) + continue; + /* First cpu of a package allocates the box */ box = uncore_alloc_box(type, cpu_to_node(cpu)); if (!box) return -ENOMEM; - box->pmu = pmu; - box->phys_id = phys_id; - *per_cpu_ptr(pmu->box, cpu) = box; + box->pkgid = pkg; + pmu->boxes[pkg] = box; } } return 0; @@ -1166,13 +1127,11 @@ static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu, { struct intel_uncore_pmu *pmu = type->pmus; struct intel_uncore_box *box; - int i; + int i, pkg; + pkg = topology_logical_package_id(old_cpu < 0 ? new_cpu : old_cpu); for (i = 0; i < type->num_boxes; i++, pmu++) { - if (old_cpu < 0) - box = uncore_pmu_to_box(pmu, new_cpu); - else - box = uncore_pmu_to_box(pmu, old_cpu); + box = pmu->boxes[pkg]; if (!box) continue; @@ -1202,27 +1161,20 @@ static void uncore_change_context(struct intel_uncore_type **uncores, static void uncore_event_exit_cpu(int cpu) { - int i, phys_id, target; + int target; - /* if exiting cpu is used for collecting uncore events */ + /* Check if exiting cpu is used for collecting uncore events */ if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) return; - /* find a new cpu to collect uncore events */ - phys_id = topology_physical_package_id(cpu); - target = -1; - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (phys_id == topology_physical_package_id(i)) { - target = i; - break; - } - } + /* Find a new cpu to collect uncore events */ + target = cpumask_any_but(topology_core_cpumask(cpu), cpu); - /* migrate uncore events to the new cpu */ - if (target >= 0) + /* Migrate uncore events to the new target */ + if (target < nr_cpu_ids) cpumask_set_cpu(target, &uncore_cpu_mask); + else + target = -1; uncore_change_context(uncore_msr_uncores, cpu, target); uncore_change_context(uncore_pci_uncores, cpu, target); @@ -1230,13 +1182,15 @@ static void uncore_event_exit_cpu(int cpu) static void uncore_event_init_cpu(int cpu) { - int i, phys_id; + int target; - phys_id = topology_physical_package_id(cpu); - for_each_cpu(i, &uncore_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) - return; - } + /* + * Check if there is an online cpu in the package + * which collects uncore events already. 
+ */ + target = cpumask_any_and(&uncore_cpu_mask, topology_core_cpumask(cpu)); + if (target < nr_cpu_ids) + return; cpumask_set_cpu(cpu, &uncore_cpu_mask); @@ -1249,38 +1203,25 @@ static int uncore_cpu_notifier(struct notifier_block *self, { unsigned int cpu = (long)hcpu; - /* allocate/free data structure for uncore box */ switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - return notifier_from_errno(uncore_cpu_prepare(cpu, -1)); + return notifier_from_errno(uncore_cpu_prepare(cpu)); + case CPU_STARTING: - uncore_cpu_starting(cpu); + uncore_cpu_starting(cpu, false); + case CPU_DOWN_FAILED: + uncore_event_init_cpu(cpu); break; + case CPU_UP_CANCELED: case CPU_DYING: uncore_cpu_dying(cpu); break; - case CPU_ONLINE: - case CPU_DEAD: - uncore_kfree_boxes(); - break; - default: - break; - } - /* select the cpu that collects uncore events */ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_FAILED: - case CPU_STARTING: - uncore_event_init_cpu(cpu); - break; case CPU_DOWN_PREPARE: uncore_event_exit_cpu(cpu); break; - default: - break; } - return NOTIFY_OK; } @@ -1362,7 +1303,7 @@ static int __init uncore_cpu_init(void) return 0; } - ret = uncore_types_init(uncore_msr_uncores); + ret = uncore_types_init(uncore_msr_uncores, true); if (ret) goto err; @@ -1378,39 +1319,34 @@ err: static void __init uncore_cpu_setup(void *dummy) { - uncore_cpu_starting(smp_processor_id()); + uncore_cpu_starting(smp_processor_id(), true); } +/* Lazy to avoid allocation of a few bytes for the normal case */ +static __initdata DECLARE_BITMAP(packages, MAX_LOCAL_APIC); + static int __init uncore_cpumask_init(void) { - int cpu, ret = 0; - - cpu_notifier_register_begin(); + unsigned int cpu; for_each_online_cpu(cpu) { - int i, phys_id = topology_physical_package_id(cpu); + unsigned int pkg = topology_logical_package_id(cpu); + int ret; - for_each_cpu(i, &uncore_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) { - phys_id = -1; - break; - } - } - if (phys_id < 0) + if (test_and_set_bit(pkg, packages)) continue; - - ret = uncore_cpu_prepare(cpu, phys_id); + /* + * The first online cpu of each package takes the refcounts + * for all other online cpus in that package. 
+ */ + ret = uncore_cpu_prepare(cpu); if (ret) - goto out; + return ret; uncore_event_init_cpu(cpu); + smp_call_function_single(cpu, uncore_cpu_setup, NULL, 1); } - on_each_cpu(uncore_cpu_setup, NULL, 1); - __register_cpu_notifier(&uncore_cpu_nb); - -out: - cpu_notifier_register_done(); - return ret; + return 0; } static int __init intel_uncore_init(void) @@ -1423,22 +1359,26 @@ static int __init intel_uncore_init(void) if (cpu_has_hypervisor) return -ENODEV; + max_packages = topology_max_packages(); + ret = uncore_pci_init(); if (ret) return ret; ret = uncore_cpu_init(); if (ret) - goto errpci; + goto err; + + cpu_notifier_register_begin(); ret = uncore_cpumask_init(); if (ret) - goto errcpu; - + goto err; + cpu_notifier_register_done(); return 0; -errcpu: +err: uncore_types_exit(uncore_msr_uncores); -errpci: uncore_pci_exit(); + cpu_notifier_register_done(); return ret; } device_initcall(intel_uncore_init); diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 479c1e8..01c6793 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -19,11 +19,12 @@ #define UNCORE_EXTRA_PCI_DEV 0xff #define UNCORE_EXTRA_PCI_DEV_MAX 3 -/* support up to 8 sockets */ -#define UNCORE_SOCKET_MAX 8 - #define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) +struct pci_extra_dev { + struct pci_dev *dev[UNCORE_EXTRA_PCI_DEV_MAX]; +}; + struct intel_uncore_ops; struct intel_uncore_pmu; struct intel_uncore_box; @@ -79,9 +80,9 @@ struct intel_uncore_pmu { int pmu_idx; int func_id; bool registered; + atomic_t activeboxes; struct intel_uncore_type *type; - struct intel_uncore_box ** __percpu box; - struct list_head box_list; + struct intel_uncore_box **boxes; }; struct intel_uncore_extra_reg { @@ -91,7 +92,8 @@ struct intel_uncore_extra_reg { }; struct intel_uncore_box { - int phys_id; + int pci_phys_id; + int pkgid; int n_active; /* number of active events */ int n_events; int cpu; /* cpu to collect events */ @@ -316,7 +318,7 @@ static inline void uncore_box_exit(struct intel_uncore_box *box) static inline bool uncore_box_is_fake(struct intel_uncore_box *box) { - return (box->phys_id < 0); + return (box->pkgid < 0); } static inline struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) @@ -345,7 +347,7 @@ extern struct intel_uncore_type **uncore_pci_uncores; extern struct pci_driver *uncore_pci_driver; extern raw_spinlock_t pci2phy_map_lock; extern struct list_head pci2phy_map_head; -extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; +extern struct pci_extra_dev *uncore_extra_pci_dev; extern struct event_constraint uncore_constraint_empty; /* perf_event_intel_uncore_snb.c */ diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index d967fcc..93f6bd9 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -986,7 +986,9 @@ static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_eve if (reg1->idx != EXTRA_REG_NONE) { int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER; - struct pci_dev *filter_pdev = uncore_extra_pci_dev[box->phys_id][idx]; + int pkg = topology_phys_to_logical_pkg(box->pci_phys_id); + struct pci_dev *filter_pdev = uncore_extra_pci_dev[pkg].dev[idx]; + if (filter_pdev) { pci_write_config_dword(filter_pdev, reg1->reg, (u32)reg1->config); @@ -2520,14 +2522,16 @@ static struct intel_uncore_type *hswep_msr_uncores[] = { void hswep_uncore_cpu_init(void) { + int pkg = 
topology_phys_to_logical_pkg(0); + if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; /* Detect 6-8 core systems with only two SBOXes */ - if (uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3]) { + if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) { u32 capid4; - pci_read_config_dword(uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3], + pci_read_config_dword(uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3], 0x94, &capid4); if (((capid4 >> 6) & 0x3) == 0) hswep_uncore_sbox.num_boxes = 2; -- cgit v0.10.2 From 7b672d6433f0f31c95be74b649090cd8b3a6388d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:17 +0000 Subject: perf/x86/intel/uncore: Clear all hardware state on exit The only missing bit is to completely clear the hardware state on failure exit. This is now a pretty simple exercise. Undo the box->init_box() setup on all packages which have been initialized so far. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221011.702452407@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 096fc0d..5ad8e52 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -754,6 +754,30 @@ static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu) pmu->registered = false; } +static void __init __uncore_exit_boxes(struct intel_uncore_type *type, int cpu) +{ + struct intel_uncore_pmu *pmu = type->pmus; + struct intel_uncore_box *box; + int i, pkg; + + if (pmu) { + pkg = topology_physical_package_id(cpu); + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[pkg]; + if (box) + uncore_box_exit(box); + } + } +} + +static void __init uncore_exit_boxes(void *dummy) +{ + struct intel_uncore_type **types; + + for (types = uncore_msr_uncores; *types; types++) + __uncore_exit_boxes(*types++, smp_processor_id()); +} + static void uncore_free_boxes(struct intel_uncore_pmu *pmu) { int pkg; @@ -1376,6 +1400,8 @@ static int __init intel_uncore_init(void) return 0; err: + /* Undo box->init_box() */ + on_each_cpu_mask(&uncore_cpu_mask, uncore_exit_boxes, NULL, 1); uncore_types_exit(uncore_msr_uncores); uncore_pci_exit(); cpu_notifier_register_done(); -- cgit v0.10.2 From 5485592c17954b2a91fbd12980af596dc666079d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:17 +0000 Subject: perf/x86/intel/uncore: Make PCI and MSR uncore independent Andi wanted to do this before, but the patch fell down the cracks. Implement it with the proper error handling. 
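Stated in isolation, the structure is: probe the PCI and MSR parts independently and abort only when neither found usable hardware, so a machine with just one of the two facilities still gets a working driver. A minimal sketch under that assumption, with placeholder probe functions rather than the driver's real init paths:

#include <errno.h>
#include <stdio.h>

/* Placeholder probes: return 0 on success, -ENODEV when the facility is absent. */
static int pci_part_init(void) { return -ENODEV; }	/* pretend no PCI uncore here */
static int msr_part_init(void) { return 0; }		/* MSR uncore present */

static int driver_init(void)
{
	int pret = pci_part_init();
	int cret = msr_part_init();

	/* Give up only if both halves failed. */
	if (pret && cret)
		return -ENODEV;
	return 0;
}

int main(void)
{
	printf("driver_init() = %d\n", driver_init());
	return 0;
}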
Requested-by: Andi Kleen Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221011.799159968@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 5ad8e52..7012d18 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1023,7 +1023,7 @@ static int __init uncore_pci_init(void) ret = skl_uncore_pci_init(); break; default: - return 0; + return -ENODEV; } if (ret) @@ -1324,7 +1324,7 @@ static int __init uncore_cpu_init(void) knl_uncore_cpu_init(); break; default: - return 0; + return -ENODEV; } ret = uncore_types_init(uncore_msr_uncores, true); @@ -1349,7 +1349,7 @@ static void __init uncore_cpu_setup(void *dummy) /* Lazy to avoid allocation of a few bytes for the normal case */ static __initdata DECLARE_BITMAP(packages, MAX_LOCAL_APIC); -static int __init uncore_cpumask_init(void) +static int __init uncore_cpumask_init(bool msr) { unsigned int cpu; @@ -1360,12 +1360,15 @@ static int __init uncore_cpumask_init(void) if (test_and_set_bit(pkg, packages)) continue; /* - * The first online cpu of each package takes the refcounts - * for all other online cpus in that package. + * The first online cpu of each package allocates and takes + * the refcounts for all other online cpus in that package. + * If msrs are not enabled no allocation is required. */ - ret = uncore_cpu_prepare(cpu); - if (ret) - return ret; + if (msr) { + ret = uncore_cpu_prepare(cpu); + if (ret) + return ret; + } uncore_event_init_cpu(cpu); smp_call_function_single(cpu, uncore_cpu_setup, NULL, 1); } @@ -1375,7 +1378,7 @@ static int __init uncore_cpumask_init(void) static int __init intel_uncore_init(void) { - int ret; + int pret, cret, ret; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return -ENODEV; @@ -1385,15 +1388,14 @@ static int __init intel_uncore_init(void) max_packages = topology_max_packages(); - ret = uncore_pci_init(); - if (ret) - return ret; - ret = uncore_cpu_init(); - if (ret) - goto err; + pret = uncore_pci_init(); + cret = uncore_cpu_init(); + + if (cret && pret) + return -ENODEV; cpu_notifier_register_begin(); - ret = uncore_cpumask_init(); + ret = uncore_cpumask_init(!cret); if (ret) goto err; cpu_notifier_register_done(); -- cgit v0.10.2 From 3712bba1a260ad851f3aa8ddea9cb7326f6aa0b3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:18 +0000 Subject: cpumask: Export cpumask_any_but() Almost every cpumask function is exported, just not the one I need to make the Intel uncore driver modular. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: David S. 
Miller Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rusty Russell Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221011.878299859@linutronix.de Signed-off-by: Ingo Molnar diff --git a/lib/cpumask.c b/lib/cpumask.c index 5a70f61..81dedaa 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -41,6 +41,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) break; return i; } +EXPORT_SYMBOL(cpumask_any_but); /* These are not inline because of header tangles. */ #ifdef CONFIG_CPUMASK_OFFSTACK -- cgit v0.10.2 From 827db839cd051cfde5a618fb4427e86dc91bc9aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:20 +0000 Subject: perf/x86/intel/cqm: Get rid of the silly for_each_cpu() lookups CQM is a strict per package facility. Use the proper cpumasks to lookup the readers. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.054916179@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 1b064c4..93cb412 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -1244,15 +1244,12 @@ static struct pmu intel_cqm_pmu = { static inline void cqm_pick_event_reader(int cpu) { - int phys_id = topology_physical_package_id(cpu); - int i; + int reader; - for_each_cpu(i, &cqm_cpumask) { - if (phys_id == topology_physical_package_id(i)) - return; /* already got reader for this socket */ - } - - cpumask_set_cpu(cpu, &cqm_cpumask); + /* First online cpu in package becomes the reader */ + reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu)); + if (reader >= nr_cpu_ids) + cpumask_set_cpu(cpu, &cqm_cpumask); } static void intel_cqm_cpu_starting(unsigned int cpu) @@ -1270,24 +1267,17 @@ static void intel_cqm_cpu_starting(unsigned int cpu) static void intel_cqm_cpu_exit(unsigned int cpu) { - int phys_id = topology_physical_package_id(cpu); - int i; + int target; - /* - * Is @cpu a designated cqm reader? - */ + /* Is @cpu the current cqm reader for this package ? */ if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask)) return; - for_each_online_cpu(i) { - if (i == cpu) - continue; + /* Find another online reader in this package */ + target = cpumask_any_but(topology_core_cpumask(cpu), cpu); - if (phys_id == topology_physical_package_id(i)) { - cpumask_set_cpu(i, &cqm_cpumask); - break; - } - } + if (target < nr_cpu_ids) + cpumask_set_cpu(target, &cqm_cpumask); } static int intel_cqm_cpu_notifier(struct notifier_block *nb, -- cgit v0.10.2 From 4d120c535d638a952e644817ba3cbef30deedb2b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:20 +0000 Subject: perf/x86/intel/rapl: Make Knights Landings support functional The Knights Landings support added the events and the detection case, but then returns 0 without actually initializing the driver. 
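The bug class is a plain switch fall-through; a minimal self-contained sketch of it (the model value is real, the helpers are invented for illustration, this is not the driver code):

    #include <stdio.h>

    enum { MODEL_KNL = 87 };

    static int init_for_model(int model)
    {
            switch (model) {
            case MODEL_KNL:
                    printf("set up KNL RAPL events\n");
                    break;  /* the one-line fix: without it, control falls into
                             * "default" and we return before registering anything */
            default:
                    return 0;       /* unsupported model, nothing to do */
            }
            printf("register the PMU\n");
            return 0;
    }

    int main(void)
    {
            return init_for_model(MODEL_KNL);
    }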
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Dasaratharaman Chandramouli Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Fixes: 3a2a7797326a4 "perf/x86/intel/rapl: Add support for Knights Landing (KNL)" Link: http://lkml.kernel.org/r/20160222221012.149331888@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 580f504..536f0ce 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -731,7 +731,7 @@ static int __init rapl_pmu_init(void) rapl_add_quirk(rapl_hsw_server_quirk); rapl_cntr_mask = RAPL_IDX_KNL; rapl_pmu_events_group.attrs = rapl_events_knl_attr; - + break; default: /* unsupported */ return 0; -- cgit v0.10.2 From 55f2890f0726fe4a1f41a3a0e72ca1a263f095c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:21 +0000 Subject: perf/x86/intel/rapl: Add proper error handling Like uncore the rapl driver lacks error handling. It leaks memory and leaves the hotplug notifier registered. Add the proper error checks, cleanup the memory and register the hotplug notifier only on success. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.231222076@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 536f0ce..98b04d2 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -686,6 +686,14 @@ static int rapl_check_hw_unit(void) return 0; } +static void __init cleanup_rapl_pmus(void) +{ + int cpu; + + for_each_online_cpu(cpu) + kfree(per_cpu(rapl_pmu, cpu)); +} + static const struct x86_cpu_id rapl_cpu_match[] = { [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, [1] = {}, @@ -702,7 +710,7 @@ static int __init rapl_pmu_init(void) * check for Intel processor family 6 */ if (!x86_match_cpu(rapl_cpu_match)) - return 0; + return -ENODEV; /* check supported CPU */ switch (boot_cpu_data.x86_model) { @@ -734,8 +742,9 @@ static int __init rapl_pmu_init(void) break; default: /* unsupported */ - return 0; + return -ENODEV; } + ret = rapl_check_hw_unit(); if (ret) return ret; @@ -743,6 +752,7 @@ static int __init rapl_pmu_init(void) /* run cpu model quirks */ for (quirk = rapl_quirks; quirk; quirk = quirk->next) quirk->func(); + cpu_notifier_register_begin(); for_each_online_cpu(cpu) { @@ -752,15 +762,14 @@ static int __init rapl_pmu_init(void) rapl_cpu_init(cpu); } - __perf_cpu_notifier(rapl_cpu_notifier); - ret = perf_pmu_register(&rapl_pmu_class, "power", -1); if (WARN_ON(ret)) { pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); - cpu_notifier_register_done(); - return -1; + goto out; } + __perf_cpu_notifier(rapl_cpu_notifier); + pmu = __this_cpu_read(rapl_pmu); pr_info("RAPL PMU detected," @@ -775,9 +784,13 @@ static int __init rapl_pmu_init(void) rapl_domain_names[i], rapl_hw_unit[i]); } } -out: - cpu_notifier_register_done(); + cpu_notifier_register_done(); return 0; + +out: + cleanup_rapl_pmus(); + cpu_notifier_register_done(); + return ret; } 
device_initcall(rapl_pmu_init); -- cgit v0.10.2 From b8b3319a471b2df35ae0a8fe94223638468e9ca4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:22 +0000 Subject: perf/x86/intel/rapl: Sanitize the quirk handling There is no point in having a quirk machinery for a single possible function. Get rid of it and move the quirk to a place where it actually makes sense. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.311639465@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 98b04d2..976558a 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -133,7 +133,6 @@ static int rapl_cntr_mask; static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); -static struct x86_pmu_quirk *rapl_quirks; static inline u64 rapl_read_counter(struct perf_event *event) { u64 raw; @@ -141,15 +140,6 @@ static inline u64 rapl_read_counter(struct perf_event *event) return raw; } -#define rapl_add_quirk(func_) \ -do { \ - static struct x86_pmu_quirk __quirk __initdata = { \ - .func = func_, \ - }; \ - __quirk.next = rapl_quirks; \ - rapl_quirks = &__quirk; \ -} while (0) - static inline u64 rapl_scale(u64 v, int cfg) { if (cfg > NR_RAPL_DOMAINS) { @@ -564,17 +554,6 @@ static void rapl_cpu_init(int cpu) cpumask_set_cpu(cpu, &rapl_cpu_mask); } -static __init void rapl_hsw_server_quirk(void) -{ - /* - * DRAM domain on HSW server has fixed energy unit which can be - * different than the unit from power unit MSR. - * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 - * of 2. Datasheet, September 2014, Reference Number: 330784-001 " - */ - rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; -} - static int rapl_cpu_prepare(int cpu) { struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); @@ -672,7 +651,18 @@ static int rapl_cpu_notifier(struct notifier_block *self, return NOTIFY_OK; } -static int rapl_check_hw_unit(void) +static __init void rapl_hsw_server_quirk(void) +{ + /* + * DRAM domain on HSW server has fixed energy unit which can be + * different than the unit from power unit MSR. + * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 + * of 2. 
Datasheet, September 2014, Reference Number: 330784-001 " + */ + rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; +} + +static int rapl_check_hw_unit(void (*quirk)(void)) { u64 msr_rapl_power_unit_bits; int i; @@ -683,6 +673,9 @@ static int rapl_check_hw_unit(void) for (i = 0; i < NR_RAPL_DOMAINS; i++) rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + /* Apply cpu model quirk */ + if (quirk) + quirk(); return 0; } @@ -701,9 +694,9 @@ static const struct x86_cpu_id rapl_cpu_match[] = { static int __init rapl_pmu_init(void) { + void (*quirk)(void) = NULL; struct rapl_pmu *pmu; int cpu, ret; - struct x86_pmu_quirk *quirk; int i; /* @@ -720,7 +713,7 @@ static int __init rapl_pmu_init(void) rapl_pmu_events_group.attrs = rapl_events_cln_attr; break; case 63: /* Haswell-Server */ - rapl_add_quirk(rapl_hsw_server_quirk); + quirk = rapl_hsw_server_quirk; rapl_cntr_mask = RAPL_IDX_SRV; rapl_pmu_events_group.attrs = rapl_events_srv_attr; break; @@ -736,7 +729,7 @@ static int __init rapl_pmu_init(void) rapl_pmu_events_group.attrs = rapl_events_srv_attr; break; case 87: /* Knights Landing */ - rapl_add_quirk(rapl_hsw_server_quirk); + quirk = rapl_hsw_server_quirk; rapl_cntr_mask = RAPL_IDX_KNL; rapl_pmu_events_group.attrs = rapl_events_knl_attr; break; @@ -745,14 +738,10 @@ static int __init rapl_pmu_init(void) return -ENODEV; } - ret = rapl_check_hw_unit(); + ret = rapl_check_hw_unit(quirk); if (ret) return ret; - /* run cpu model quirks */ - for (quirk = rapl_quirks; quirk; quirk = quirk->next) - quirk->func(); - cpu_notifier_register_begin(); for_each_online_cpu(cpu) { -- cgit v0.10.2 From 75c7003fbf419f05215d4ca09bf3964d0b1c99e9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:22 +0000 Subject: perf/x86/intel/rapl: Calculate timing once No point in doing the same calculation over and over. Do it once in rapl_check_hw_unit(). Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.409238136@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 976558a..b1d4a2f 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -129,6 +129,7 @@ static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */ static struct pmu rapl_pmu_class; static cpumask_t rapl_cpu_mask; static int rapl_cntr_mask; +static u64 rapl_timer_ms; static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); @@ -558,7 +559,6 @@ static int rapl_cpu_prepare(int cpu) { struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); int phys_id = topology_physical_package_id(cpu); - u64 ms; if (pmu) return 0; @@ -575,19 +575,7 @@ static int rapl_cpu_prepare(int cpu) pmu->pmu = &rapl_pmu_class; - /* - * use reference of 200W for scaling the timeout - * to avoid missing counter overflows. 
- * 200W = 200 Joules/sec - * divide interval by 2 to avoid lockstep (2 * 100) - * if hw unit is 32, then we use 2 ms 1/200/2 - */ - if (rapl_hw_unit[0] < 32) - ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1)); - else - ms = 2; - - pmu->timer_interval = ms_to_ktime(ms); + pmu->timer_interval = ms_to_ktime(rapl_timer_ms); rapl_hrtimer_init(pmu); @@ -676,6 +664,19 @@ static int rapl_check_hw_unit(void (*quirk)(void)) /* Apply cpu model quirk */ if (quirk) quirk(); + + /* + * Calculate the timer rate: + * Use reference of 200W for scaling the timeout to avoid counter + * overflows. 200W = 200 Joules/sec + * Divide interval by 2 to avoid lockstep (2 * 100) + * if hw unit is 32, then we use 2 ms 1/200/2 + */ + rapl_timer_ms = 2; + if (rapl_hw_unit[0] < 32) { + rapl_timer_ms = (1000 / (2 * 100)); + rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1)); + } return 0; } @@ -695,9 +696,7 @@ static const struct x86_cpu_id rapl_cpu_match[] = { static int __init rapl_pmu_init(void) { void (*quirk)(void) = NULL; - struct rapl_pmu *pmu; - int cpu, ret; - int i; + int cpu, ret, i; /* * check for Intel processor family 6 @@ -758,15 +757,14 @@ static int __init rapl_pmu_init(void) } __perf_cpu_notifier(rapl_cpu_notifier); - - pmu = __this_cpu_read(rapl_pmu); + cpu_notifier_register_done(); pr_info("RAPL PMU detected," " API unit is 2^-32 Joules," " %d fixed counters" " %llu ms ovfl timer\n", hweight32(rapl_cntr_mask), - ktime_to_ms(pmu->timer_interval)); + rapl_timer_ms); for (i = 0; i < NR_RAPL_DOMAINS; i++) { if (rapl_cntr_mask & (1 << i)) { pr_info("hw unit of domain %s 2^-%d Joules\n", @@ -774,7 +772,6 @@ static int __init rapl_pmu_init(void) } } - cpu_notifier_register_done(); return 0; out: -- cgit v0.10.2 From 512089d98457b7913d2e4762a44af52fbcd87470 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:23 +0000 Subject: perf/x86/intel/rapl: Clean up the printk output The output is inconsistent. Use a proper pr_fmt prefix and split out the advertisement into a seperate function. Remove the WARN_ON() in the failure case. It's pointless as we already know where it failed. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.504551295@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index b1d4a2f..f31e4b4 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -44,6 +44,9 @@ * the duration of the measurement. 
Tools may use a function such as * ldexp(raw_count, -32); */ + +#define pr_fmt(fmt) "RAPL PMU: " fmt + #include #include #include @@ -144,7 +147,7 @@ static inline u64 rapl_read_counter(struct perf_event *event) static inline u64 rapl_scale(u64 v, int cfg) { if (cfg > NR_RAPL_DOMAINS) { - pr_warn("invalid domain %d, failed to scale data\n", cfg); + pr_warn("Invalid domain %d, failed to scale data\n", cfg); return v; } /* @@ -680,6 +683,21 @@ static int rapl_check_hw_unit(void (*quirk)(void)) return 0; } +static void __init rapl_advertise(void) +{ + int i; + + pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", + hweight32(rapl_cntr_mask), rapl_timer_ms); + + for (i = 0; i < NR_RAPL_DOMAINS; i++) { + if (rapl_cntr_mask & (1 << i)) { + pr_info("hw unit of domain %s 2^-%d Joules\n", + rapl_domain_names[i], rapl_hw_unit[i]); + } + } +} + static void __init cleanup_rapl_pmus(void) { int cpu; @@ -696,7 +714,7 @@ static const struct x86_cpu_id rapl_cpu_match[] = { static int __init rapl_pmu_init(void) { void (*quirk)(void) = NULL; - int cpu, ret, i; + int cpu, ret; /* * check for Intel processor family 6 @@ -751,30 +769,16 @@ static int __init rapl_pmu_init(void) } ret = perf_pmu_register(&rapl_pmu_class, "power", -1); - if (WARN_ON(ret)) { - pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); + if (ret) goto out; - } __perf_cpu_notifier(rapl_cpu_notifier); cpu_notifier_register_done(); - - pr_info("RAPL PMU detected," - " API unit is 2^-32 Joules," - " %d fixed counters" - " %llu ms ovfl timer\n", - hweight32(rapl_cntr_mask), - rapl_timer_ms); - for (i = 0; i < NR_RAPL_DOMAINS; i++) { - if (rapl_cntr_mask & (1 << i)) { - pr_info("hw unit of domain %s 2^-%d Joules\n", - rapl_domain_names[i], rapl_hw_unit[i]); - } - } - + rapl_advertise(); return 0; out: + pr_warn("Initialization failed (%d), disabled\n", ret); cleanup_rapl_pmus(); cpu_notifier_register_done(); return ret; -- cgit v0.10.2 From 7162b8fea63061b6231bc5e8a0fed55167e71b4c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:24 +0000 Subject: perf/x86/intel/rapl: Refactor the code some more Split out code from init into seperate functions. Tidy up the code and get rid of pointless comments. I wish there would be comments for code which is not obvious.... 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.588544679@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index f31e4b4..ba5043b 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -110,7 +110,7 @@ static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ static struct kobj_attribute format_attr_##_var = \ __ATTR(_name, 0444, __rapl_##_var##_show, NULL) -#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ +#define RAPL_CNTR_WIDTH 32 #define RAPL_EVENT_ATTR_STR(_name, v, str) \ static struct perf_pmu_events_attr event_attr_##v = { \ @@ -120,15 +120,16 @@ static struct perf_pmu_events_attr event_attr_##v = { \ }; struct rapl_pmu { - spinlock_t lock; - int n_active; /* number of active events */ - struct list_head active_list; - struct pmu *pmu; /* pointer to rapl_pmu_class */ - ktime_t timer_interval; /* in ktime_t unit */ - struct hrtimer hrtimer; + spinlock_t lock; + int n_active; + struct list_head active_list; + struct pmu *pmu; + ktime_t timer_interval; + struct hrtimer hrtimer; }; -static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */ + /* 1/2^hw_unit Joule */ +static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; static struct pmu rapl_pmu_class; static cpumask_t rapl_cpu_mask; static int rapl_cntr_mask; @@ -200,11 +201,6 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu) HRTIMER_MODE_REL_PINNED); } -static void rapl_stop_hrtimer(struct rapl_pmu *pmu) -{ - hrtimer_cancel(&pmu->hrtimer); -} - static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) { struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); @@ -216,9 +212,8 @@ static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) spin_lock_irqsave(&pmu->lock, flags); - list_for_each_entry(event, &pmu->active_list, active_entry) { + list_for_each_entry(event, &pmu->active_list, active_entry) rapl_event_update(event); - } spin_unlock_irqrestore(&pmu->lock, flags); @@ -275,7 +270,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) WARN_ON_ONCE(pmu->n_active <= 0); pmu->n_active--; if (pmu->n_active == 0) - rapl_stop_hrtimer(pmu); + hrtimer_cancel(&pmu->hrtimer); list_del(&event->active_entry); @@ -542,7 +537,7 @@ static void rapl_cpu_exit(int cpu) perf_pmu_migrate_context(pmu->pmu, cpu, target); /* cancel overflow polling timer for CPU */ - rapl_stop_hrtimer(pmu); + hrtimer_cancel(&pmu->hrtimer); } static void rapl_cpu_init(int cpu) @@ -698,6 +693,20 @@ static void __init rapl_advertise(void) } } +static int __init rapl_prepare_cpus(void) +{ + unsigned int cpu; + int ret; + + for_each_online_cpu(cpu) { + ret = rapl_cpu_prepare(cpu); + if (ret) + return ret; + rapl_cpu_init(cpu); + } + return 0; +} + static void __init cleanup_rapl_pmus(void) { int cpu; @@ -706,7 +715,7 @@ static void __init cleanup_rapl_pmus(void) kfree(per_cpu(rapl_pmu, cpu)); } -static const struct x86_cpu_id rapl_cpu_match[] = { +static const struct x86_cpu_id rapl_cpu_match[] __initconst = { [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, [1] = {}, }; @@ -714,15 +723,11 @@ static const struct x86_cpu_id rapl_cpu_match[] = { static int __init rapl_pmu_init(void) { void (*quirk)(void) = NULL; - 
int cpu, ret; + int ret; - /* - * check for Intel processor family 6 - */ if (!x86_match_cpu(rapl_cpu_match)) return -ENODEV; - /* check supported CPU */ switch (boot_cpu_data.x86_model) { case 42: /* Sandy Bridge */ case 58: /* Ivy Bridge */ @@ -751,7 +756,6 @@ static int __init rapl_pmu_init(void) rapl_pmu_events_group.attrs = rapl_events_knl_attr; break; default: - /* unsupported */ return -ENODEV; } @@ -761,12 +765,9 @@ static int __init rapl_pmu_init(void) cpu_notifier_register_begin(); - for_each_online_cpu(cpu) { - ret = rapl_cpu_prepare(cpu); - if (ret) - goto out; - rapl_cpu_init(cpu); - } + ret = rapl_prepare_cpus(); + if (ret) + goto out; ret = perf_pmu_register(&rapl_pmu_class, "power", -1); if (ret) -- cgit v0.10.2 From a208749c642618b6c106874153906b53f8e2ded9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:25 +0000 Subject: perf/x86/intel/rapl: Make PMU lock raw This lock is taken in hard interrupt context even on Preempt-RT. Make it raw so RT does not have to patch it. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.669411833@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index ba5043b..2934999 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -120,7 +120,7 @@ static struct perf_pmu_events_attr event_attr_##v = { \ }; struct rapl_pmu { - spinlock_t lock; + raw_spinlock_t lock; int n_active; struct list_head active_list; struct pmu *pmu; @@ -210,12 +210,12 @@ static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) if (!pmu->n_active) return HRTIMER_NORESTART; - spin_lock_irqsave(&pmu->lock, flags); + raw_spin_lock_irqsave(&pmu->lock, flags); list_for_each_entry(event, &pmu->active_list, active_entry) rapl_event_update(event); - spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_unlock_irqrestore(&pmu->lock, flags); hrtimer_forward_now(hrtimer, pmu->timer_interval); @@ -252,9 +252,9 @@ static void rapl_pmu_event_start(struct perf_event *event, int mode) struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); unsigned long flags; - spin_lock_irqsave(&pmu->lock, flags); + raw_spin_lock_irqsave(&pmu->lock, flags); __rapl_pmu_event_start(pmu, event); - spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_unlock_irqrestore(&pmu->lock, flags); } static void rapl_pmu_event_stop(struct perf_event *event, int mode) @@ -263,7 +263,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) struct hw_perf_event *hwc = &event->hw; unsigned long flags; - spin_lock_irqsave(&pmu->lock, flags); + raw_spin_lock_irqsave(&pmu->lock, flags); /* mark event as deactivated and stopped */ if (!(hwc->state & PERF_HES_STOPPED)) { @@ -288,7 +288,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) hwc->state |= PERF_HES_UPTODATE; } - spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_unlock_irqrestore(&pmu->lock, flags); } static int rapl_pmu_event_add(struct perf_event *event, int mode) @@ -297,14 +297,14 @@ static int rapl_pmu_event_add(struct perf_event *event, int mode) struct hw_perf_event *hwc = &event->hw; unsigned long flags; - spin_lock_irqsave(&pmu->lock, flags); + raw_spin_lock_irqsave(&pmu->lock, flags); hwc->state = 
PERF_HES_UPTODATE | PERF_HES_STOPPED; if (mode & PERF_EF_START) __rapl_pmu_event_start(pmu, event); - spin_unlock_irqrestore(&pmu->lock, flags); + raw_spin_unlock_irqrestore(&pmu->lock, flags); return 0; } @@ -567,7 +567,7 @@ static int rapl_cpu_prepare(int cpu) pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); if (!pmu) return -1; - spin_lock_init(&pmu->lock); + raw_spin_lock_init(&pmu->lock); INIT_LIST_HEAD(&pmu->active_list); -- cgit v0.10.2 From 8a6d2f8f73caa8b8eb596a9e2d2e0b15d64751a4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:25 +0000 Subject: perf/x86/intel/rapl: Utilize event->pmu_private Store the PMU pointer in event->pmu_private and use it instead of the per CPU data. Preparatory step to get rid of the per CPU allocations. The usage sites are the perf fast path, so we keep that even after the conversion to per package storage as a CPU to package lookup involves 3 loads versus 1 with the pmu_private pointer. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.748151799@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 2934999..753d90c 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -122,6 +122,7 @@ static struct perf_pmu_events_attr event_attr_##v = { \ struct rapl_pmu { raw_spinlock_t lock; int n_active; + int cpu; struct list_head active_list; struct pmu *pmu; ktime_t timer_interval; @@ -203,7 +204,7 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu) static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) { - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); struct perf_event *event; unsigned long flags; @@ -249,7 +250,7 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu, static void rapl_pmu_event_start(struct perf_event *event, int mode) { - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + struct rapl_pmu *pmu = event->pmu_private; unsigned long flags; raw_spin_lock_irqsave(&pmu->lock, flags); @@ -259,7 +260,7 @@ static void rapl_pmu_event_start(struct perf_event *event, int mode) static void rapl_pmu_event_stop(struct perf_event *event, int mode) { - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + struct rapl_pmu *pmu = event->pmu_private; struct hw_perf_event *hwc = &event->hw; unsigned long flags; @@ -293,7 +294,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) static int rapl_pmu_event_add(struct perf_event *event, int mode) { - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); + struct rapl_pmu *pmu = event->pmu_private; struct hw_perf_event *hwc = &event->hw; unsigned long flags; @@ -316,6 +317,7 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags) static int rapl_pmu_event_init(struct perf_event *event) { + struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); u64 cfg = event->attr.config & RAPL_EVENT_MASK; int bit, msr, ret = 0; @@ -327,6 +329,9 @@ static int rapl_pmu_event_init(struct perf_event *event) if (event->attr.config & ~RAPL_EVENT_MASK) return -EINVAL; + if (event->cpu < 0) + return -EINVAL; + /* * check event is known (determines counter) */ @@ -365,6 +370,8 @@ static int 
rapl_pmu_event_init(struct perf_event *event) return -EINVAL; /* must be done before validate_group */ + event->cpu = pmu->cpu; + event->pmu_private = pmu; event->hw.event_base = msr; event->hw.config = cfg; event->hw.idx = bit; @@ -572,6 +579,7 @@ static int rapl_cpu_prepare(int cpu) INIT_LIST_HEAD(&pmu->active_list); pmu->pmu = &rapl_pmu_class; + pmu->cpu = cpu; pmu->timer_interval = ms_to_ktime(rapl_timer_ms); -- cgit v0.10.2 From 9de8d686955b0e8e27847ed4edbbcd280f3fd853 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:26 +0000 Subject: perf/x86/intel/rapl: Convert it to a per package facility RAPL is a per package facility and we already have a mechanism for a dedicated per package reader. So there is no point to have multiple CPUs doing the same. The current implementation actually starts two timers on two CPUs if one does: perf stat -C1,2 -e -e power/energy-pkg .... which makes the whole concept of 1 reader per package moot. What's worse is that the above returns the double of the actual energy consumption, but that's a different problem to address and cannot be solved by removing the pointless per cpuness of that mechanism. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.845369524@linutronix.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 753d90c..019e541 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -129,15 +129,23 @@ struct rapl_pmu { struct hrtimer hrtimer; }; +struct rapl_pmus { + struct pmu pmu; + unsigned int maxpkg; + struct rapl_pmu *pmus[]; +}; + /* 1/2^hw_unit Joule */ static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; -static struct pmu rapl_pmu_class; +static struct rapl_pmus *rapl_pmus; static cpumask_t rapl_cpu_mask; -static int rapl_cntr_mask; +static unsigned int rapl_cntr_mask; static u64 rapl_timer_ms; -static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); -static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); +static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) +{ + return rapl_pmus->pmus[topology_logical_package_id(cpu)]; +} static inline u64 rapl_read_counter(struct perf_event *event) { @@ -317,12 +325,12 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags) static int rapl_pmu_event_init(struct perf_event *event) { - struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu); u64 cfg = event->attr.config & RAPL_EVENT_MASK; int bit, msr, ret = 0; + struct rapl_pmu *pmu; /* only look at RAPL events */ - if (event->attr.type != rapl_pmu_class.type) + if (event->attr.type != rapl_pmus->pmu.type) return -ENOENT; /* check only supported bits are set */ @@ -370,6 +378,7 @@ static int rapl_pmu_event_init(struct perf_event *event) return -EINVAL; /* must be done before validate_group */ + pmu = cpu_to_rapl_pmu(event->cpu); event->cpu = pmu->cpu; event->pmu_private = pmu; event->hw.event_base = msr; @@ -502,116 +511,62 @@ const struct attribute_group *rapl_attr_groups[] = { NULL, }; -static struct pmu rapl_pmu_class = { - .attr_groups = rapl_attr_groups, - .task_ctx_nr = perf_invalid_context, /* system-wide only */ - .event_init = rapl_pmu_event_init, - .add = rapl_pmu_event_add, /* must have */ - .del = rapl_pmu_event_del, /* 
must have */ - .start = rapl_pmu_event_start, - .stop = rapl_pmu_event_stop, - .read = rapl_pmu_event_read, -}; - static void rapl_cpu_exit(int cpu) { - struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); - int i, phys_id = topology_physical_package_id(cpu); - int target = -1; + struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); + int target; - /* find a new cpu on same package */ - for_each_online_cpu(i) { - if (i == cpu) - continue; - if (phys_id == topology_physical_package_id(i)) { - target = i; - break; - } - } - /* - * clear cpu from cpumask - * if was set in cpumask and still some cpu on package, - * then move to new cpu - */ - if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0) - cpumask_set_cpu(target, &rapl_cpu_mask); + /* Check if exiting cpu is used for collecting rapl events */ + if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask)) + return; - WARN_ON(cpumask_empty(&rapl_cpu_mask)); - /* - * migrate events and context to new cpu - */ - if (target >= 0) - perf_pmu_migrate_context(pmu->pmu, cpu, target); + pmu->cpu = -1; + /* Find a new cpu to collect rapl events */ + target = cpumask_any_but(topology_core_cpumask(cpu), cpu); - /* cancel overflow polling timer for CPU */ - hrtimer_cancel(&pmu->hrtimer); + /* Migrate rapl events to the new target */ + if (target < nr_cpu_ids) { + cpumask_set_cpu(target, &rapl_cpu_mask); + pmu->cpu = target; + perf_pmu_migrate_context(pmu->pmu, cpu, target); + } } static void rapl_cpu_init(int cpu) { - int i, phys_id = topology_physical_package_id(cpu); + struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); + int target; + + /* + * Check if there is an online cpu in the package which collects rapl + * events already. + */ + target = cpumask_any_and(&rapl_cpu_mask, topology_core_cpumask(cpu)); + if (target < nr_cpu_ids) + return; - /* check if phys_is is already covered */ - for_each_cpu(i, &rapl_cpu_mask) { - if (phys_id == topology_physical_package_id(i)) - return; - } - /* was not found, so add it */ cpumask_set_cpu(cpu, &rapl_cpu_mask); + pmu->cpu = cpu; } static int rapl_cpu_prepare(int cpu) { - struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); - int phys_id = topology_physical_package_id(cpu); + struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); if (pmu) return 0; - if (phys_id < 0) - return -1; - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); if (!pmu) - return -1; - raw_spin_lock_init(&pmu->lock); + return -ENOMEM; + raw_spin_lock_init(&pmu->lock); INIT_LIST_HEAD(&pmu->active_list); - - pmu->pmu = &rapl_pmu_class; - pmu->cpu = cpu; - + pmu->pmu = &rapl_pmus->pmu; pmu->timer_interval = ms_to_ktime(rapl_timer_ms); - + pmu->cpu = -1; rapl_hrtimer_init(pmu); - - /* set RAPL pmu for this cpu for now */ - per_cpu(rapl_pmu, cpu) = pmu; - per_cpu(rapl_pmu_to_free, cpu) = NULL; - - return 0; -} - -static void rapl_cpu_kfree(int cpu) -{ - struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu); - - kfree(pmu); - - per_cpu(rapl_pmu_to_free, cpu) = NULL; -} - -static int rapl_cpu_dying(int cpu) -{ - struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); - - if (!pmu) - return 0; - - per_cpu(rapl_pmu, cpu) = NULL; - - per_cpu(rapl_pmu_to_free, cpu) = pmu; - + rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu; return 0; } @@ -624,24 +579,16 @@ static int rapl_cpu_notifier(struct notifier_block *self, case CPU_UP_PREPARE: rapl_cpu_prepare(cpu); break; - case CPU_STARTING: - rapl_cpu_init(cpu); - break; - case CPU_UP_CANCELED: - case CPU_DYING: - rapl_cpu_dying(cpu); - break; + + case CPU_DOWN_FAILED: case CPU_ONLINE: - case CPU_DEAD: - 
rapl_cpu_kfree(cpu); + rapl_cpu_init(cpu); break; + case CPU_DOWN_PREPARE: rapl_cpu_exit(cpu); break; - default: - break; } - return NOTIFY_OK; } @@ -703,10 +650,14 @@ static void __init rapl_advertise(void) static int __init rapl_prepare_cpus(void) { - unsigned int cpu; + unsigned int cpu, pkg; int ret; for_each_online_cpu(cpu) { + pkg = topology_logical_package_id(cpu); + if (rapl_pmus->pmus[pkg]) + continue; + ret = rapl_cpu_prepare(cpu); if (ret) return ret; @@ -717,10 +668,33 @@ static int __init rapl_prepare_cpus(void) static void __init cleanup_rapl_pmus(void) { - int cpu; + int i; + + for (i = 0; i < rapl_pmus->maxpkg; i++) + kfree(rapl_pmus->pmus + i); + kfree(rapl_pmus); +} - for_each_online_cpu(cpu) - kfree(per_cpu(rapl_pmu, cpu)); +static int __init init_rapl_pmus(void) +{ + int maxpkg = topology_max_packages(); + size_t size; + + size = sizeof(*rapl_pmus) + maxpkg * sizeof(struct rapl_pmu *); + rapl_pmus = kzalloc(size, GFP_KERNEL); + if (!rapl_pmus) + return -ENOMEM; + + rapl_pmus->maxpkg = maxpkg; + rapl_pmus->pmu.attr_groups = rapl_attr_groups; + rapl_pmus->pmu.task_ctx_nr = perf_invalid_context; + rapl_pmus->pmu.event_init = rapl_pmu_event_init; + rapl_pmus->pmu.add = rapl_pmu_event_add; + rapl_pmus->pmu.del = rapl_pmu_event_del; + rapl_pmus->pmu.start = rapl_pmu_event_start; + rapl_pmus->pmu.stop = rapl_pmu_event_stop; + rapl_pmus->pmu.read = rapl_pmu_event_read; + return 0; } static const struct x86_cpu_id rapl_cpu_match[] __initconst = { @@ -771,13 +745,17 @@ static int __init rapl_pmu_init(void) if (ret) return ret; + ret = init_rapl_pmus(); + if (ret) + return ret; + cpu_notifier_register_begin(); ret = rapl_prepare_cpus(); if (ret) goto out; - ret = perf_pmu_register(&rapl_pmu_class, "power", -1); + ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); if (ret) goto out; -- cgit v0.10.2 From 675965b00d734c985e4285f5bec7e524d15fc4e1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2016 22:19:27 +0000 Subject: perf: Export perf_event_sysfs_show() Required to use it in modular perf drivers. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160222221012.930735780@linutronix.de Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index df5589f..5dcc0bd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9446,6 +9446,7 @@ ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, return 0; } +EXPORT_SYMBOL_GPL(perf_event_sysfs_show); static int __init perf_event_sysfs_init(void) { -- cgit v0.10.2 From 7400d3bbaa229eb8e7631d28fb34afd7cd2c96ff Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 15 Jan 2016 16:07:49 +0900 Subject: sched/fair: Avoid using decay_load_missed() with a negative value decay_load_missed() cannot handle nagative values, so we need to prevent using the function with a negative value. Reported-by: Dietmar Eggemann Signed-off-by: Byungchul Park Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E . 
McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: perterz@infradead.org Fixes: 59543275488d ("sched/fair: Prepare __update_cpu_load() to handle active tickless") Link: http://lkml.kernel.org/r/20160115070749.GA1914@X58A-UD3R Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5641042..3b3dc39 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4509,9 +4509,17 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, /* scale is effectively 1 << i now, and >> i divides by scale */ - old_load = this_rq->cpu_load[i] - tickless_load; + old_load = this_rq->cpu_load[i]; old_load = decay_load_missed(old_load, pending_updates - 1, i); - old_load += tickless_load; + if (tickless_load) { + old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); + /* + * old_load can never be a negative value because a + * decayed tickless_load cannot be greater than the + * original tickless_load. + */ + old_load += tickless_load; + } new_load = this_load; /* * Round up the averaging division if load is increasing. This -- cgit v0.10.2 From be68a682c00dfa7733021e1da386ba7182fc7bb9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 13 Jan 2016 17:01:29 +0100 Subject: sched/fair: Consolidate nohz CPU load update code Lets factorize a bit of code there. We'll even have a third user soon. While at it, standardize the idle update function name against the others. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Acked-by: Byungchul Park Cc: Chris Metcalf Cc: Christoph Lameter Cc: Linus Torvalds Cc: Luiz Capitulino Cc: Mike Galbraith Cc: Paul E . McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1452700891-21807-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3b3dc39..3313052 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4542,6 +4542,25 @@ static unsigned long weighted_cpuload(const int cpu) } #ifdef CONFIG_NO_HZ_COMMON +static void __update_cpu_load_nohz(struct rq *this_rq, + unsigned long curr_jiffies, + unsigned long load, + int active) +{ + unsigned long pending_updates; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * In the regular NOHZ case, we were idle, this means load 0. + * In the NOHZ_FULL case, we were non-idle, we should consider + * its weighted load. + */ + __update_cpu_load(this_rq, load, pending_updates, active); + } +} + /* * There is no sane way to deal with nohz on smp when using jiffies because the * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading @@ -4559,22 +4578,15 @@ static unsigned long weighted_cpuload(const int cpu) * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ -static void update_idle_cpu_load(struct rq *this_rq) +static void update_cpu_load_idle(struct rq *this_rq) { - unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = weighted_cpuload(cpu_of(this_rq)); - unsigned long pending_updates; - /* * bail if there's load or we're actually up-to-date. 
*/ - if (load || curr_jiffies == this_rq->last_load_update_tick) + if (weighted_cpuload(cpu_of(this_rq))) return; - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - __update_cpu_load(this_rq, load, pending_updates, 0); + __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0); } /* @@ -4585,22 +4597,12 @@ void update_cpu_load_nohz(int active) struct rq *this_rq = this_rq(); unsigned long curr_jiffies = READ_ONCE(jiffies); unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; - unsigned long pending_updates; if (curr_jiffies == this_rq->last_load_update_tick) return; raw_spin_lock(&this_rq->lock); - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * In the regular NOHZ case, we were idle, this means load 0. - * In the NOHZ_FULL case, we were non-idle, we should consider - * its weighted load. - */ - __update_cpu_load(this_rq, load, pending_updates, active); - } + __update_cpu_load_nohz(this_rq, curr_jiffies, load, active); raw_spin_unlock(&this_rq->lock); } #endif /* CONFIG_NO_HZ */ @@ -4612,7 +4614,7 @@ void update_cpu_load_active(struct rq *this_rq) { unsigned long load = weighted_cpuload(cpu_of(this_rq)); /* - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + * See the mess around update_cpu_load_idle() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; __update_cpu_load(this_rq, load, 1, 1); @@ -7906,7 +7908,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (time_after_eq(jiffies, rq->next_balance)) { raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); - update_idle_cpu_load(rq); + update_cpu_load_idle(rq); raw_spin_unlock_irq(&rq->lock); rebalance_domains(rq, CPU_IDLE); } -- cgit v0.10.2 From 41d93397334f3c3374810f45e7bcce9007d1a7bb Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Wed, 13 Jan 2016 16:42:38 +0800 Subject: sched/core: Remove duplicated sched_group_set_shares() prototype Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1452674558-31897-1-git-send-email-yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 91f0a77..3dc7b8b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -318,7 +318,6 @@ extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, struct sched_entity *parent); extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); -extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); -- cgit v0.10.2 From ff77e468535987b3d21b7bd4da15608ea3ce7d0b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Jan 2016 15:27:07 +0100 Subject: sched/rt: Fix PI handling vs. 
sched_setscheduler() Andrea Parri reported: > I found that the following scenario (with CONFIG_RT_GROUP_SCHED=y) is not > handled correctly: > > T1 (prio = 20) > lock(rtmutex); > > T2 (prio = 20) > blocks on rtmutex (rt_nr_boosted = 0 on T1's rq) > > T1 (prio = 20) > sys_set_scheduler(prio = 0) > [new_effective_prio == oldprio] > T1 prio = 20 (rt_nr_boosted = 0 on T1's rq) > > The last step is incorrect as T1 is now boosted (c.f., rt_se_boosted()); > in particular, if we continue with > > T1 (prio = 20) > unlock(rtmutex) > wakeup(T2) > adjust_prio(T1) > [prio != rt_mutex_getprio(T1)] > dequeue(T1) > rt_nr_boosted = (unsigned long)(-1) > ... > T1 prio = 0 > > then we end up leaving rt_nr_boosted in an "inconsistent" state. > > The simple program attached could reproduce the previous scenario; note > that, as a consequence of the presence of this state, the "assertion" > > WARN_ON(!rt_nr_running && rt_nr_boosted) > > from dec_rt_group() may trigger. So normally we dequeue/enqueue tasks in sched_setscheduler(), which would ensure the accounting stays correct. However in the early PI path we fail to do so. So this was introduced at around v3.14, by: c365c292d059 ("sched: Consider pi boosting in setscheduler()") which fixed another problem exactly because that dequeue/enqueue, joy. Fix this by teaching rt about DEQUEUE_SAVE/ENQUEUE_RESTORE and have it preserve runqueue location with that option. This requires decoupling the on_rt_rq() state from being on the list. In order to allow for explicit movement during the SAVE/RESTORE, introduce {DE,EN}QUEUE_MOVE. We still must use SAVE/RESTORE in these cases to preserve other invariants. Respecting the SAVE/RESTORE flags also has the (nice) side-effect that things like sys_nice()/sys_sched_setaffinity() also do not reorder FIFO tasks (whereas they used to before this patch). 
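To make the new flag semantics concrete, here is a small stand-alone check of the SAVE/MOVE combination (the two flag values match the ones the patch defines; everything else is illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    #define DEQUEUE_SAVE 0x02       /* matches ENQUEUE_RESTORE */
    #define DEQUEUE_MOVE 0x04       /* matches ENQUEUE_MOVE    */

    /* A dequeue that is SAVE but not MOVE must leave the task's slot in the
     * priority list untouched; every other combination may relocate it. */
    static bool moves_entity(unsigned int flags)
    {
            return (flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE;
    }

    int main(void)
    {
            printf("plain dequeue -> move=%d\n", moves_entity(0));
            printf("save only     -> move=%d\n", moves_entity(DEQUEUE_SAVE));
            printf("save + move   -> move=%d\n", moves_entity(DEQUEUE_SAVE | DEQUEUE_MOVE));
            return 0;
    }

This mirrors the move_entity() helper added below: a spurious SAVE/RESTORE pair preserves FIFO ordering, while an explicit MOVE (or a plain dequeue) is allowed to requeue the task.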
Reported-by: Andrea Parri Tested-by: Andrea Parri Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index a292c4b..87e5f98 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1293,6 +1293,8 @@ struct sched_rt_entity { unsigned long timeout; unsigned long watchdog_stamp; unsigned int time_slice; + unsigned short on_rq; + unsigned short on_list; struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 291552b..bb565b4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2221,6 +2221,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); + p->rt.timeout = 0; + p->rt.time_slice = sched_rr_timeslice; + p->rt.on_rq = 0; + p->rt.on_list = 0; #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -3531,7 +3535,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; + int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; struct rq *rq; const struct sched_class *prev_class; @@ -3559,11 +3563,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; + + if (oldprio == prio) + queue_flag &= ~DEQUEUE_MOVE; + prev_class = p->sched_class; queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flag); if (running) put_prev_task(rq, p); @@ -3581,7 +3589,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; - enqueue_flag |= ENQUEUE_REPLENISH; + queue_flag |= ENQUEUE_REPLENISH; } else p->dl.dl_boosted = 0; p->sched_class = &dl_sched_class; @@ -3589,7 +3597,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (dl_prio(oldprio)) p->dl.dl_boosted = 0; if (oldprio < prio) - enqueue_flag |= ENQUEUE_HEAD; + queue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) @@ -3604,7 +3612,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, enqueue_flag); + enqueue_task(rq, p, queue_flag); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -3960,6 +3968,7 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_class *prev_class; struct rq *rq; int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -4142,17 +4151,14 @@ change: * itself. 
*/ new_effective_prio = rt_mutex_get_effective_prio(p, newprio); - if (new_effective_prio == oldprio) { - __setscheduler_params(p, attr); - task_rq_unlock(rq, p, &flags); - return 0; - } + if (new_effective_prio == oldprio) + queue_flags &= ~DEQUEUE_MOVE; } queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flags); if (running) put_prev_task(rq, p); @@ -4162,15 +4168,14 @@ change: if (running) p->sched_class->set_curr_task(rq); if (queued) { - int enqueue_flags = ENQUEUE_RESTORE; /* * We enqueue to tail when the priority of a task is * increased (user space view). */ - if (oldprio <= p->prio) - enqueue_flags |= ENQUEUE_HEAD; + if (oldprio < p->prio) + queue_flags |= ENQUEUE_HEAD; - enqueue_task(rq, p, enqueue_flags); + enqueue_task(rq, p, queue_flags); } check_class_changed(rq, p, prev_class, oldprio); @@ -7958,7 +7963,7 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE); + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); if (unlikely(running)) put_prev_task(rq, tsk); @@ -7982,7 +7987,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, tsk, ENQUEUE_RESTORE); + enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); task_rq_unlock(rq, tsk, &flags); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86ab..406a9c2 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -436,7 +436,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq); static inline int on_rt_rq(struct sched_rt_entity *rt_se) { - return !list_empty(&rt_se->run_list); + return rt_se->on_rq; } #ifdef CONFIG_RT_GROUP_SCHED @@ -482,8 +482,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) return rt_se->my_q; } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); -static void dequeue_rt_entity(struct sched_rt_entity *rt_se); +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { @@ -499,7 +499,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) if (!rt_se) enqueue_top_rt_rq(rt_rq); else if (!on_rt_rq(rt_se)) - enqueue_rt_entity(rt_se, false); + enqueue_rt_entity(rt_se, 0); if (rt_rq->highest_prio.curr < curr->prio) resched_curr(rq); @@ -516,7 +516,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) if (!rt_se) dequeue_top_rt_rq(rt_rq); else if (on_rt_rq(rt_se)) - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, 0); } static inline int rt_rq_throttled(struct rt_rq *rt_rq) @@ -1166,7 +1166,30 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) dec_rt_group(rt_se, rt_rq); } -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +/* + * Change rt_se->run_list location unless SAVE && !MOVE + * + * assumes ENQUEUE/DEQUEUE flags match + */ +static inline bool move_entity(unsigned int flags) +{ + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + return false; + + return true; +} + +static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) +{ + list_del_init(&rt_se->run_list); + + if (list_empty(array->queue + rt_se_prio(rt_se))) + __clear_bit(rt_se_prio(rt_se), array->bitmap); + + rt_se->on_list = 0; +} + +static void __enqueue_rt_entity(struct sched_rt_entity 
*rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; @@ -1179,26 +1202,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) * get throttled and the current group doesn't have any other * active members. */ - if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) { + if (rt_se->on_list) + __delist_rt_entity(rt_se, array); return; + } - if (head) - list_add(&rt_se->run_list, queue); - else - list_add_tail(&rt_se->run_list, queue); - __set_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(rt_se->on_list); + if (flags & ENQUEUE_HEAD) + list_add(&rt_se->run_list, queue); + else + list_add_tail(&rt_se->run_list, queue); + + __set_bit(rt_se_prio(rt_se), array->bitmap); + rt_se->on_list = 1; + } + rt_se->on_rq = 1; inc_rt_tasks(rt_se, rt_rq); } -static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; - list_del_init(&rt_se->run_list); - if (list_empty(array->queue + rt_se_prio(rt_se))) - __clear_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(!rt_se->on_list); + __delist_rt_entity(rt_se, array); + } + rt_se->on_rq = 0; dec_rt_tasks(rt_se, rt_rq); } @@ -1207,7 +1241,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) * Because the prio of an upper entry depends on the lower * entries, we must remove entries top - down. */ -static void dequeue_rt_stack(struct sched_rt_entity *rt_se) +static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) { struct sched_rt_entity *back = NULL; @@ -1220,31 +1254,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) - __dequeue_rt_entity(rt_se); + __dequeue_rt_entity(rt_se, flags); } } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) - __enqueue_rt_entity(rt_se, head); + __enqueue_rt_entity(rt_se, flags); enqueue_top_rt_rq(&rq->rt); } -static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq && rt_rq->rt_nr_running) - __enqueue_rt_entity(rt_se, false); + __enqueue_rt_entity(rt_se, flags); } enqueue_top_rt_rq(&rq->rt); } @@ -1260,7 +1294,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; - enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + enqueue_rt_entity(rt_se, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -1271,7 +1305,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, flags); dequeue_pushable_task(rq, p); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 
3dc7b8b..d3606e4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1130,18 +1130,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) extern const int sched_prio_to_weight[40]; extern const u32 sched_prio_to_wmult[40]; +/* + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. + * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_WAKING - sched_class::task_waking was called + * + */ + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ + #define ENQUEUE_WAKEUP 0x01 -#define ENQUEUE_HEAD 0x02 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 + +#define ENQUEUE_HEAD 0x08 +#define ENQUEUE_REPLENISH 0x10 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ +#define ENQUEUE_WAKING 0x20 #else #define ENQUEUE_WAKING 0x00 #endif -#define ENQUEUE_REPLENISH 0x08 -#define ENQUEUE_RESTORE 0x10 - -#define DEQUEUE_SLEEP 0x01 -#define DEQUEUE_SAVE 0x02 #define RETRY_TASK ((void *)-1UL) -- cgit v0.10.2 From d6ca41d7922ce0110a840ef4f8ec4afdd5a239d3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 22 Feb 2016 16:26:50 -0500 Subject: sched/debug: Move the /sys/kernel/debug/sched_features file setup into debug.c As /sys/kernel/debug/sched_features is only created when SCHED_DEBUG is enabled, and the file debug.c is only compiled when SCHED_DEBUG is enabled, it makes sense to move sched_feature setup into that file and get rid of the #ifdef. 
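Illustrative aside: the code being moved builds its feature table with the SCHED_FEAT() x-macro trick, in which features.h is included twice with a different SCHED_FEAT() definition each time, once to produce the bit indices and once to produce the name strings. A stand-alone sketch of that pattern follows; the list macro and feature names here are made up, and a single list macro stands in for the kernel's separate features.h include.

	/* Illustrative x-macro pattern; MY_FEATURES, FEAT_A and FEAT_B are made up. */
	#include <stdio.h>

	#define MY_FEATURES(F)		\
		F(FEAT_A, true)		\
		F(FEAT_B, false)

	#define F_ENUM(name, enabled)	__FEAT_##name,
	enum { MY_FEATURES(F_ENUM) __FEAT_NR };	/* one bit index per feature */
	#undef F_ENUM

	#define F_NAME(name, enabled)	#name,
	static const char * const feat_names[] = { MY_FEATURES(F_NAME) };
	#undef F_NAME

	int main(void)
	{
		int i;

		for (i = 0; i < __FEAT_NR; i++)
			printf("%d %s\n", i, feat_names[i]);
		return 0;
	}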
Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Clark Williams Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160222212825.464193063@goodmis.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bb565b4..ffaaeb8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -66,7 +66,6 @@ #include #include #include -#include #include #include #include @@ -124,138 +123,6 @@ const_debug unsigned int sysctl_sched_features = #undef SCHED_FEAT -#ifdef CONFIG_SCHED_DEBUG -#define SCHED_FEAT(name, enabled) \ - #name , - -static const char * const sched_feat_names[] = { -#include "features.h" -}; - -#undef SCHED_FEAT - -static int sched_feat_show(struct seq_file *m, void *v) -{ - int i; - - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (!(sysctl_sched_features & (1UL << i))) - seq_puts(m, "NO_"); - seq_printf(m, "%s ", sched_feat_names[i]); - } - seq_puts(m, "\n"); - - return 0; -} - -#ifdef HAVE_JUMP_LABEL - -#define jump_label_key__true STATIC_KEY_INIT_TRUE -#define jump_label_key__false STATIC_KEY_INIT_FALSE - -#define SCHED_FEAT(name, enabled) \ - jump_label_key__##enabled , - -struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { -#include "features.h" -}; - -#undef SCHED_FEAT - -static void sched_feat_disable(int i) -{ - static_key_disable(&sched_feat_keys[i]); -} - -static void sched_feat_enable(int i) -{ - static_key_enable(&sched_feat_keys[i]); -} -#else -static void sched_feat_disable(int i) { }; -static void sched_feat_enable(int i) { }; -#endif /* HAVE_JUMP_LABEL */ - -static int sched_feat_set(char *cmp) -{ - int i; - int neg = 0; - - if (strncmp(cmp, "NO_", 3) == 0) { - neg = 1; - cmp += 3; - } - - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (strcmp(cmp, sched_feat_names[i]) == 0) { - if (neg) { - sysctl_sched_features &= ~(1UL << i); - sched_feat_disable(i); - } else { - sysctl_sched_features |= (1UL << i); - sched_feat_enable(i); - } - break; - } - } - - return i; -} - -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - char *cmp; - int i; - struct inode *inode; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); - - /* Ensure the static_key remains in a consistent state */ - inode = file_inode(filp); - inode_lock(inode); - i = sched_feat_set(cmp); - inode_unlock(inode); - if (i == __SCHED_FEAT_NR) - return -EINVAL; - - *ppos += cnt; - - return cnt; -} - -static int sched_feat_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, sched_feat_show, NULL); -} - -static const struct file_operations sched_feat_fops = { - .open = sched_feat_open, - .write = sched_feat_write, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static __init int sched_init_debug(void) -{ - debugfs_create_file("sched_features", 0644, NULL, NULL, - &sched_feat_fops); - - return 0; -} -late_initcall(sched_init_debug); -#endif /* CONFIG_SCHED_DEBUG */ - /* * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7cfa87b..d2dedfc 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "sched.h" @@ -58,6 +59,136 @@ static unsigned long nsec_low(unsigned long long nsec) #define SPLIT_NS(x) nsec_high(x), nsec_low(x) +#define SCHED_FEAT(name, enabled) \ + #name , + +static const char * const sched_feat_names[] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static int sched_feat_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0; i < __SCHED_FEAT_NR; i++) { + if (!(sysctl_sched_features & (1UL << i))) + seq_puts(m, "NO_"); + seq_printf(m, "%s ", sched_feat_names[i]); + } + seq_puts(m, "\n"); + + return 0; +} + +#ifdef HAVE_JUMP_LABEL + +#define jump_label_key__true STATIC_KEY_INIT_TRUE +#define jump_label_key__false STATIC_KEY_INIT_FALSE + +#define SCHED_FEAT(name, enabled) \ + jump_label_key__##enabled , + +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static void sched_feat_disable(int i) +{ + static_key_disable(&sched_feat_keys[i]); +} + +static void sched_feat_enable(int i) +{ + static_key_enable(&sched_feat_keys[i]); +} +#else +static void sched_feat_disable(int i) { }; +static void sched_feat_enable(int i) { }; +#endif /* HAVE_JUMP_LABEL */ + +static int sched_feat_set(char *cmp) +{ + int i; + int neg = 0; + + if (strncmp(cmp, "NO_", 3) == 0) { + neg = 1; + cmp += 3; + } + + for (i = 0; i < __SCHED_FEAT_NR; i++) { + if (strcmp(cmp, sched_feat_names[i]) == 0) { + if (neg) { + sysctl_sched_features &= ~(1UL << i); + sched_feat_disable(i); + } else { + sysctl_sched_features |= (1UL << i); + sched_feat_enable(i); + } + break; + } + } + + return i; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp; + int i; + struct inode *inode; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); + + /* Ensure the static_key remains in a consistent state */ + inode = file_inode(filp); + inode_lock(inode); + i = sched_feat_set(cmp); + inode_unlock(inode); + if (i == __SCHED_FEAT_NR) + return -EINVAL; + + *ppos += cnt; + + return cnt; +} + +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_feat_show, NULL); +} + +static const struct file_operations sched_feat_fops = { + .open = sched_feat_open, + .write = sched_feat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init int sched_init_debug(void) +{ + debugfs_create_file("sched_features", 0644, NULL, NULL, + &sched_feat_fops); + + return 0; +} +late_initcall(sched_init_debug); + #ifdef CONFIG_FAIR_GROUP_SCHED static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { -- cgit v0.10.2 From 3866e845ed522258c77da2eaa9f849ef55206ca2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 22 Feb 2016 16:26:51 -0500 Subject: sched/debug: Move sched_domain_sysctl to debug.c The sched_domain_sysctl setup is only enabled when SCHED_DEBUG is configured. As debug.c is only compiled when SCHED_DEBUG is configured as well, move the setup of sched_domain_sysctl into that file. Note, the (un)register_sched_domain_sysctl() functions had to be changed from static to allow access to them from core.c. 
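Illustrative aside: making the register/unregister functions non-static goes hand in hand with the usual header pattern of providing empty inline stubs when the debug config is off, so callers in core.c need no #ifdef of their own. A minimal stand-alone sketch of that pattern, with made-up CONFIG_FOO_DEBUG and function names:

	/* Illustrative only: shows the stub pattern, not the kernel's actual header. */
	#include <stdio.h>

	/* #define CONFIG_FOO_DEBUG 1 */	/* flip to use the real versions */

	#ifdef CONFIG_FOO_DEBUG
	void register_foo_sysctl(void);		/* real implementation lives elsewhere */
	void unregister_foo_sysctl(void);
	#else
	static inline void register_foo_sysctl(void) { }
	static inline void unregister_foo_sysctl(void) { }
	#endif

	int main(void)
	{
		register_foo_sysctl();		/* callers build either way, no #ifdef needed */
		unregister_foo_sysctl();
		printf("callers build either way\n");
		return 0;
	}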
Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Clark Williams Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160222212825.599278093@goodmis.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ffaaeb8..6dbb21f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -70,7 +70,6 @@ #include #include #include -#include #include #include @@ -5342,183 +5341,6 @@ static void migrate_tasks(struct rq *dead_rq) } #endif /* CONFIG_HOTPLUG_CPU */ -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) - -static struct ctl_table sd_ctl_dir[] = { - { - .procname = "sched_domain", - .mode = 0555, - }, - {} -}; - -static struct ctl_table sd_ctl_root[] = { - { - .procname = "kernel", - .mode = 0555, - .child = sd_ctl_dir, - }, - {} -}; - -static struct ctl_table *sd_alloc_ctl_entry(int n) -{ - struct ctl_table *entry = - kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); - - return entry; -} - -static void sd_free_ctl_entry(struct ctl_table **tablep) -{ - struct ctl_table *entry; - - /* - * In the intermediate directories, both the child directory and - * procname are dynamically allocated and could fail but the mode - * will always be set. In the lowest directory the names are - * static strings and all have proc handlers. - */ - for (entry = *tablep; entry->mode; entry++) { - if (entry->child) - sd_free_ctl_entry(&entry->child); - if (entry->proc_handler == NULL) - kfree(entry->procname); - } - - kfree(*tablep); - *tablep = NULL; -} - -static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX-1; - -static void -set_table_entry(struct ctl_table *entry, - const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler, - bool load_idx) -{ - entry->procname = procname; - entry->data = data; - entry->maxlen = maxlen; - entry->mode = mode; - entry->proc_handler = proc_handler; - - if (load_idx) { - entry->extra1 = &min_load_idx; - entry->extra2 = &max_load_idx; - } -} - -static struct ctl_table * -sd_alloc_ctl_domain_table(struct sched_domain *sd) -{ - struct ctl_table *table = sd_alloc_ctl_entry(14); - - if (table == NULL) - return NULL; - - set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[9], "cache_nice_tries", - &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], 
"max_newidle_lb_cost", - &sd->max_newidle_lb_cost, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[12], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[13] is terminator */ - - return table; -} - -static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) -{ - struct ctl_table *entry, *table; - struct sched_domain *sd; - int domain_num = 0, i; - char buf[32]; - - for_each_domain(cpu, sd) - domain_num++; - entry = table = sd_alloc_ctl_entry(domain_num + 1); - if (table == NULL) - return NULL; - - i = 0; - for_each_domain(cpu, sd) { - snprintf(buf, 32, "domain%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_domain_table(sd); - entry++; - i++; - } - return table; -} - -static struct ctl_table_header *sd_sysctl_header; -static void register_sched_domain_sysctl(void) -{ - int i, cpu_num = num_possible_cpus(); - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); - char buf[32]; - - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = entry; - - if (entry == NULL) - return; - - for_each_possible_cpu(i) { - snprintf(buf, 32, "cpu%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_cpu_table(i); - entry++; - } - - WARN_ON(sd_sysctl_header); - sd_sysctl_header = register_sysctl_table(sd_ctl_root); -} - -/* may be called multiple times per register */ -static void unregister_sched_domain_sysctl(void) -{ - unregister_sysctl_table(sd_sysctl_header); - sd_sysctl_header = NULL; - if (sd_ctl_dir[0].child) - sd_free_ctl_entry(&sd_ctl_dir[0].child); -} -#else -static void register_sched_domain_sysctl(void) -{ -} -static void unregister_sched_domain_sysctl(void) -{ -} -#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */ - static void set_rq_online(struct rq *rq) { if (!rq->online) { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index d2dedfc..313c65f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -189,6 +189,179 @@ static __init int sched_init_debug(void) } late_initcall(sched_init_debug); +#ifdef CONFIG_SMP + +#ifdef CONFIG_SYSCTL + +static struct ctl_table sd_ctl_dir[] = { + { + .procname = "sched_domain", + .mode = 0555, + }, + {} +}; + +static struct ctl_table sd_ctl_root[] = { + { + .procname = "kernel", + .mode = 0555, + .child = sd_ctl_dir, + }, + {} +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); + + return entry; +} + +static void sd_free_ctl_entry(struct ctl_table **tablep) +{ + struct ctl_table *entry; + + /* + * In the intermediate directories, both the child directory and + * procname are dynamically allocated and could fail but the mode + * will always be set. In the lowest directory the names are + * static strings and all have proc handlers. 
+ */ + for (entry = *tablep; entry->mode; entry++) { + if (entry->child) + sd_free_ctl_entry(&entry->child); + if (entry->proc_handler == NULL) + kfree(entry->procname); + } + + kfree(*tablep); + *tablep = NULL; +} + +static int min_load_idx = 0; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; + +static void +set_table_entry(struct ctl_table *entry, + const char *procname, void *data, int maxlen, + umode_t mode, proc_handler *proc_handler, + bool load_idx) +{ + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; + + if (load_idx) { + entry->extra1 = &min_load_idx; + entry->extra2 = &max_load_idx; + } +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table = sd_alloc_ctl_entry(14); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[1], "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[9], "cache_nice_tries", + &sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[10], "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[11], "max_newidle_lb_cost", + &sd->max_newidle_lb_cost, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[12], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring, false); + /* &table[13] is terminator */ + + return table; +} + +static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct ctl_table *entry, *table; + struct sched_domain *sd; + int domain_num = 0, i; + char buf[32]; + + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + if (table == NULL) + return NULL; + + i = 0; + for_each_domain(cpu, sd) { + snprintf(buf, 32, "domain%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +void register_sched_domain_sysctl(void) +{ + int i, cpu_num = num_possible_cpus(); + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + char buf[32]; + + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = entry; + + if (entry == NULL) + return; + + for_each_possible_cpu(i) { + snprintf(buf, 32, "cpu%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_cpu_table(i); + entry++; + } + + WARN_ON(sd_sysctl_header); + sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} + +/* may be called multiple 
times per register */ +void unregister_sched_domain_sysctl(void) +{ + unregister_sysctl_table(sd_sysctl_header); + sd_sysctl_header = NULL; + if (sd_ctl_dir[0].child) + sd_free_ctl_entry(&sd_ctl_dir[0].child); +} +#endif /* CONFIG_SYSCTL */ +#endif /* CONFIG_SMP */ + #ifdef CONFIG_FAIR_GROUP_SCHED static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d3606e4..ef5875f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -908,6 +909,18 @@ static inline unsigned int group_first_cpu(struct sched_group *group) extern int group_balance_cpu(struct sched_group *sg); +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) +void register_sched_domain_sysctl(void); +void unregister_sched_domain_sysctl(void); +#else +static inline void register_sched_domain_sysctl(void) +{ +} +static inline void unregister_sched_domain_sysctl(void) +{ +} +#endif + #else static inline void sched_ttwu_pending(void) { } -- cgit v0.10.2 From ef477183d06b0aa41c9e7c02cf5bfec41536e2c4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 22 Feb 2016 16:26:52 -0500 Subject: sched/debug: Add deadline scheduler bandwidth ratio to /proc/sched_debug Playing with SCHED_DEADLINE and cpusets, I found that I was unable to create new SCHED_DEADLINE tasks, with the error of EBUSY as if the bandwidth was already used up. I then realized there was no way to see what bandwidth is used by the runqueues to debug the issue. By adding the dl_bw->bw and dl_bw->total_bw to the output of the deadline info in /proc/sched_debug, this allows us to see what bandwidth has been reserved and where a problem may exist. For example, before the issue we see the ratio of the bandwidth: # cat /proc/sys/kernel/sched_rt_runtime_us 950000 # cat /proc/sys/kernel/sched_rt_period_us 1000000 # grep dl /proc/sched_debug dl_rq[0]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 0 dl_rq[1]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 0 dl_rq[2]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 0 dl_rq[3]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 0 dl_rq[4]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 0 dl_rq[5]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 0 dl_rq[6]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 0 dl_rq[7]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 0 Note: (950000 / 1000000) << 20 == 996147 After I played with cpusets and hit the issue, the result is now: # grep dl /proc/sched_debug dl_rq[0]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : -104857 dl_rq[1]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 104857 dl_rq[2]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 104857 dl_rq[3]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : 104857 dl_rq[4]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : -104857 dl_rq[5]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : -104857 dl_rq[6]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : -104857 dl_rq[7]: .dl_nr_running : 0 .dl_bw->bw : 996147 .dl_bw->total_bw : -104857 This shows that there is definitely a problem as we should never have a negative total bandwidth. 
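Illustrative aside: the 996147 figure is the runtime/period ratio expressed in 20-bit fixed point, i.e. (runtime << 20) / period. A quick userspace sketch of that arithmetic follows; this to_ratio() is a stand-alone illustration of the math, not the kernel's helper.

	#include <stdio.h>
	#include <stdint.h>

	/* Mirrors the "Note: (950000 / 1000000) << 20 == 996147" remark above. */
	static uint64_t to_ratio(uint64_t period_us, uint64_t runtime_us)
	{
		return (runtime_us << 20) / period_us;
	}

	int main(void)
	{
		/* 95% of the CPU in 20-bit fixed point: prints 996147 */
		printf("%llu\n", (unsigned long long)to_ratio(1000000, 950000));
		return 0;
	}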
Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Clark Williams Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160222212825.756849091@goodmis.org Signed-off-by: Ingo Molnar diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 313c65f..4fbc3bd 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -566,8 +566,17 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) { + struct dl_bw *dl_bw; + SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); +#ifdef CONFIG_SMP + dl_bw = &cpu_rq(cpu)->rd->dl_bw; +#else + dl_bw = &dl_rq->dl_bw; +#endif + SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); + SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); } extern __read_mostly int sched_clock_running; -- cgit v0.10.2 From c3a990dc9fab590fb88608410f1cc2bc866bdf30 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Feb 2016 18:37:46 -0500 Subject: sched/rt: Kick RT bandwidth timer immediately on start up I've been debugging why deadline tasks can cause the RT scheduler to throttle, even when the deadline tasks are only taking up 50% of the CPU and RT tasks are not even using 1% of the CPU. Here's what I found. In order to keep a CPU from being hogged by RT tasks, the deadline scheduler adds its run time (delta_exec) to the rt_time of the RT bandwidth. That way, if the two use more than 95% of the CPU within one second (default settings), the RT tasks are throttled to allow non-RT tasks to run. Although the deadline tasks add their run time to the RT bandwidth, it lets the RT tasks do the accounting. This is where the problem lies. If a deadline task runs for a bit, and no RT tasks are running, then it will continually add to the RT rt_time that is used to calculate how much CPU the RT tasks use. But no RT period is in play, and this accumulation of the runtime never gets reset. When an RT task finally gets to run, and the watchdog goes off, it can see that the RT task has used more than it should have, because the deadline task added all this runtime to its rt_time. Then the RT task that just woke up gets throttled for no good reason. I also noticed that when an RT task is queued, it starts the timer to account for overload and such. But that timer goes off one period later, which may be too late and the extra rt_time will trigger a throttle. This is a quick workaround to the problem. When a new RT task is queued, the bandwidth timer is set to go off immediately. Then the timer can clear out the extra time added to the rt_time while there was no RT task running. This stops my tests from triggering the throttle, and it will still throttle if an RT task runs too much, even while a deadline task is running. A better solution may be to subtract the bandwidth that the deadline task uses from the rt_runtime, and add it back when it's finished. Then there won't be a need for runtime tracking of the time used by deadline tasks. I may play with that solution tomorrow. 
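Illustrative aside: a toy model of the stale rt_time problem described above, purely for illustration. The real period timer refills rt_runtime per period rather than simply zeroing rt_time, and the 950 ms constant is just the sched_rt_runtime_us default mentioned earlier.

	#include <stdio.h>

	#define RT_RUNTIME_NS	950000000ULL	/* sched_rt_runtime_us default, in ns */

	static unsigned long long rt_time;	/* runtime charged against the RT bandwidth */

	/* Deadline tasks charge their runtime to the RT bandwidth... */
	static void dl_charge(unsigned long long delta_ns)
	{
		rt_time += delta_ns;
	}

	/* ...but only the RT period timer ever clears it (simplified here). */
	static void rt_period_timer(void)
	{
		rt_time = 0;
	}

	int main(void)
	{
		int i;

		/* A DL task runs at 50% CPU for 3 s with no RT task queued,
		 * so the period timer is not running and nothing resets rt_time. */
		for (i = 0; i < 3000; i++)
			dl_charge(500000);	/* 0.5 ms charged per 1 ms tick */

		printf("RT task wakes: throttled=%d (stale rt_time=%llu ns)\n",
		       rt_time > RT_RUNTIME_NS, rt_time);

		/* The fix: kick the period timer immediately on enqueue. */
		rt_period_timer();
		printf("after immediate timer kick: throttled=%d\n",
		       rt_time > RT_RUNTIME_NS);
		return 0;
	}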
Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Cc: Clark Williams Cc: Daniel Bristot de Oliveira Cc: John Kacur Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160216183746.349ec98b@gandalf.local.home Signed-off-by: Ingo Molnar diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 406a9c2..a774b4d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -58,7 +58,15 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_lock(&rt_b->rt_runtime_lock); if (!rt_b->rt_period_active) { rt_b->rt_period_active = 1; - hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); + /* + * SCHED_DEADLINE updates the bandwidth, as a run away + * RT task with a DL task could hog a CPU. But DL does + * not reset the period. If a deadline task was running + * without an RT task running, it can cause RT tasks to + * throttle when they start up. Kick the timer right away + * to update the period. + */ + hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0)); hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); } raw_spin_unlock(&rt_b->rt_runtime_lock); -- cgit v0.10.2 From 382c2fe994321d503647ce8ee329b9420dc7c1f9 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 10 Feb 2016 20:08:24 -0500 Subject: sched, time: Remove non-power-of-two divides from __acct_update_integrals() When running a microbenchmark calling an invalid syscall number in a loop, on a nohz_full CPU, we spend a full 9% of our CPU time in __acct_update_integrals(). This function converts cputime_t to jiffies, to a timeval, only to convert the timeval back to microseconds before discarding it. This patch leaves __acct_update_integrals() functionally equivalent, but speeds things up by about 12%, with 10 million calls to an invalid syscall number dropping from 3.7 to 3.25 seconds. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: clark@redhat.com Cc: eric.dumazet@gmail.com Cc: fweisbec@gmail.com Cc: luto@amacapital.net Link: http://lkml.kernel.org/r/1455152907-18495-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 975cb49..460ee2b 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) { struct mm_struct *mm; - /* convert pages-usec to Mbyte-usec */ - stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB; - stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB; + /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */ + stats->coremem = p->acct_rss_mem1 * PAGE_SIZE; + do_div(stats->coremem, 1000 * KB); + stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE; + do_div(stats->virtmem, 1000 * KB); mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ @@ -125,22 +127,26 @@ static void __acct_update_integrals(struct task_struct *tsk, { if (likely(tsk->mm)) { cputime_t time, dtime; - struct timeval value; unsigned long flags; u64 delta; local_irq_save(flags); time = stime + utime; dtime = time - tsk->acct_timexpd; - jiffies_to_timeval(cputime_to_jiffies(dtime), &value); - delta = value.tv_sec; - delta = delta * USEC_PER_SEC + value.tv_usec; + /* Avoid division: cputime_t is often in nanoseconds already. 
*/ + delta = cputime_to_nsecs(dtime); - if (delta == 0) + if (delta < TICK_NSEC) goto out; + tsk->acct_timexpd = time; - tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; + /* + * Divide by 1024 to avoid overflow, and to avoid division. + * The final unit reported to userspace is Mbyte-usecs, + * the rest of the math is done in xacct_add_tsk. + */ + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; + tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; out: local_irq_restore(flags); } -- cgit v0.10.2 From b2add86edd3bc050af350515e6ba26f4622c38f3 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 10 Feb 2016 20:08:25 -0500 Subject: acct, time: Change indentation in __acct_update_integrals() Change the indentation in __acct_update_integrals() to make the function a little easier to read. Suggested-by: Peter Zijlstra Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Acked-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: clark@redhat.com Cc: eric.dumazet@gmail.com Cc: fweisbec@gmail.com Cc: luto@amacapital.net Link: http://lkml.kernel.org/r/1455152907-18495-3-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 460ee2b..d12e815 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -125,31 +125,32 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) static void __acct_update_integrals(struct task_struct *tsk, cputime_t utime, cputime_t stime) { - if (likely(tsk->mm)) { - cputime_t time, dtime; - unsigned long flags; - u64 delta; - - local_irq_save(flags); - time = stime + utime; - dtime = time - tsk->acct_timexpd; - /* Avoid division: cputime_t is often in nanoseconds already. */ - delta = cputime_to_nsecs(dtime); - - if (delta < TICK_NSEC) - goto out; - - tsk->acct_timexpd = time; - /* - * Divide by 1024 to avoid overflow, and to avoid division. - * The final unit reported to userspace is Mbyte-usecs, - * the rest of the math is done in xacct_add_tsk. - */ - tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; - out: - local_irq_restore(flags); - } + cputime_t time, dtime; + unsigned long flags; + u64 delta; + + if (!likely(tsk->mm)) + return; + + local_irq_save(flags); + time = stime + utime; + dtime = time - tsk->acct_timexpd; + /* Avoid division: cputime_t is often in nanoseconds already. */ + delta = cputime_to_nsecs(dtime); + + if (delta < TICK_NSEC) + goto out; + + tsk->acct_timexpd = time; + /* + * Divide by 1024 to avoid overflow, and to avoid division. + * The final unit reported to userspace is Mbyte-usecs, + * the rest of the math is done in xacct_add_tsk. + */ + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; + tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; +out: + local_irq_restore(flags); } /** -- cgit v0.10.2 From 9344c92c2e72e495f695caef8364b3dd73af0eab Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 10 Feb 2016 20:08:26 -0500 Subject: time, acct: Drop irq save & restore from __acct_update_integrals() It looks like all the call paths that lead to __acct_update_integrals() already have irqs disabled, and __acct_update_integrals() does not need to disable irqs itself. This is very convenient since about half the CPU time left in this function was spent in local_irq_save alone. Performance of a microbenchmark that calls an invalid syscall ten million times in a row on a nohz_full CPU improves 21% vs. 
4.5-rc1 with both the removal of divisions from __acct_update_integrals() and this patch, with runtime dropping from 3.7 to 2.9 seconds. With these patches applied, the highest remaining cpu user in the trace is native_sched_clock, which is addressed in the next patch. For testing purposes I stuck a WARN_ON(!irqs_disabled()) test in __acct_update_integrals(). It did not trigger. Suggested-by: Peter Zijlstra Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Mike Galbraith Cc: clark@redhat.com Cc: eric.dumazet@gmail.com Cc: fweisbec@gmail.com Cc: luto@amacapital.net Link: http://lkml.kernel.org/r/1455152907-18495-4-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/tsacct.c b/kernel/tsacct.c index d12e815..f8e26ab 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -126,20 +126,18 @@ static void __acct_update_integrals(struct task_struct *tsk, cputime_t utime, cputime_t stime) { cputime_t time, dtime; - unsigned long flags; u64 delta; if (!likely(tsk->mm)) return; - local_irq_save(flags); time = stime + utime; dtime = time - tsk->acct_timexpd; /* Avoid division: cputime_t is often in nanoseconds already. */ delta = cputime_to_nsecs(dtime); if (delta < TICK_NSEC) - goto out; + return; tsk->acct_timexpd = time; /* @@ -149,8 +147,6 @@ static void __acct_update_integrals(struct task_struct *tsk, */ tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; -out: - local_irq_restore(flags); } /** @@ -160,9 +156,12 @@ out: void acct_update_integrals(struct task_struct *tsk) { cputime_t utime, stime; + unsigned long flags; + local_irq_save(flags); task_cputime(tsk, &utime, &stime); __acct_update_integrals(tsk, utime, stime); + local_irq_restore(flags); } /** -- cgit v0.10.2 From ff9a9b4c4334b53b52ee9279f30bd5dd92ea9bdd Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 10 Feb 2016 20:08:27 -0500 Subject: sched, time: Switch VIRT_CPU_ACCOUNTING_GEN to jiffy granularity When profiling syscall overhead on nohz-full kernels, after removing __acct_update_integrals() from the profile, native_sched_clock() remains as the top CPU user. This can be reduced by moving VIRT_CPU_ACCOUNTING_GEN to jiffy granularity. This will reduce timing accuracy on nohz_full CPUs to jiffy based sampling, just like on normal CPUs. It results in totally removing native_sched_clock from the profile, and significantly speeding up the syscall entry and exit path, as well as irq entry and exit, and KVM guest entry & exit. Additionally, only call the more expensive functions (and advance the seqlock) when jiffies actually changed. This code relies on another CPU advancing jiffies when the system is busy. On a nohz_full system, this is done by a housekeeping CPU. A microbenchmark calling an invalid syscall number 10 million times in a row speeds up an additional 30% over the numbers with just the previous patches, for a total speedup of about 40% over 4.4 and 4.5-rc1. 
Run times for the microbenchmark: 4.4 3.8 seconds 4.5-rc1 3.7 seconds 4.5-rc1 + first patch 3.3 seconds 4.5-rc1 + first 3 patches 3.1 seconds 4.5-rc1 + all patches 2.3 seconds A non-NOHZ_FULL cpu (not the housekeeping CPU): all kernels 1.86 seconds Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: clark@redhat.com Cc: eric.dumazet@gmail.com Cc: fweisbec@gmail.com Cc: luto@amacapital.net Link: http://lkml.kernel.org/r/1455152907-18495-5-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index b2ab2ff..01d9898 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -668,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static unsigned long long vtime_delta(struct task_struct *tsk) +static cputime_t vtime_delta(struct task_struct *tsk) { - unsigned long long clock; + unsigned long now = READ_ONCE(jiffies); - clock = local_clock(); - if (clock < tsk->vtime_snap) + if (time_before(now, (unsigned long)tsk->vtime_snap)) return 0; - return clock - tsk->vtime_snap; + return jiffies_to_cputime(now - tsk->vtime_snap); } static cputime_t get_vtime_delta(struct task_struct *tsk) { - unsigned long long delta = vtime_delta(tsk); + unsigned long now = READ_ONCE(jiffies); + unsigned long delta = now - tsk->vtime_snap; WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); - tsk->vtime_snap += delta; + tsk->vtime_snap = now; - /* CHECKME: always safe to convert nsecs to cputime? */ - return nsecs_to_cputime(delta); + return jiffies_to_cputime(delta); } static void __vtime_account_system(struct task_struct *tsk) @@ -699,6 +698,9 @@ static void __vtime_account_system(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk) { + if (!vtime_delta(tsk)) + return; + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); write_seqcount_end(&tsk->vtime_seqcount); @@ -707,7 +709,8 @@ void vtime_account_system(struct task_struct *tsk) void vtime_gen_account_irq_exit(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); - __vtime_account_system(tsk); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); if (context_tracking_in_user()) tsk->vtime_snap_whence = VTIME_USER; write_seqcount_end(&tsk->vtime_seqcount); @@ -718,16 +721,19 @@ void vtime_account_user(struct task_struct *tsk) cputime_t delta_cpu; write_seqcount_begin(&tsk->vtime_seqcount); - delta_cpu = get_vtime_delta(tsk); tsk->vtime_snap_whence = VTIME_SYS; - account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + if (vtime_delta(tsk)) { + delta_cpu = get_vtime_delta(tsk); + account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + } write_seqcount_end(&tsk->vtime_seqcount); } void vtime_user_enter(struct task_struct *tsk) { write_seqcount_begin(&tsk->vtime_seqcount); - __vtime_account_system(tsk); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); tsk->vtime_snap_whence = VTIME_USER; write_seqcount_end(&tsk->vtime_seqcount); } @@ -742,7 +748,8 @@ void vtime_guest_enter(struct task_struct *tsk) * that can thus safely catch up with a tickless delta. 
*/ write_seqcount_begin(&tsk->vtime_seqcount); - __vtime_account_system(tsk); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); current->flags |= PF_VCPU; write_seqcount_end(&tsk->vtime_seqcount); } @@ -772,7 +779,7 @@ void arch_vtime_task_switch(struct task_struct *prev) write_seqcount_begin(&current->vtime_seqcount); current->vtime_snap_whence = VTIME_SYS; - current->vtime_snap = sched_clock_cpu(smp_processor_id()); + current->vtime_snap = jiffies; write_seqcount_end(&current->vtime_seqcount); } @@ -783,7 +790,7 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_save(flags); write_seqcount_begin(&t->vtime_seqcount); t->vtime_snap_whence = VTIME_SYS; - t->vtime_snap = sched_clock_cpu(cpu); + t->vtime_snap = jiffies; write_seqcount_end(&t->vtime_seqcount); local_irq_restore(flags); } -- cgit v0.10.2 From f904f58263e1df5f70feb8b283f4bbe662847334 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 26 Feb 2016 14:54:56 +0100 Subject: sched/debug: Fix preempt_disable_ip recording for preempt_disable() The preempt_disable() invokes preempt_count_add() which saves the caller in ->preempt_disable_ip. It uses CALLER_ADDR1 which does not look for its caller but for the parent of the caller. Which means we get the correct caller for something like spin_lock() unless the architecture inlines those invocations. It is always wrong for preempt_disable() or local_bh_disable(). This patch adds the function get_lock_parent_ip(), which tries CALLER_ADDR0,1,2 if the former is a locking function. This seems to record the preempt_disable() caller properly for preempt_disable() itself as well as for get_cpu_var() or local_bh_disable(). Steven asked for the get_parent_ip() -> get_lock_parent_ip() rename. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160226135456.GB18244@linutronix.de Signed-off-by: Ingo Molnar diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index c2b340e..6d9df3f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -713,6 +713,18 @@ static inline void __ftrace_enabled_restore(int enabled) #define CALLER_ADDR5 ((unsigned long)ftrace_return_address(5)) #define CALLER_ADDR6 ((unsigned long)ftrace_return_address(6)) +static inline unsigned long get_lock_parent_ip(void) +{ + unsigned long addr = CALLER_ADDR0; + + if (!in_lock_functions(addr)) + return addr; + addr = CALLER_ADDR1; + if (!in_lock_functions(addr)) + return addr; + return CALLER_ADDR2; +} + #ifdef CONFIG_IRQSOFF_TRACER extern void time_hardirqs_on(unsigned long a0, unsigned long a1); extern void time_hardirqs_off(unsigned long a0, unsigned long a1); diff --git a/include/linux/sched.h b/include/linux/sched.h index 87e5f98..9519b66 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -182,8 +182,6 @@ extern void update_cpu_load_nohz(int active); static inline void update_cpu_load_nohz(int active) { } #endif -extern unsigned long get_parent_ip(unsigned long addr); - extern void dump_cpu_task(int cpu); struct seq_file; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6dbb21f..423452c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2946,16 +2946,6 @@ u64 scheduler_tick_max_deferment(void) } #endif -notrace unsigned long get_parent_ip(unsigned long addr) -{ - if (in_lock_functions(addr)) { - addr = CALLER_ADDR2; - if (in_lock_functions(addr)) - addr = CALLER_ADDR3; - } - return addr; -} - #if 
defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) @@ -2977,7 +2967,7 @@ void preempt_count_add(int val) PREEMPT_MASK - 10); #endif if (preempt_count() == val) { - unsigned long ip = get_parent_ip(CALLER_ADDR1); + unsigned long ip = get_lock_parent_ip(); #ifdef CONFIG_DEBUG_PREEMPT current->preempt_disable_ip = ip; #endif @@ -3004,7 +2994,7 @@ void preempt_count_sub(int val) #endif if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); diff --git a/kernel/softirq.c b/kernel/softirq.c index 479e443..8aae49d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -116,9 +116,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) if (preempt_count() == cnt) { #ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); + current->preempt_disable_ip = get_lock_parent_ip(); #endif - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip()); } } EXPORT_SYMBOL(__local_bh_disable_ip); -- cgit v0.10.2 From 801ccdbf018ca5dbd478756ece55cd6c7726ed5b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Feb 2016 15:01:49 +0100 Subject: sched/deadline: Remove superfluous call to switched_to_dl() if (A || B) { } else if (A && !B) { } If A we'll take the first branch, if !A we will not satisfy the second. Therefore the second branch will never be taken. Reported-by: luca abeni Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160225140149.GK6357@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 04a569c..15abf04 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1772,8 +1772,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, */ resched_curr(rq); #endif /* CONFIG_SMP */ - } else - switched_to_dl(rq, p); + } } const struct sched_class dl_sched_class = { -- cgit v0.10.2 From 920c720aa5aa3900a7f1689228fdfc2580a91e7e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 1 Feb 2016 15:11:28 +0100 Subject: locking/mcs: Fix mcs_spin_lock() ordering Similar to commit b4b29f94856a ("locking/osq: Fix ordering of node initialisation in osq_lock") the use of xchg_acquire() is fundamentally broken with MCS-like constructs. Furthermore, it turns out we rely on the global transitivity of this operation because the unlock path observes the pointer with a READ_ONCE(), not an smp_load_acquire(). This is non-critical because the MCS code isn't actually used and mostly serves as documentation, a stepping stone to the more complex things we've built on top of the idea. Reported-by: Andrea Parri Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Fixes: 3552a07a9c4a ("locking/mcs: Use acquire/release semantics") Signed-off-by: Ingo Molnar diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 5b9102a..c835270 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -67,7 +67,13 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) node->locked = 0; node->next = NULL; - prev = xchg_acquire(lock, node); + /* + * We rely on the full barrier with global transitivity implied by the + * below xchg() to order the initialization stores above against any + * observation of @node. And to provide the ACQUIRE ordering associated + * with a LOCK primitive. + */ + prev = xchg(lock, node); if (likely(prev == NULL)) { /* * Lock acquired, don't need to set node->locked to 1. Threads -- cgit v0.10.2 From eaff0e7003cca6c2748b67ead2d4b1a8ad858fc7 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 10 Dec 2015 15:17:46 -0500 Subject: locking/pvqspinlock: Move lock stealing count tracking code into pv_queued_spin_steal_lock() This patch moves the lock stealing count tracking code into pv_queued_spin_steal_lock() instead of via a jacket function simplifying the code. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Douglas Hatch Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1449778666-13593-3-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 87bb235..78f04a2 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -55,6 +55,11 @@ struct pv_node { }; /* + * Include queued spinlock statistics code + */ +#include "qspinlock_stat.h" + +/* * By replacing the regular queued_spin_trylock() with the function below, * it will be called once when a lock waiter enter the PV slowpath before * being queued. 
By allowing one lock stealing attempt here when the pending @@ -65,9 +70,11 @@ struct pv_node { static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; + int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && + (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); - return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && - (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); + qstat_inc(qstat_pv_lock_stealing, ret); + return ret; } /* @@ -138,11 +145,6 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock) #endif /* _Q_PENDING_BITS == 8 */ /* - * Include queued spinlock statistics code - */ -#include "qspinlock_stat.h" - -/* * Lock and MCS node addresses hash table for fast lookup * * Hashing is done on a per-cacheline basis to minimize the need to access diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 640dcec..869988d 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -279,19 +279,6 @@ static inline void __pv_wait(u8 *ptr, u8 val) #define pv_kick(c) __pv_kick(c) #define pv_wait(p, v) __pv_wait(p, v) -/* - * PV unfair trylock count tracking function - */ -static inline int qstat_spin_steal_lock(struct qspinlock *lock) -{ - int ret = pv_queued_spin_steal_lock(lock); - - qstat_inc(qstat_pv_lock_stealing, ret); - return ret; -} -#undef queued_spin_trylock -#define queued_spin_trylock(l) qstat_spin_steal_lock(l) - #else /* CONFIG_QUEUED_LOCK_STAT */ static inline void qstat_inc(enum qlock_stats stat, bool cond) { } -- cgit v0.10.2 From cb037fdad6772df2d49fe61c97d7c0d8265bc918 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 10 Dec 2015 15:17:44 -0500 Subject: locking/qspinlock: Use smp_cond_acquire() in pending code The newly introduced smp_cond_acquire() was used to replace the slowpath lock acquisition loop. Similarly, the new function can also be applied to the pending bit locking loop. This patch uses the new function in that loop. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Douglas Hatch Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1449778666-13593-1-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 393d187..ce2f75e 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -358,8 +358,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * sequentiality; this is because not all clear_pending_set_locked() * implementations imply full barriers. */ - while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK) - cpu_relax(); + smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK)); /* * take ownership and clear the pending bit. @@ -435,7 +434,7 @@ queue: * * The PV pv_wait_head_or_lock function, if active, will acquire * the lock and return a non-zero value. So we have to skip the - * smp_load_acquire() call. As the next PV queue head hasn't been + * smp_cond_acquire() call. As the next PV queue head hasn't been * designated yet, there is no way for the locked value to become * _Q_SLOW_VAL. So both the set_locked() and the * atomic_cmpxchg_relaxed() calls will be safe. @@ -466,7 +465,7 @@ locked: break; } /* - * The smp_load_acquire() call above has provided the necessary + * The smp_cond_acquire() call above has provided the necessary * acquire semantics required for locking. 
At most two * iterations of this loop may be ran. */ -- cgit v0.10.2 From 32d62510f949d3c8e83b9b3b844a84446611661b Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 10 Dec 2015 15:17:45 -0500 Subject: locking/pvqspinlock: Enable slowpath locking count tracking This patch enables the tracking of the number of slowpath locking operations performed. This can be used to compare against the number of lock stealing operations to see what percentage of locks are stolen versus acquired via the regular slowpath. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Douglas Hatch Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1449778666-13593-2-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 78f04a2..21ede57 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -400,6 +400,11 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) if (READ_ONCE(pn->state) == vcpu_hashed) lp = (struct qspinlock **)1; + /* + * Tracking # of slowpath locking operations + */ + qstat_inc(qstat_pv_lock_slowpath, true); + for (;; waitcnt++) { /* * Set correct vCPU state to be used by queue node wait-early diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 869988d..eb2a2c9 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -22,6 +22,7 @@ * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake * pv_latency_kick - average latency (ns) of vCPU kick operation * pv_latency_wake - average latency (ns) from vCPU kick to wakeup + * pv_lock_slowpath - # of locking operations via the slowpath * pv_lock_stealing - # of lock stealing operations * pv_spurious_wakeup - # of spurious wakeups * pv_wait_again - # of vCPU wait's that happened after a vCPU kick @@ -45,6 +46,7 @@ enum qlock_stats { qstat_pv_kick_wake, qstat_pv_latency_kick, qstat_pv_latency_wake, + qstat_pv_lock_slowpath, qstat_pv_lock_stealing, qstat_pv_spurious_wakeup, qstat_pv_wait_again, @@ -70,6 +72,7 @@ static const char * const qstat_names[qstat_num + 1] = { [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", [qstat_pv_latency_kick] = "pv_latency_kick", [qstat_pv_latency_wake] = "pv_latency_wake", + [qstat_pv_lock_slowpath] = "pv_lock_slowpath", [qstat_pv_lock_stealing] = "pv_lock_stealing", [qstat_pv_wait_again] = "pv_wait_again", [qstat_pv_wait_early] = "pv_wait_early", -- cgit v0.10.2 From 1329ce6fbbe4536592dfcfc8d64d61bfeb598fe6 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sun, 24 Jan 2016 18:23:43 -0800 Subject: locking/mutex: Allow next waiter lockless wakeup Make use of wake-queues and enable the wakeup to occur after releasing the wait_lock. This is similar to what we do with rtmutex top waiter, slightly shortening the critical region and allow other waiters to acquire the wait_lock sooner. In low contention cases it can also help the recently woken waiter to find the wait_lock available (fastpath) when it continues execution. Reviewed-by: Waiman Long Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Ding Tianhong Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Waiman Long Cc: Will Deacon Link: http://lkml.kernel.org/r/20160125022343.GA3322@linux-uzut.site Signed-off-by: Ingo Molnar diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 0551c21..e364b42 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -716,6 +716,7 @@ static inline void __mutex_unlock_common_slowpath(struct mutex *lock, int nested) { unsigned long flags; + WAKE_Q(wake_q); /* * As a performance measurement, release the lock before doing other @@ -743,11 +744,11 @@ __mutex_unlock_common_slowpath(struct mutex *lock, int nested) struct mutex_waiter, list); debug_mutex_wake_waiter(lock, waiter); - - wake_up_process(waiter->task); + wake_q_add(&wake_q, waiter->task); } spin_unlock_mutex(&lock->wait_lock, flags); + wake_up_q(&wake_q); } /* -- cgit v0.10.2 From b82e530290a0437522720becaf4abdf8ca4cb7d2 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Fri, 19 Feb 2016 13:49:27 -0500 Subject: locking/qspinlock: Move __ARCH_SPIN_LOCK_UNLOCKED to qspinlock_types.h Move the __ARCH_SPIN_LOCK_UNLOCKED definition from qspinlock.h into qspinlock_types.h. The definition of __ARCH_SPIN_LOCK_UNLOCKED comes from the build arch's include files; but on x86 when CONFIG_QUEUED_SPINLOCKS=y, it's just defined in asm-generic/qspinlock.h. In most cases, this doesn't matter because linux/spinlock.h includes asm/spinlock.h, which for x86 includes asm-generic/qspinlock.h. However, any code that only includes linux/mutex.h will break, because it only includes asm/spinlock_types.h. For example, this breaks systemtap, which only includes mutex.h. Signed-off-by: Dan Streetman Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Cc: Andrew Morton Cc: Arnd Bergmann Cc: Dan Streetman Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455907767-17821-1-git-send-email-dan.streetman@canonical.com Signed-off-by: Ingo Molnar diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index 39e1cb2..35a52a8 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -120,11 +120,6 @@ static __always_inline bool virt_spin_lock(struct qspinlock *lock) #endif /* - * Initializier - */ -#define __ARCH_SPIN_LOCK_UNLOCKED { ATOMIC_INIT(0) } - -/* * Remapping spinlock architecture specific functions to the corresponding * queued spinlock functions. */ diff --git a/include/asm-generic/qspinlock_types.h b/include/asm-generic/qspinlock_types.h index 85f888e..034acd0 100644 --- a/include/asm-generic/qspinlock_types.h +++ b/include/asm-generic/qspinlock_types.h @@ -33,6 +33,11 @@ typedef struct qspinlock { } arch_spinlock_t; /* + * Initializier + */ +#define __ARCH_SPIN_LOCK_UNLOCKED { ATOMIC_INIT(0) } + +/* * Bitfields in the atomic value: * * When NR_CPUS < 16K -- cgit v0.10.2 From b2ed0998f66c4f68d3cccd2558c627b194e30415 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 29 Feb 2016 10:25:32 +0100 Subject: tools/lib/lockdep: Fix the build on recent kernels The following upstream commit: 4a389810bc3c ("kernel/locking/lockdep.c: convert hash tables to hlists") broke the tools/lib/lockdep build. Add trivial RCU wrappers to fix it. These wrappers should probably be moved into their own header file. Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Linus Torvalds Cc: Mike Krinkin Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Sasha Levin Signed-off-by: Ingo Molnar diff --git a/tools/lib/lockdep/lockdep.c b/tools/lib/lockdep/lockdep.c index f42b7e9..a0a2e3a 100644 --- a/tools/lib/lockdep/lockdep.c +++ b/tools/lib/lockdep/lockdep.c @@ -1,2 +1,8 @@ #include + +/* Trivial API wrappers, we don't (yet) have RCU in user-space: */ +#define hlist_for_each_entry_rcu hlist_for_each_entry +#define hlist_add_head_rcu hlist_add_head +#define hlist_del_rcu hlist_del + #include "../../../kernel/locking/lockdep.c" -- cgit v0.10.2 From 9d5a23ac8e0ede14a2b23740d231727ba5be483a Mon Sep 17 00:00:00 2001 From: Alfredo Alvarez Fernandez Date: Fri, 19 Feb 2016 07:48:51 +0100 Subject: tools/lib/lockdep: Add userspace version of READ_ONCE() This was added to the kernel code in <1658d35ead5d> ("list: Use READ_ONCE() when testing for empty lists"). There's nothing special we need to do about it in userspace. Signed-off-by: Alfredo Alvarez Fernandez Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455864533-7536-2-git-send-email-alfredoalvarezernandez@gmail.com Signed-off-by: Ingo Molnar diff --git a/tools/lib/lockdep/uinclude/linux/compiler.h b/tools/lib/lockdep/uinclude/linux/compiler.h index 6386dc3..fd3e56a 100644 --- a/tools/lib/lockdep/uinclude/linux/compiler.h +++ b/tools/lib/lockdep/uinclude/linux/compiler.h @@ -3,6 +3,7 @@ #define __used __attribute__((__unused__)) #define unlikely +#define READ_ONCE(x) (x) #define WRITE_ONCE(x, val) x=(val) #define RCU_INIT_POINTER(p, v) p=(v) -- cgit v0.10.2 From 11a1ac206d0806b0db39712c3292f6006e77a7e8 Mon Sep 17 00:00:00 2001 From: Alfredo Alvarez Fernandez Date: Fri, 19 Feb 2016 07:48:52 +0100 Subject: tools/lib/lockdep: Add tests for AA and ABBA locking Add test for AA and 2 threaded ABBA locking. Rename AA.c to ABA.c since it was implementing an ABA instead of a pure AA. Now both cases are covered. The expected output for AA.c is that the process blocks and lockdep reports a deadlock. ABBA_2threads.c differs from ABBA.c in that lockdep keeps separate chains of held locks per task. This can lead to different behaviour regarding lock detection. The expected output for this test is that the process blocks and lockdep reports a circular locking dependency. These tests found a lockdep bug - fixed by the next commit. Signed-off-by: Alfredo Alvarez Fernandez Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1455864533-7536-3-git-send-email-alfredoalvarezernandez@gmail.com Signed-off-by: Ingo Molnar diff --git a/tools/lib/lockdep/tests/AA.c b/tools/lib/lockdep/tests/AA.c index 0f782ff..18211a5 100644 --- a/tools/lib/lockdep/tests/AA.c +++ b/tools/lib/lockdep/tests/AA.c @@ -1,13 +1,13 @@ #include -void main(void) +int main(void) { - pthread_mutex_t a, b; + pthread_mutex_t a; pthread_mutex_init(&a, NULL); - pthread_mutex_init(&b, NULL); pthread_mutex_lock(&a); - pthread_mutex_lock(&b); pthread_mutex_lock(&a); + + return 0; } diff --git a/tools/lib/lockdep/tests/ABA.c b/tools/lib/lockdep/tests/ABA.c new file mode 100644 index 0000000..0f782ff --- /dev/null +++ b/tools/lib/lockdep/tests/ABA.c @@ -0,0 +1,13 @@ +#include + +void main(void) +{ + pthread_mutex_t a, b; + + pthread_mutex_init(&a, NULL); + pthread_mutex_init(&b, NULL); + + pthread_mutex_lock(&a); + pthread_mutex_lock(&b); + pthread_mutex_lock(&a); +} diff --git a/tools/lib/lockdep/tests/ABBA_2threads.c b/tools/lib/lockdep/tests/ABBA_2threads.c new file mode 100644 index 0000000..cd807d7 --- /dev/null +++ b/tools/lib/lockdep/tests/ABBA_2threads.c @@ -0,0 +1,46 @@ +#include +#include + +pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER; +pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER; +pthread_barrier_t bar; + +void *ba_lock(void *arg) +{ + int ret, i; + + pthread_mutex_lock(&b); + + if (pthread_barrier_wait(&bar) == PTHREAD_BARRIER_SERIAL_THREAD) + pthread_barrier_destroy(&bar); + + pthread_mutex_lock(&a); + + pthread_mutex_unlock(&a); + pthread_mutex_unlock(&b); +} + +int main(void) +{ + pthread_t t; + + pthread_barrier_init(&bar, NULL, 2); + + if (pthread_create(&t, NULL, ba_lock, NULL)) { + fprintf(stderr, "pthread_create() failed\n"); + return 1; + } + pthread_mutex_lock(&a); + + if (pthread_barrier_wait(&bar) == PTHREAD_BARRIER_SERIAL_THREAD) + pthread_barrier_destroy(&bar); + + pthread_mutex_lock(&b); + + pthread_mutex_unlock(&b); + pthread_mutex_unlock(&a); + + pthread_join(t, NULL); + + return 0; +} -- cgit v0.10.2 From 013e379a3094ff2898f8d33cfbff1573d471ee14 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 29 Feb 2016 10:29:59 +0100 Subject: tools/lib/lockdep: Fix link creation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This warning triggers if the .so library has already been linked: triton:~/tip/tools/lib/lockdep> make CC common.o CC lockdep.o CC rbtree.o LD liblockdep-in.o LD liblockdep.a ln: failed to create symbolic link ‘liblockdep.so’: File exists LD liblockdep.so.4.5.0-rc6 Overwrite the link. Cc: Alfredo Alvarez Fernandez Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/lib/lockdep/Makefile b/tools/lib/lockdep/Makefile index 90d2bae..1d57af5 100644 --- a/tools/lib/lockdep/Makefile +++ b/tools/lib/lockdep/Makefile @@ -100,7 +100,7 @@ include $(srctree)/tools/build/Makefile.include do_compile_shared_library = \ ($(print_shared_lib_compile) \ - $(CC) --shared $^ -o $@ -lpthread -ldl -Wl,-soname='"$@"';$(shell ln -s $@ liblockdep.so)) + $(CC) --shared $^ -o $@ -lpthread -ldl -Wl,-soname='"$@"';$(shell ln -sf $@ liblockdep.so)) do_build_static_lib = \ ($(print_static_lib_build) \ -- cgit v0.10.2 From 5f18ab5c6bdba735b31a1e7c2618f48eae6b1b5c Mon Sep 17 00:00:00 2001 From: Alfredo Alvarez Fernandez Date: Thu, 11 Feb 2016 00:33:32 +0100 Subject: locking/lockdep: Prevent chain_key collisions The chain_key hashing macro iterate_chain_key(key1, key2) does not generate a new different value if both key1 and key2 are 0. In that case the generated value is again 0. This can lead to collisions which can result in lockdep not detecting deadlocks or circular dependencies. Avoid the problem by using class_idx (1-based) instead of class id (0-based) as an input for the hashing macro 'key2' in iterate_chain_key(key1, key2). The use of class id created collisions in cases like the following: 1.- Consider an initial state in which no class has been acquired yet. Under these circumstances an AA deadlock will not be detected by lockdep: lock [key1,key2]->new key (key1=old chain_key, key2=id) -------------------------- A [0,0]->0 A [0,0]->0 (collision) The newly generated chain_key collides with the one used before and as a result the check for a deadlock is skipped A simple test using liblockdep and a pthread mutex confirms the problem: (omitting stack traces) new class 0xe15038: 0x7ffc64950f20 acquire class [0xe15038] 0x7ffc64950f20 acquire class [0xe15038] 0x7ffc64950f20 hash chain already cached, key: 0000000000000000 tail class: [0xe15038] 0x7ffc64950f20 2.- Consider an ABBA in 2 different tasks and no class yet acquired. T1 [key1,key2]->new key T2[key1,key2]->new key -- -- A [0,0]->0 B [0,1]->1 B [0,1]->1 (collision) A In this case the collision prevents lockdep from creating the new dependency A->B. This in turn results in lockdep not detecting the circular dependency when T2 acquires A. Signed-off-by: Alfredo Alvarez Fernandez Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: sasha.levin@oracle.com Link: http://lkml.kernel.org/r/1455147212-2389-4-git-send-email-alfredoalvarezernandez@gmail.com Signed-off-by: Ingo Molnar diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 3261214..6f446eb 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2143,7 +2143,7 @@ static void check_chain_key(struct task_struct *curr) { #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; - unsigned int i, id; + unsigned int i; u64 chain_key = 0; for (i = 0; i < curr->lockdep_depth; i++) { @@ -2160,17 +2160,16 @@ static void check_chain_key(struct task_struct *curr) (unsigned long long)hlock->prev_chain_key); return; } - id = hlock->class_idx - 1; /* * Whoops ran out of static storage again? 
*/ - if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS)) return; if (prev_hlock && (prev_hlock->irq_context != hlock->irq_context)) chain_key = 0; - chain_key = iterate_chain_key(chain_key, id); + chain_key = iterate_chain_key(chain_key, hlock->class_idx); prev_hlock = hlock; } if (chain_key != curr->curr_chain_key) { @@ -3048,7 +3047,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, struct task_struct *curr = current; struct lock_class *class = NULL; struct held_lock *hlock; - unsigned int depth, id; + unsigned int depth; int chain_head = 0; int class_idx; u64 chain_key; @@ -3151,11 +3150,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * The 'key ID' is what is the most compact key value to drive * the hash, not class->key. */ - id = class - lock_classes; /* * Whoops, we did it again.. ran straight out of our static allocation. */ - if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS)) return 0; chain_key = curr->curr_chain_key; @@ -3173,7 +3171,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, chain_key = 0; chain_head = 1; } - chain_key = iterate_chain_key(chain_key, id); + chain_key = iterate_chain_key(chain_key, class_idx); if (nest_lock && !__lock_is_held(nest_lock)) return print_lock_nested_lock_not_held(curr, hlock, ip); -- cgit v0.10.2 From 9e4e7554e755aaad8df0e26268787438735b4b76 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 29 Feb 2016 10:03:58 +0100 Subject: locking/lockdep: Detect chain_key collisions Add detection for chain_key collision under CONFIG_DEBUG_LOCKDEP. When a collision is detected the problem is reported and all lock debugging is turned off. Tested using liblockdep and the added tests before and after applying the fix, confirming both that the code added for the detection correctly reports the problem and that the fix actually fixes it. Tested tweaking lockdep to generate false collisions and verified that the problem is reported and that lock debugging is turned off. Also tested with lockdep's test suite after applying the patch: [ 0.000000] Good, all 253 testcases passed! | Signed-off-by: Alfredo Alvarez Fernandez Cc: Alfredo Alvarez Fernandez Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: sasha.levin@oracle.com Link: http://lkml.kernel.org/r/1455864533-7536-4-git-send-email-alfredoalvarezernandez@gmail.com Signed-off-by: Ingo Molnar diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 6f446eb..f894a2c 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1982,6 +1982,53 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) } /* + * Returns the index of the first held_lock of the current chain + */ +static inline int get_first_held_lock(struct task_struct *curr, + struct held_lock *hlock) +{ + int i; + struct held_lock *hlock_curr; + + for (i = curr->lockdep_depth - 1; i >= 0; i--) { + hlock_curr = curr->held_locks + i; + if (hlock_curr->irq_context != hlock->irq_context) + break; + + } + + return ++i; +} + +/* + * Checks whether the chain and the current held locks are consistent + * in depth and also in content. If they are not it most likely means + * that there was a collision during the calculation of the chain_key. 
+ * Returns: 0 not passed, 1 passed + */ +static int check_no_collision(struct task_struct *curr, + struct held_lock *hlock, + struct lock_chain *chain) +{ +#ifdef CONFIG_DEBUG_LOCKDEP + int i, j, id; + + i = get_first_held_lock(curr, hlock); + + if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1))) + return 0; + + for (j = 0; j < chain->depth - 1; j++, i++) { + id = curr->held_locks[i].class_idx - 1; + + if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) + return 0; + } +#endif + return 1; +} + +/* * Look up a dependency chain. If the key is not present yet then * add it and return 1 - in this case the new dependency chain is * validated. If the key is already hashed, return 0. @@ -1994,7 +2041,6 @@ static inline int lookup_chain_cache(struct task_struct *curr, struct lock_class *class = hlock_class(hlock); struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; - struct held_lock *hlock_curr; int i, j; /* @@ -2012,6 +2058,9 @@ static inline int lookup_chain_cache(struct task_struct *curr, if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(chain_lookup_hits); + if (!check_no_collision(curr, hlock, chain)) + return 0; + if (very_verbose(class)) printk("\nhash chain already cached, key: " "%016Lx tail class: [%p] %s\n", @@ -2049,13 +2098,7 @@ cache_hit: chain = lock_chains + nr_lock_chains++; chain->chain_key = chain_key; chain->irq_context = hlock->irq_context; - /* Find the first held_lock of current chain */ - for (i = curr->lockdep_depth - 1; i >= 0; i--) { - hlock_curr = curr->held_locks + i; - if (hlock_curr->irq_context != hlock->irq_context) - break; - } - i++; + i = get_first_held_lock(curr, hlock); chain->depth = curr->lockdep_depth + 1 - i; if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { chain->base = nr_chain_hlocks; -- cgit v0.10.2 From 4cad67fca3fc952d6f2ed9e799621f07666a560f Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 28 Feb 2016 17:32:07 +0200 Subject: arm/arm64: KVM: Fix ioctl error handling Calling return copy_to_user(...) in an ioctl will not do the right thing if there's a pagefault: copy_to_user returns the number of bytes not copied in this case. Fix up kvm to do return copy_to_user(...)) ? -EFAULT : 0; everywhere. Cc: stable@vger.kernel.org Acked-by: Christoffer Dall Signed-off-by: Michael S. Tsirkin Signed-off-by: Marc Zyngier diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c index 5fa69d7..99361f1 100644 --- a/arch/arm/kvm/guest.c +++ b/arch/arm/kvm/guest.c @@ -161,7 +161,7 @@ static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) u64 val; val = kvm_arm_timer_get_reg(vcpu, reg->id); - return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)); + return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? -EFAULT : 0; } static unsigned long num_core_regs(void) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index fcb7788..9e54ad7 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -194,7 +194,7 @@ static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) u64 val; val = kvm_arm_timer_get_reg(vcpu, reg->id); - return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)); + return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? 
-EFAULT : 0; } /** -- cgit v0.10.2 From 71e60073ca7b5f931b16f29add16c68b2e06949a Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Tue, 16 Feb 2016 12:29:46 +0100 Subject: MIPS: jz4740: Fix surviving instance of irq_to_gpio() This is fallout from commit 832f5dacfa0b ("MIPS: Remove all the uses of custom gpio.h"). Signed-off-by: Ralf Baechle Suggested-by: Lars-Peter Clausen diff --git a/arch/mips/jz4740/gpio.c b/arch/mips/jz4740/gpio.c index 8c6d76c..d9907e5 100644 --- a/arch/mips/jz4740/gpio.c +++ b/arch/mips/jz4740/gpio.c @@ -270,7 +270,7 @@ uint32_t jz_gpio_port_get_value(int port, uint32_t mask) } EXPORT_SYMBOL(jz_gpio_port_get_value); -#define IRQ_TO_BIT(irq) BIT(irq_to_gpio(irq) & 0x1f) +#define IRQ_TO_BIT(irq) BIT((irq - JZ4740_IRQ_GPIO(0)) & 0x1f) static void jz_gpio_check_trigger_both(struct jz_gpio_chip *chip, unsigned int irq) { -- cgit v0.10.2 From 51ff5d7767eae285969da75c209e9425d84b012d Mon Sep 17 00:00:00 2001 From: Daniel Sanders Date: Tue, 23 Feb 2016 10:29:20 +0000 Subject: MIPS: Avoid variant of .type unsupported by LLVM Assembler The target independent parts of the LLVM Lexer considers 'fault@function' to be a single token representing the 'fault' symbol with a 'function' modifier. However, this is not the case in the .type directive where 'function' refers to STT_FUNC from the ELF standard. Although GAS accepts it, '.type symbol@function' is an undocumented form of this directive. The documentation specifies a comma between the symbol and '@function'. Signed-off-by: Scott Egerton Signed-off-by: Daniel Sanders Reviewed-by: Maciej W. Rozycki Cc: Paul Burton Cc: Leonid Yegoshin Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/12587/ Signed-off-by: Ralf Baechle diff --git a/arch/mips/kernel/r2300_fpu.S b/arch/mips/kernel/r2300_fpu.S index 5ce3b74..b4ac637 100644 --- a/arch/mips/kernel/r2300_fpu.S +++ b/arch/mips/kernel/r2300_fpu.S @@ -125,7 +125,7 @@ LEAF(_restore_fp_context) END(_restore_fp_context) .set reorder - .type fault@function + .type fault, @function .ent fault fault: li v0, -EFAULT jr ra diff --git a/arch/mips/kernel/r4k_fpu.S b/arch/mips/kernel/r4k_fpu.S index f09546e..17732f8 100644 --- a/arch/mips/kernel/r4k_fpu.S +++ b/arch/mips/kernel/r4k_fpu.S @@ -358,7 +358,7 @@ LEAF(_restore_msa_all_upper) .set reorder - .type fault@function + .type fault, @function .ent fault fault: li v0, -EFAULT # failure jr ra -- cgit v0.10.2 From 923adb1646d5ba739d2a1e63ee20d60574d9da8e Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Wed, 24 Feb 2016 18:27:51 +0100 Subject: cxl: Fix PSL timebase synchronization detection The PSL timebase synchronization is seemingly failing for configuration not including VIRT_CPU_ACCOUNTING_NATIVE. The driver shows the following trace in dmesg: PSL: Timebase sync: giving up! The PSL timebase register is actually syncing correctly, but the cxl driver is not detecting it. Fix is to use the proper timebase-to-time conversion. Signed-off-by: Frederic Barrat Cc: # 4.3+ Acked-by: Michael Neuling Reviewed-by: Matthew R. 
Ochs Acked-by: Ian Munsie Reviewed-by: Andrew Donnellan Reviewed-by: Vaibhav Jain Signed-off-by: Michael Ellerman diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 4c1903f..0c6c17a1 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -415,7 +415,7 @@ static int cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev) delta = mftb() - psl_tb; if (delta < 0) delta = -delta; - } while (cputime_to_usecs(delta) > 16); + } while (tb_to_ns(delta) > 16000); return 0; } -- cgit v0.10.2 From 67d5268908283c187e0a460048a423256c2fb288 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 27 Feb 2016 21:21:12 +0100 Subject: perf tools: Fix python extension build The util/python-ext-sources file contains source files required to build the python extension relative to $(srctree)/tools/perf, Such a file path $(FILE).c is handed over to the python extension build system, which builds the final object in the $(PYTHON_EXTBUILD)/tmp/$(FILE).o path. After the build is done all files from $(PYTHON_EXTBUILD)lib/ are carried as the result binaries. Above system fails when we add source file relative to ../lib, which we do for: ../lib/bitmap.c ../lib/find_bit.c ../lib/hweight.c ../lib/rbtree.c All above objects will be built like: $(PYTHON_EXTBUILD)/tmp/../lib/bitmap.c $(PYTHON_EXTBUILD)/tmp/../lib/find_bit.c $(PYTHON_EXTBUILD)/tmp/../lib/hweight.c $(PYTHON_EXTBUILD)/tmp/../lib/rbtree.c which accidentally happens to be final library path: $(PYTHON_EXTBUILD)/lib/ Changing setup.py to pass full paths of source files to Extension build class and thus keep all built objects under $(PYTHON_EXTBUILD)tmp directory. Reported-by: Jeff Bastian Signed-off-by: Jiri Olsa Tested-by: Josh Boyer Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Cc: stable@vger.kernel.org # v4.2+ Link: http://lkml.kernel.org/r/20160227201350.GB28494@krava.redhat.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py index 1833103..c868098 100644 --- a/tools/perf/util/setup.py +++ b/tools/perf/util/setup.py @@ -22,6 +22,7 @@ cflags = getenv('CFLAGS', '').split() # switch off several checks (need to be at the end of cflags list) cflags += ['-fno-strict-aliasing', '-Wno-write-strings', '-Wno-unused-parameter' ] +src_perf = getenv('srctree') + '/tools/perf' build_lib = getenv('PYTHON_EXTBUILD_LIB') build_tmp = getenv('PYTHON_EXTBUILD_TMP') libtraceevent = getenv('LIBTRACEEVENT') @@ -30,6 +31,9 @@ libapikfs = getenv('LIBAPI') ext_sources = [f.strip() for f in file('util/python-ext-sources') if len(f.strip()) > 0 and f[0] != '#'] +# use full paths with source files +ext_sources = map(lambda x: '%s/%s' % (src_perf, x) , ext_sources) + perf = Extension('perf', sources = ext_sources, include_dirs = ['util/include'], -- cgit v0.10.2 From c42de706dad3f39c1f65e473a1d165ea33f8b6e8 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Fri, 26 Feb 2016 22:14:25 +0900 Subject: perf trace: Check and discard not only 'nr' but also '__syscall_nr' Format fields of a syscall have the first variable '__syscall_nr' or 'nr' that mean the syscall number. But it isn't relevant here so drop it. 'nr' among fields of syscall was renamed '__syscall_nr'. So add exception handling to drop '__syscall_nr' and modify the comment for this excpetion handling. 
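The check reduces to skipping one leading element of the field list; a minimal standalone sketch of that logic (struct and function names invented for illustration, not the perf sources):

	#include <string.h>

	struct format_field {
		const char *name;
		struct format_field *next;
	};

	/* Skip the leading syscall-number field: "__syscall_nr" on newer
	 * kernels, plain "nr" on older ones.  It is not a real argument. */
	static struct format_field *skip_syscall_nr(struct format_field *args,
						    int *nr_args)
	{
		if (args && (!strcmp(args->name, "__syscall_nr") ||
			     !strcmp(args->name, "nr"))) {
			args = args->next;
			--(*nr_args);
		}
		return args;
	}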
Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Taeung Song Cc: Jiri Olsa Cc: Namhyung Kim Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1456492465-5946-1-git-send-email-treeze.taeung@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 26a337f..8dc98c5 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1725,8 +1725,12 @@ static int trace__read_syscall_info(struct trace *trace, int id) sc->args = sc->tp_format->format.fields; sc->nr_args = sc->tp_format->format.nr_fields; - /* drop nr field - not relevant here; does not exist on older kernels */ - if (sc->args && strcmp(sc->args->name, "nr") == 0) { + /* + * We need to check and discard the first variable '__syscall_nr' + * or 'nr' that mean the syscall number. It is needless here. + * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels. + */ + if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) { sc->args = sc->args->next; --sc->nr_args; } -- cgit v0.10.2 From 026842d148b920dc28f0499ede4950dcb098d4d5 Mon Sep 17 00:00:00 2001 From: Taeung Song Date: Fri, 26 Feb 2016 13:23:01 -0500 Subject: tracing/syscalls: Rename "/format" tracepoint field name "nr" to "__syscall_nr: Some tracepoint have multiple fields with the same name, "nr", the first one is a unique syscall ID, the other is a syscall argument: # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_io_getevents/format name: sys_enter_io_getevents ID: 747 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:int nr; offset:8; size:4; signed:1; field:aio_context_t ctx_id; offset:16; size:8; signed:0; field:long min_nr; offset:24; size:8; signed:0; field:long nr; offset:32; size:8; signed:0; field:struct io_event * events; offset:40; size:8; signed:0; field:struct timespec * timeout; offset:48; size:8; signed:0; print fmt: "ctx_id: 0x%08lx, min_nr: 0x%08lx, nr: 0x%08lx, events: 0x%08lx, timeout: 0x%08lx", ((unsigned long)(REC->ctx_id)), ((unsigned long)(REC->min_nr)), ((unsigned long)(REC->nr)), ((unsigned long)(REC->events)), ((unsigned long)(REC->timeout)) # Fix it by renaming the "/format" common tracepoint field "nr" to "__syscall_nr". Signed-off-by: Taeung Song [ Do not rename the struct member, just the '/format' field name ] Signed-off-by: Steven Rostedt Acked-by: Peter Zijlstra Cc: Jiri Olsa Cc: Lai Jiangshan Cc: Namhyung Kim Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160226132301.3ae065a4@gandalf.local.home Signed-off-by: Arnaldo Carvalho de Melo diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 0655afb..d166308 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -186,11 +186,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags, extern char *__bad_type_size(void); -#define SYSCALL_FIELD(type, name) \ - sizeof(type) != sizeof(trace.name) ? \ +#define SYSCALL_FIELD(type, field, name) \ + sizeof(type) != sizeof(trace.field) ? 
\ __bad_type_size() : \ - #type, #name, offsetof(typeof(trace), name), \ - sizeof(trace.name), is_signed_type(type) + #type, #name, offsetof(typeof(trace), field), \ + sizeof(trace.field), is_signed_type(type) static int __init __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) @@ -261,7 +261,8 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) int i; int offset = offsetof(typeof(trace), args); - ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr), + FILTER_OTHER); if (ret) return ret; @@ -281,11 +282,12 @@ static int __init syscall_exit_define_fields(struct trace_event_call *call) struct syscall_trace_exit trace; int ret; - ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr), + FILTER_OTHER); if (ret) return ret; - ret = trace_define_field(call, SYSCALL_FIELD(long, ret), + ret = trace_define_field(call, SYSCALL_FIELD(long, ret, ret), FILTER_OTHER); return ret; -- cgit v0.10.2 From a6745330789f25103e67011bcfeec908fcc3b341 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 29 Feb 2016 09:01:28 -0500 Subject: tools lib traceevent: Split pevent_print_event() into specific functionality functions Currently there's a single function that is used to display a record's data in human readable format. That's pevent_print_event(). Unfortunately, this gives little room for adding other output within the line without updating that function call. I've decided to split that function into 3 parts. pevent_print_event_task() which prints the task comm, pid and the CPU pevent_print_event_time() which outputs the record's timestamp pevent_print_event_data() which outputs the rest of the event data. pevent_print_event() now simply calls these three functions. To save time from doing the search for event from the record's type, I created a new helper function called pevent_find_event_by_record(), which returns the record's event, and this event has to be passed to the above functions. Signed-off-by: Steven Rostedt Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20160229090128.43a56704@gandalf.local.home Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 575e751..9a1e48a 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -5339,41 +5339,45 @@ static bool is_timestamp_in_us(char *trace_clock, bool use_trace_clock) return false; } -void pevent_print_event(struct pevent *pevent, struct trace_seq *s, - struct pevent_record *record, bool use_trace_clock) +/** + * pevent_find_event_by_record - return the event from a given record + * @pevent: a handle to the pevent + * @record: The record to get the event from + * + * Returns the associated event for a given record, or NULL if non is + * is found. 
+ */ +struct event_format * +pevent_find_event_by_record(struct pevent *pevent, struct pevent_record *record) { - static const char *spaces = " "; /* 20 spaces */ - struct event_format *event; - unsigned long secs; - unsigned long usecs; - unsigned long nsecs; - const char *comm; - void *data = record->data; int type; - int pid; - int len; - int p; - bool use_usec_format; - - use_usec_format = is_timestamp_in_us(pevent->trace_clock, - use_trace_clock); - if (use_usec_format) { - secs = record->ts / NSECS_PER_SEC; - nsecs = record->ts - secs * NSECS_PER_SEC; - } if (record->size < 0) { do_warning("ug! negative record size %d", record->size); - return; + return NULL; } - type = trace_parse_common_type(pevent, data); + type = trace_parse_common_type(pevent, record->data); - event = pevent_find_event(pevent, type); - if (!event) { - do_warning("ug! no event found for type %d", type); - return; - } + return pevent_find_event(pevent, type); +} + +/** + * pevent_print_event_task - Write the event task comm, pid and CPU + * @pevent: a handle to the pevent + * @s: the trace_seq to write to + * @event: the handle to the record's event + * @record: The record to get the event from + * + * Writes the tasks comm, pid and CPU to @s. + */ +void pevent_print_event_task(struct pevent *pevent, struct trace_seq *s, + struct event_format *event, + struct pevent_record *record) +{ + void *data = record->data; + const char *comm; + int pid; pid = parse_common_pid(pevent, data); comm = find_cmdline(pevent, pid); @@ -5381,9 +5385,43 @@ void pevent_print_event(struct pevent *pevent, struct trace_seq *s, if (pevent->latency_format) { trace_seq_printf(s, "%8.8s-%-5d %3d", comm, pid, record->cpu); - pevent_data_lat_fmt(pevent, s, record); } else trace_seq_printf(s, "%16s-%-5d [%03d]", comm, pid, record->cpu); +} + +/** + * pevent_print_event_time - Write the event timestamp + * @pevent: a handle to the pevent + * @s: the trace_seq to write to + * @event: the handle to the record's event + * @record: The record to get the event from + * @use_trace_clock: Set to parse according to the @pevent->trace_clock + * + * Writes the timestamp of the record into @s. + */ +void pevent_print_event_time(struct pevent *pevent, struct trace_seq *s, + struct event_format *event, + struct pevent_record *record, + bool use_trace_clock) +{ + unsigned long secs; + unsigned long usecs; + unsigned long nsecs; + int p; + bool use_usec_format; + + use_usec_format = is_timestamp_in_us(pevent->trace_clock, + use_trace_clock); + if (use_usec_format) { + secs = record->ts / NSECS_PER_SEC; + nsecs = record->ts - secs * NSECS_PER_SEC; + } + + if (pevent->latency_format) { + trace_seq_printf(s, " %3d", record->cpu); + pevent_data_lat_fmt(pevent, s, record); + } else + trace_seq_printf(s, " [%03d]", record->cpu); if (use_usec_format) { if (pevent->flags & PEVENT_NSEC_OUTPUT) { @@ -5394,11 +5432,28 @@ void pevent_print_event(struct pevent *pevent, struct trace_seq *s, p = 6; } - trace_seq_printf(s, " %5lu.%0*lu: %s: ", - secs, p, usecs, event->name); + trace_seq_printf(s, " %5lu.%0*lu:", secs, p, usecs); } else - trace_seq_printf(s, " %12llu: %s: ", - record->ts, event->name); + trace_seq_printf(s, " %12llu:", record->ts); +} + +/** + * pevent_print_event_data - Write the event data section + * @pevent: a handle to the pevent + * @s: the trace_seq to write to + * @event: the handle to the record's event + * @record: The record to get the event from + * + * Writes the parsing of the record's data to @s. 
+ */ +void pevent_print_event_data(struct pevent *pevent, struct trace_seq *s, + struct event_format *event, + struct pevent_record *record) +{ + static const char *spaces = " "; /* 20 spaces */ + int len; + + trace_seq_printf(s, " %s: ", event->name); /* Space out the event names evenly. */ len = strlen(event->name); @@ -5408,6 +5463,23 @@ void pevent_print_event(struct pevent *pevent, struct trace_seq *s, pevent_event_info(s, event, record); } +void pevent_print_event(struct pevent *pevent, struct trace_seq *s, + struct pevent_record *record, bool use_trace_clock) +{ + struct event_format *event; + + event = pevent_find_event_by_record(pevent, record); + if (!event) { + do_warning("ug! no event found for type %d", + trace_parse_common_type(pevent, record->data)); + return; + } + + pevent_print_event_task(pevent, s, event, record); + pevent_print_event_time(pevent, s, event, record, use_trace_clock); + pevent_print_event_data(pevent, s, event, record); +} + static int events_id_cmp(const void *a, const void *b) { struct event_format * const * ea = a; diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h index 706d9bc..9ffde37 100644 --- a/tools/lib/traceevent/event-parse.h +++ b/tools/lib/traceevent/event-parse.h @@ -628,6 +628,16 @@ int pevent_register_print_string(struct pevent *pevent, const char *fmt, unsigned long long addr); int pevent_pid_is_registered(struct pevent *pevent, int pid); +void pevent_print_event_task(struct pevent *pevent, struct trace_seq *s, + struct event_format *event, + struct pevent_record *record); +void pevent_print_event_time(struct pevent *pevent, struct trace_seq *s, + struct event_format *event, + struct pevent_record *record, + bool use_trace_clock); +void pevent_print_event_data(struct pevent *pevent, struct trace_seq *s, + struct event_format *event, + struct pevent_record *record); void pevent_print_event(struct pevent *pevent, struct trace_seq *s, struct pevent_record *record, bool use_trace_clock); @@ -694,6 +704,9 @@ struct event_format *pevent_find_event(struct pevent *pevent, int id); struct event_format * pevent_find_event_by_name(struct pevent *pevent, const char *sys, const char *name); +struct event_format * +pevent_find_event_by_record(struct pevent *pevent, struct pevent_record *record); + void pevent_data_lat_fmt(struct pevent *pevent, struct trace_seq *s, struct pevent_record *record); int pevent_data_type(struct pevent *pevent, struct pevent_record *rec); -- cgit v0.10.2 From 56fa81fc9a5445938f3aa2e63d15ab63dc938ad6 Mon Sep 17 00:00:00 2001 From: Govindraj Raja Date: Mon, 29 Feb 2016 11:41:20 +0000 Subject: MIPS: scache: Fix scache init with invalid line size. In current scache init cache line_size is determined from cpu config register, however if there there no scache then mips_sc_probe_cm3 function populates a invalid line_size of 2. The invalid line_size can cause a NULL pointer deference during r4k_dma_cache_inv as r4k_blast_scache is populated based on line_size. Scache line_size of 2 is invalid option in r4k_blast_scache_setup. This issue was faced during a MIPS I6400 based virtual platform bring up where scache was not available in virtual platform model. 
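The failure mode is easiest to see as a lookup keyed by cache line size; a hedged sketch (invented names, not the actual MIPS cache code):

	#include <stddef.h>

	typedef void (*blast_scache_fn)(void);

	static void blast_scache16(void) { /* flush using 16-byte lines */ }
	static void blast_scache32(void) { /* flush using 32-byte lines */ }

	/* A bogus line size such as 2 (reported when no L2 exists) matches
	 * no case, so the caller ends up holding a NULL function pointer. */
	static blast_scache_fn pick_blast_scache(unsigned int linesz)
	{
		switch (linesz) {
		case 16: return blast_scache16;
		case 32: return blast_scache32;
		default: return NULL;
		}
	}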
Signed-off-by: Govindraj Raja Fixes: 7d53e9c4cd21("MIPS: CM3: Add support for CM3 L2 cache.") Cc: Paul Burton Cc: James Hogan Cc: Ralf Baechle Cc: James Hartley Cc: linux-mips@linux-mips.org Cc: stable@vger.kernel.org # v4.2+ Patchwork: https://patchwork.linux-mips.org/patch/12710/ Signed-off-by: Ralf Baechle diff --git a/arch/mips/mm/sc-mips.c b/arch/mips/mm/sc-mips.c index 2496475..91dec32 100644 --- a/arch/mips/mm/sc-mips.c +++ b/arch/mips/mm/sc-mips.c @@ -164,11 +164,13 @@ static int __init mips_sc_probe_cm3(void) sets = cfg & CM_GCR_L2_CONFIG_SET_SIZE_MSK; sets >>= CM_GCR_L2_CONFIG_SET_SIZE_SHF; - c->scache.sets = 64 << sets; + if (sets) + c->scache.sets = 64 << sets; line_sz = cfg & CM_GCR_L2_CONFIG_LINE_SIZE_MSK; line_sz >>= CM_GCR_L2_CONFIG_LINE_SIZE_SHF; - c->scache.linesz = 2 << line_sz; + if (line_sz) + c->scache.linesz = 2 << line_sz; assoc = cfg & CM_GCR_L2_CONFIG_ASSOC_MSK; assoc >>= CM_GCR_L2_CONFIG_ASSOC_SHF; @@ -176,9 +178,12 @@ static int __init mips_sc_probe_cm3(void) c->scache.waysize = c->scache.sets * c->scache.linesz; c->scache.waybit = __ffs(c->scache.waysize); - c->scache.flags &= ~MIPS_CACHE_NOT_PRESENT; + if (c->scache.linesz) { + c->scache.flags &= ~MIPS_CACHE_NOT_PRESENT; + return 1; + } - return 1; + return 0; } static inline int __init mips_sc_probe(void) -- cgit v0.10.2 From 887349f69f37e71e2a8bfbd743831625a0b2ff51 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 28 Feb 2016 17:35:59 +0200 Subject: MIPS: kvm: Fix ioctl error handling. Calling return copy_to_user(...) or return copy_from_user in an ioctl will not do the right thing if there's a pagefault: copy_to_user/copy_from_user return the number of bytes not copied in this case. Fix up kvm on mips to do return copy_to_user(...)) ? -EFAULT : 0; and return copy_from_user(...)) ? -EFAULT : 0; everywhere. Signed-off-by: Michael S. Tsirkin Cc: Paolo Bonzini Cc: James Hogan Cc: linux-kernel@vger.kernel.org Cc: linux-mips@linux-mips.org Cc: stable@vger.kernel.org Cc: kvm@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/12709/ Signed-off-by: Ralf Baechle diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 8bc3977..3110447 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -702,7 +702,7 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu, } else if ((reg->id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U128) { void __user *uaddr = (void __user *)(long)reg->addr; - return copy_to_user(uaddr, vs, 16); + return copy_to_user(uaddr, vs, 16) ? -EFAULT : 0; } else { return -EINVAL; } @@ -732,7 +732,7 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu, } else if ((reg->id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U128) { void __user *uaddr = (void __user *)(long)reg->addr; - return copy_from_user(vs, uaddr, 16); + return copy_from_user(vs, uaddr, 16) ? -EFAULT : 0; } else { return -EINVAL; } -- cgit v0.10.2 From b6809ee573cc2f8de97f7c8f45eacc5db1129060 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 26 Feb 2016 16:48:59 +0100 Subject: iommu/amd: Detach device from domain before removal Detach the device that is about to be removed from its domain (if it has one) to clear any related state like DTE entry and device's ATS state. 
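The ordering constraint, reduced to a sketch (pseudo-driver code, not the amd_iommu implementation):

	struct dev_data {
		void *domain;		/* non-NULL while attached */
	};

	static void detach_device(struct dev_data *dd)
	{
		/* clears the DTE entry, disables ATS, drops the domain */
		dd->domain = NULL;
	}

	static void uninit_device(struct dev_data *dd)
	{
		if (!dd)
			return;
		if (dd->domain)		/* detach before anything is freed */
			detach_device(dd);
		/* ...only now unlink and free the per-device data... */
	}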
Reported-by: Kelly Zytaruk Signed-off-by: Joerg Roedel diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index e5e2239..374c129 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -114,6 +114,7 @@ struct kmem_cache *amd_iommu_irq_cache; static void update_domain(struct protection_domain *domain); static int protection_domain_init(struct protection_domain *domain); +static void detach_device(struct device *dev); /* * For dynamic growth the aperture size is split into ranges of 128MB of @@ -384,6 +385,9 @@ static void iommu_uninit_device(struct device *dev) if (!dev_data) return; + if (dev_data->domain) + detach_device(dev); + iommu_device_unlink(amd_iommu_rlookup_table[dev_data->devid]->iommu_dev, dev); -- cgit v0.10.2 From 537e48136295c5860a92138c5ea3959b9542868b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 14:25:16 +0100 Subject: ALSA: hdspm: Fix wrong boolean ctl value accesses snd-hdspm driver accesses enum item values (int) instead of boolean values (long) wrongly for some ctl elements. This patch fixes them. Cc: Signed-off-by: Takashi Iwai diff --git a/sound/pci/rme9652/hdspm.c b/sound/pci/rme9652/hdspm.c index 8bc8016..b047b1b 100644 --- a/sound/pci/rme9652/hdspm.c +++ b/sound/pci/rme9652/hdspm.c @@ -2261,7 +2261,7 @@ static int snd_hdspm_put_system_sample_rate(struct snd_kcontrol *kcontrol, { struct hdspm *hdspm = snd_kcontrol_chip(kcontrol); - hdspm_set_dds_value(hdspm, ucontrol->value.enumerated.item[0]); + hdspm_set_dds_value(hdspm, ucontrol->value.integer.value[0]); return 0; } @@ -4449,7 +4449,7 @@ static int snd_hdspm_get_tco_word_term(struct snd_kcontrol *kcontrol, { struct hdspm *hdspm = snd_kcontrol_chip(kcontrol); - ucontrol->value.enumerated.item[0] = hdspm->tco->term; + ucontrol->value.integer.value[0] = hdspm->tco->term; return 0; } @@ -4460,8 +4460,8 @@ static int snd_hdspm_put_tco_word_term(struct snd_kcontrol *kcontrol, { struct hdspm *hdspm = snd_kcontrol_chip(kcontrol); - if (hdspm->tco->term != ucontrol->value.enumerated.item[0]) { - hdspm->tco->term = ucontrol->value.enumerated.item[0]; + if (hdspm->tco->term != ucontrol->value.integer.value[0]) { + hdspm->tco->term = ucontrol->value.integer.value[0]; hdspm_tco_write(hdspm); -- cgit v0.10.2 From c1099c3294c2344110085a38c50e478a5992b368 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 14:32:42 +0100 Subject: ALSA: hdspm: Fix zero-division HDSPM driver contains a code issuing zero-division potentially in system sample rate ctl code. This patch fixes it by not processing a zero or invalid rate value as a divisor, as well as excluding the invalid value to be passed via the given ctl element. 
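The guard boils down to range-checking the user-supplied value before it can become a divisor; a minimal sketch (bounds taken from the patch below, function name invented):

	#include <errno.h>

	/* Reject rates the hardware cannot produce; anything accepted here
	 * is guaranteed non-zero, so later divisions by the rate are safe. */
	static int validate_system_sample_rate(int rate)
	{
		if (rate < 27000 || rate > 207000)
			return -EINVAL;
		return 0;
	}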
Cc: Signed-off-by: Takashi Iwai diff --git a/sound/pci/rme9652/hdspm.c b/sound/pci/rme9652/hdspm.c index b047b1b..a4a999a 100644 --- a/sound/pci/rme9652/hdspm.c +++ b/sound/pci/rme9652/hdspm.c @@ -1601,6 +1601,9 @@ static void hdspm_set_dds_value(struct hdspm *hdspm, int rate) { u64 n; + if (snd_BUG_ON(rate <= 0)) + return; + if (rate >= 112000) rate /= 4; else if (rate >= 56000) @@ -2215,6 +2218,8 @@ static int hdspm_get_system_sample_rate(struct hdspm *hdspm) } else { /* slave mode, return external sample rate */ rate = hdspm_external_sample_rate(hdspm); + if (!rate) + rate = hdspm->system_sample_rate; } } @@ -2260,7 +2265,10 @@ static int snd_hdspm_put_system_sample_rate(struct snd_kcontrol *kcontrol, ucontrol) { struct hdspm *hdspm = snd_kcontrol_chip(kcontrol); + int rate = ucontrol->value.integer.value[0]; + if (rate < 27000 || rate > 207000) + return -EINVAL; hdspm_set_dds_value(hdspm, ucontrol->value.integer.value[0]); return 0; } -- cgit v0.10.2 From eab3c4db193f5fcccf70e884de9a922ca2c63d80 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 14:26:43 +0100 Subject: ALSA: hdsp: Fix wrong boolean ctl value accesses snd-hdsp driver accesses enum item values (int) instead of boolean values (long) wrongly for some ctl elements. This patch fixes them. Cc: Signed-off-by: Takashi Iwai diff --git a/sound/pci/rme9652/hdsp.c b/sound/pci/rme9652/hdsp.c index 2875b4f..7c8941b 100644 --- a/sound/pci/rme9652/hdsp.c +++ b/sound/pci/rme9652/hdsp.c @@ -2879,7 +2879,7 @@ static int snd_hdsp_get_dds_offset(struct snd_kcontrol *kcontrol, struct snd_ctl { struct hdsp *hdsp = snd_kcontrol_chip(kcontrol); - ucontrol->value.enumerated.item[0] = hdsp_dds_offset(hdsp); + ucontrol->value.integer.value[0] = hdsp_dds_offset(hdsp); return 0; } @@ -2891,7 +2891,7 @@ static int snd_hdsp_put_dds_offset(struct snd_kcontrol *kcontrol, struct snd_ctl if (!snd_hdsp_use_is_exclusive(hdsp)) return -EBUSY; - val = ucontrol->value.enumerated.item[0]; + val = ucontrol->value.integer.value[0]; spin_lock_irq(&hdsp->lock); if (val != hdsp_dds_offset(hdsp)) change = (hdsp_set_dds_offset(hdsp, val) == 0) ? 1 : 0; -- cgit v0.10.2 From a528aca7f359f4b0b1d72ae406097e491a5ba9ea Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 29 Feb 2016 12:12:46 -0500 Subject: use ->d_seq to get coherency between ->d_inode and ->d_flags Games with ordering and barriers are way too brittle. Just bump ->d_seq before and after updating ->d_inode and ->d_flags type bits, so that verifying ->d_seq would guarantee they are coherent. Cc: stable@vger.kernel.org # v3.13+ Signed-off-by: Al Viro diff --git a/fs/dcache.c b/fs/dcache.c index 92d5140..2398f9f9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -269,9 +269,6 @@ static inline int dname_external(const struct dentry *dentry) return dentry->d_name.name != dentry->d_iname; } -/* - * Make sure other CPUs see the inode attached before the type is set. - */ static inline void __d_set_inode_and_type(struct dentry *dentry, struct inode *inode, unsigned type_flags) @@ -279,28 +276,18 @@ static inline void __d_set_inode_and_type(struct dentry *dentry, unsigned flags; dentry->d_inode = inode; - smp_wmb(); flags = READ_ONCE(dentry->d_flags); flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); flags |= type_flags; WRITE_ONCE(dentry->d_flags, flags); } -/* - * Ideally, we want to make sure that other CPUs see the flags cleared before - * the inode is detached, but this is really a violation of RCU principles - * since the ordering suggests we should always set inode before flags. 
- * - * We should instead replace or discard the entire dentry - but that sucks - * performancewise on mass deletion/rename. - */ static inline void __d_clear_type_and_inode(struct dentry *dentry) { unsigned flags = READ_ONCE(dentry->d_flags); flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); WRITE_ONCE(dentry->d_flags, flags); - smp_wmb(); dentry->d_inode = NULL; } @@ -370,9 +357,11 @@ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; + + raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); - dentry_rcuwalk_invalidate(dentry); + raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) @@ -1758,8 +1747,9 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) spin_lock(&dentry->d_lock); if (inode) hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); + raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); - dentry_rcuwalk_invalidate(dentry); + raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); fsnotify_d_instantiate(dentry, inode); } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 7781ce11..c4b5f4b 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -409,9 +409,7 @@ static inline bool d_mountpoint(const struct dentry *dentry) */ static inline unsigned __d_entry_type(const struct dentry *dentry) { - unsigned type = READ_ONCE(dentry->d_flags); - smp_rmb(); - return type & DCACHE_ENTRY_TYPE; + return dentry->d_flags & DCACHE_ENTRY_TYPE; } static inline bool d_is_miss(const struct dentry *dentry) -- cgit v0.10.2 From 013dd239d6220a4e0dfdf0d45a82c34f1fd73deb Mon Sep 17 00:00:00 2001 From: Patrik Halfar Date: Sat, 20 Feb 2016 18:49:56 +0100 Subject: USB: qcserial: add Dell Wireless 5809e Gobi 4G HSPA+ (rev3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New revision of Dell Wireless 5809e Gobi 4G HSPA+ Mobile Broadband Card has new idProduct. Bus 002 Device 006: ID 413c:81b3 Dell Computer Corp. Device Descriptor: bLength 18 bDescriptorType 1 bcdUSB 2.00 bDeviceClass 0 bDeviceSubClass 0 bDeviceProtocol 0 bMaxPacketSize0 64 idVendor 0x413c Dell Computer Corp. 
idProduct 0x81b3 bcdDevice 0.06 iManufacturer 1 Sierra Wireless, Incorporated iProduct 2 Dell Wireless 5809e Gobi™ 4G HSPA+ Mobile Broadband Card iSerial 3 bNumConfigurations 2 Signed-off-by: Patrik Halfar Cc: stable Signed-off-by: Johan Hovold diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c index 9919d2a..9c09d5f 100644 --- a/drivers/usb/serial/qcserial.c +++ b/drivers/usb/serial/qcserial.c @@ -165,6 +165,7 @@ static const struct usb_device_id id_table[] = { {DEVICE_SWI(0x413c, 0x81a8)}, /* Dell Wireless 5808 Gobi(TM) 4G LTE Mobile Broadband Card */ {DEVICE_SWI(0x413c, 0x81a9)}, /* Dell Wireless 5808e Gobi(TM) 4G LTE Mobile Broadband Card */ {DEVICE_SWI(0x413c, 0x81b1)}, /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card */ + {DEVICE_SWI(0x413c, 0x81b3)}, /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card (rev3) */ /* Huawei devices */ {DEVICE_HWI(0x03f0, 0x581d)}, /* HP lt4112 LTE/HSPA+ Gobi 4G Modem (Huawei me906e) */ -- cgit v0.10.2 From 3c4c615d70c8cbdc8ba8c79ed702640930652a79 Mon Sep 17 00:00:00 2001 From: Vittorio Alfieri Date: Sun, 28 Feb 2016 14:40:24 +0100 Subject: USB: cp210x: Add ID for Parrot NMEA GPS Flight Recorder The Parrot NMEA GPS Flight Recorder is a USB composite device consisting of hub, flash storage, and cp210x usb to serial chip. It is an accessory to the mass-produced Parrot AR Drone 2. The device emits standard NMEA messages which make the it compatible with NMEA compatible software. It was tested using gpsd version 3.11-3 as an NMEA interpreter and using the official Parrot Flight Recorder. Signed-off-by: Vittorio Alfieri Cc: stable Signed-off-by: Johan Hovold diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index 7c319e7..73a366d 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -165,6 +165,7 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(0x18EF, 0xE025) }, /* ELV Marble Sound Board 1 */ { USB_DEVICE(0x1901, 0x0190) }, /* GE B850 CP2105 Recorder interface */ { USB_DEVICE(0x1901, 0x0193) }, /* GE B650 CP2104 PMC interface */ + { USB_DEVICE(0x19CF, 0x3000) }, /* Parrot NMEA GPS Flight Recorder */ { USB_DEVICE(0x1ADB, 0x0001) }, /* Schweitzer Engineering C662 Cable */ { USB_DEVICE(0x1B1C, 0x1C00) }, /* Corsair USB Dongle */ { USB_DEVICE(0x1BA4, 0x0002) }, /* Silicon Labs 358x factory default */ -- cgit v0.10.2 From d243bed32f5042582896237f88fa1798aee55ff9 Mon Sep 17 00:00:00 2001 From: Tirumalesh Chalamarla Date: Tue, 16 Feb 2016 12:08:49 -0800 Subject: ahci: Workaround for ThunderX Errata#22536 Due to Errata in ThunderX, HOST_IRQ_STAT should be cleared before leaving the interrupt handler. The patch attempts to satisfy the need. Changes from V2: - removed newfile - code is now under CONFIG_ARM64 Changes from V1: - Rebased on top of libata/for-4.6 - Moved ThunderX intr handler to new file tj: Minor adjustments to comments. Signed-off-by: Tirumalesh Chalamarla Signed-off-by: Tejun Heo diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index b6263b3..146dc0b 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1331,6 +1331,44 @@ static inline void ahci_gtf_filter_workaround(struct ata_host *host) {} #endif +#ifdef CONFIG_ARM64 +/* + * Due to ERRATA#22536, ThunderX needs to handle HOST_IRQ_STAT differently. + * Workaround is to make sure all pending IRQs are served before leaving + * handler. 
+ */ +static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance) +{ + struct ata_host *host = dev_instance; + struct ahci_host_priv *hpriv; + unsigned int rc = 0; + void __iomem *mmio; + u32 irq_stat, irq_masked; + unsigned int handled = 1; + + VPRINTK("ENTER\n"); + hpriv = host->private_data; + mmio = hpriv->mmio; + irq_stat = readl(mmio + HOST_IRQ_STAT); + if (!irq_stat) + return IRQ_NONE; + + do { + irq_masked = irq_stat & hpriv->port_map; + spin_lock(&host->lock); + rc = ahci_handle_port_intr(host, irq_masked); + if (!rc) + handled = 0; + writel(irq_stat, mmio + HOST_IRQ_STAT); + irq_stat = readl(mmio + HOST_IRQ_STAT); + spin_unlock(&host->lock); + } while (irq_stat); + VPRINTK("EXIT\n"); + + return IRQ_RETVAL(handled); +} +#endif + /* * ahci_init_msix() - optionally enable per-port MSI-X otherwise defer * to single msi. @@ -1566,6 +1604,11 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (ahci_broken_devslp(pdev)) hpriv->flags |= AHCI_HFLAG_NO_DEVSLP; +#ifdef CONFIG_ARM64 + if (pdev->vendor == 0x177d && pdev->device == 0xa01c) + hpriv->irq_handler = ahci_thunderx_irq_handler; +#endif + /* save initial config */ ahci_pci_save_initial_config(pdev, hpriv); -- cgit v0.10.2 From dc8b4afc4a04fac8ee55a19b59f2356a25e7e778 Mon Sep 17 00:00:00 2001 From: Manuel Lauss Date: Sat, 27 Feb 2016 16:10:05 +0100 Subject: ata: ahci: don't mark HotPlugCapable Ports as external/removable The HPCP bit is set by bioses for on-board sata ports either because they think sata is hotplug capable in general or to allow Windows to display a "device eject" icon on ports which are routed to an external connector bracket. However in Redhat Bugzilla #1310682, users report that with kernel 4.4, where this bit test first appeared, a lot of partitions on sata drives are now mounted automatically. This patch should fix redhat and a lot of other distros which unconditionally automount all devices which have the "removable" bit set. Signed-off-by: Manuel Lauss Signed-off-by: Tejun Heo Fixes: 8a3e33cf92c7 ("ata: ahci: find eSATA ports and flag them as removable" changes userspace behavior) Link: http://lkml.kernel.org/g/56CF35FA.1070500@redhat.com Cc: stable@vger.kernel.org #v4.4+ diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index 513b3fa..85ea514 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -1168,8 +1168,7 @@ static void ahci_port_init(struct device *dev, struct ata_port *ap, /* mark esata ports */ tmp = readl(port_mmio + PORT_CMD); - if ((tmp & PORT_CMD_HPCP) || - ((tmp & PORT_CMD_ESP) && (hpriv->cap & HOST_CAP_SXS))) + if ((tmp & PORT_CMD_ESP) && (hpriv->cap & HOST_CAP_SXS)) ap->pflags |= ATA_PFLAG_EXTERNAL; } -- cgit v0.10.2 From a6522c08987daa6f9ac25a9c08870041a43db6b0 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 25 Feb 2016 13:02:50 -0500 Subject: 3c59x: mask LAST_FRAG bit from length field in ring Recently, I fixed a bug in 3c59x: commit 6e144419e4da11a9a4977c8d899d7247d94ca338 Author: Neil Horman Date: Wed Jan 13 12:43:54 2016 -0500 3c59x: fix another page map/single unmap imbalance Which correctly rebalanced dma mapping and unmapping types. Unfortunately it introduced a new bug which causes oopses on older systems. When mapping dma regions, the last entry for a packet in the 3c59x tx ring encodes a LAST_FRAG bit, which is encoded as the high order bit of the buffers length field. When it is unmapped the LAST_FRAG bit is cleared prior to being passed to the unmap function. 
Unfortunately the commit above fails to do that masking. It was missed in testing because the system on which I tested it had an intel iommu, the driver for which ignores the size field, using only the DMA address as the token to identify the mapping to be released. However, on older systems that rely on swiotlb (or other dma drivers that key off that length field), not masking off that LAST_FRAG high order bit results in parsing a huge size to be release, leading to all sorts of odd corruptions and the like. Fix is easy, just mask the length with 0xFFF. It should really be &(LAST_FRAG-1), but 0xFFF is the style of the file, and I'd like to make this fix minimal and correct before making it prettier. Appies to the net tree cleanly. All testing on both iommu and swiommu based systems produce good results Signed-off-by: Neil Horman CC: Steffen Klassert CC: "David S. Miller" Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c index 79e1a02..17b2126 100644 --- a/drivers/net/ethernet/3com/3c59x.c +++ b/drivers/net/ethernet/3com/3c59x.c @@ -2461,7 +2461,7 @@ boomerang_interrupt(int irq, void *dev_id) int i; pci_unmap_single(VORTEX_PCI(vp), le32_to_cpu(vp->tx_ring[entry].frag[0].addr), - le32_to_cpu(vp->tx_ring[entry].frag[0].length), + le32_to_cpu(vp->tx_ring[entry].frag[0].length)&0xFFF, PCI_DMA_TODEVICE); for (i=1; i<=skb_shinfo(skb)->nr_frags; i++) -- cgit v0.10.2 From e6a8c9b337eed56eb481e1b4dd2180c25a1e5310 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 29 Feb 2016 23:49:47 +0100 Subject: iommu/vt-d: Use BUS_NOTIFY_REMOVED_DEVICE in hotplug path In the PCI hotplug path of the Intel IOMMU driver, replace the usage of the BUS_NOTIFY_DEL_DEVICE notifier, which is executed before the driver is unbound from the device, with BUS_NOTIFY_REMOVED_DEVICE, which runs after that. This fixes a kernel BUG being triggered in the VT-d code when the device driver tries to unmap DMA buffers and the VT-d driver already destroyed all mappings. 
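For reference, the notifier distinction looks roughly like this (a sketch; the callback body is illustrative, only the action constants and callback signature come from the device core):

	#include <linux/device.h>
	#include <linux/notifier.h>

	static int my_bus_notifier(struct notifier_block *nb,
				   unsigned long action, void *data)
	{
		struct device *dev = data;

		switch (action) {
		case BUS_NOTIFY_ADD_DEVICE:
			dev_info(dev, "added, driver not bound yet\n");
			break;
		case BUS_NOTIFY_REMOVED_DEVICE:
			dev_info(dev, "removed, driver already unbound\n");
			break;
		}
		return NOTIFY_DONE;
	}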
Reported-by: Stefani Seibold Cc: stable@vger.kernel.org # v4.3+ Signed-off-by: Joerg Roedel diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index fb092f3..8ffd756 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -329,7 +329,8 @@ static int dmar_pci_bus_notifier(struct notifier_block *nb, /* Only care about add/remove events for physical functions */ if (pdev->is_virtfn) return NOTIFY_DONE; - if (action != BUS_NOTIFY_ADD_DEVICE && action != BUS_NOTIFY_DEL_DEVICE) + if (action != BUS_NOTIFY_ADD_DEVICE && + action != BUS_NOTIFY_REMOVED_DEVICE) return NOTIFY_DONE; info = dmar_alloc_pci_notify_info(pdev, action); @@ -339,7 +340,7 @@ static int dmar_pci_bus_notifier(struct notifier_block *nb, down_write(&dmar_global_lock); if (action == BUS_NOTIFY_ADD_DEVICE) dmar_pci_bus_add_dev(info); - else if (action == BUS_NOTIFY_DEL_DEVICE) + else if (action == BUS_NOTIFY_REMOVED_DEVICE) dmar_pci_bus_del_dev(info); up_write(&dmar_global_lock); diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 986a53e..a2e1b7f 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4367,7 +4367,7 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) rmrru->devices_cnt); if(ret < 0) return ret; - } else if (info->event == BUS_NOTIFY_DEL_DEVICE) { + } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { dmar_remove_dev_scope(info, rmrr->segment, rmrru->devices, rmrru->devices_cnt); } @@ -4387,7 +4387,7 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) break; else if(ret < 0) return ret; - } else if (info->event == BUS_NOTIFY_DEL_DEVICE) { + } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) { if (dmar_remove_dev_scope(info, atsr->segment, atsru->devices, atsru->devices_cnt)) break; -- cgit v0.10.2 From 79e3f4a853ed161cd4c06d84b50beebf961a47c6 Mon Sep 17 00:00:00 2001 From: Murali Karicheri Date: Mon, 29 Feb 2016 17:18:22 -0600 Subject: PCI: keystone: Fix MSI code that retrieves struct pcie_port pointer Commit cbce7900598c ("PCI: designware: Make driver arch-agnostic") changed the host bridge sysdata pointer from the ARM pci_sys_data to the DesignWare pcie_port structure, and changed pcie-designware.c to reflect that. But it did not change the corresponding code in pci-keystone-dw.c, so it caused crashes on Keystone: Unable to handle kernel NULL pointer dereference at virtual address 00000030 pgd = c0003000 [00000030] *pgd=80000800004003, *pmd=00000000 Internal error: Oops: 206 [#1] PREEMPT SMP ARM CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.4.2-00139-gb74f926 #2 Hardware name: Keystone PC is at ks_dw_pcie_msi_irq_unmask+0x24/0x58 Change pci-keystone-dw.c to expect sysdata to be the struct pcie_port pointer. 
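In sketch form, the change is about what the opaque sysdata pointer now carries (illustrative only; the container_of step mirrors the driver's to_keystone_pcie()):

	#include <linux/kernel.h>

	struct pcie_port { void __iomem *dbi; /* ... */ };
	struct keystone_pcie { struct pcie_port pp; /* ... */ };

	#define to_keystone_pcie(port) \
		container_of(port, struct keystone_pcie, pp)

	/* Since cbce7900598c, the host bridge sysdata *is* the pcie_port,
	 * so no pci_sys_data unwrapping is needed (or valid) any more. */
	static struct keystone_pcie *ks_pcie_from_sysdata(void *sysdata)
	{
		struct pcie_port *pp = sysdata;

		return to_keystone_pcie(pp);
	}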
[bhelgaas: changelog] Fixes: cbce7900598c ("PCI: designware: Make driver arch-agnostic") Signed-off-by: Murali Karicheri Signed-off-by: Bjorn Helgaas CC: stable@vger.kernel.org # v4.4+ CC: Zhou Wang diff --git a/drivers/pci/host/pci-keystone-dw.c b/drivers/pci/host/pci-keystone-dw.c index ed34c95..6153853 100644 --- a/drivers/pci/host/pci-keystone-dw.c +++ b/drivers/pci/host/pci-keystone-dw.c @@ -58,11 +58,6 @@ #define to_keystone_pcie(x) container_of(x, struct keystone_pcie, pp) -static inline struct pcie_port *sys_to_pcie(struct pci_sys_data *sys) -{ - return sys->private_data; -} - static inline void update_reg_offset_bit_pos(u32 offset, u32 *reg_offset, u32 *bit_pos) { @@ -108,7 +103,7 @@ static void ks_dw_pcie_msi_irq_ack(struct irq_data *d) struct pcie_port *pp; msi = irq_data_get_msi_desc(d); - pp = sys_to_pcie(msi_desc_to_pci_sysdata(msi)); + pp = (struct pcie_port *) msi_desc_to_pci_sysdata(msi); ks_pcie = to_keystone_pcie(pp); offset = d->irq - irq_linear_revmap(pp->irq_domain, 0); update_reg_offset_bit_pos(offset, ®_offset, &bit_pos); @@ -146,7 +141,7 @@ static void ks_dw_pcie_msi_irq_mask(struct irq_data *d) u32 offset; msi = irq_data_get_msi_desc(d); - pp = sys_to_pcie(msi_desc_to_pci_sysdata(msi)); + pp = (struct pcie_port *) msi_desc_to_pci_sysdata(msi); ks_pcie = to_keystone_pcie(pp); offset = d->irq - irq_linear_revmap(pp->irq_domain, 0); @@ -167,7 +162,7 @@ static void ks_dw_pcie_msi_irq_unmask(struct irq_data *d) u32 offset; msi = irq_data_get_msi_desc(d); - pp = sys_to_pcie(msi_desc_to_pci_sysdata(msi)); + pp = (struct pcie_port *) msi_desc_to_pci_sysdata(msi); ks_pcie = to_keystone_pcie(pp); offset = d->irq - irq_linear_revmap(pp->irq_domain, 0); -- cgit v0.10.2 From 1195c103f6c98d9ff381cac3a8760d4f8a133627 Mon Sep 17 00:00:00 2001 From: Minghuan Lian Date: Mon, 29 Feb 2016 17:24:15 -0600 Subject: PCI: layerscape: Fix MSG TLP drop setting Some kinds of Layerscape PCIe controllers will forward the received message TLPs to system application address space, which could corrupt system memory or lead to a system hang. Enable MSG_DROP to fix this issue. 
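The mechanics are a simple read-modify-write of a DesignWare control register, sketched below with an assumed register offset and a made-up bit name (the actual patch, shown next, clears bit 29 of PCIE_STRFMR1 with the literal mask 0xDFFFFFFF):

#include <linux/bitops.h>
#include <linux/io.h>

#define EXAMPLE_PCIE_STRFMR1	0x71c	/* assumed DBI offset, for illustration only */
#define EXAMPLE_MSG_FWD_EN	BIT(29)	/* clearing this bit == masking with 0xDFFFFFFF */

/* Drop incoming message TLPs (except vendor messages) instead of forwarding them. */
static void example_drop_msg_tlp(void __iomem *dbi)
{
	u32 val;

	val = ioread32(dbi + EXAMPLE_PCIE_STRFMR1);
	val &= ~EXAMPLE_MSG_FWD_EN;
	iowrite32(val, dbi + EXAMPLE_PCIE_STRFMR1);
}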
Signed-off-by: Minghuan Lian Signed-off-by: Bjorn Helgaas diff --git a/drivers/pci/host/pci-layerscape.c b/drivers/pci/host/pci-layerscape.c index 3923bed..f39961b 100644 --- a/drivers/pci/host/pci-layerscape.c +++ b/drivers/pci/host/pci-layerscape.c @@ -77,6 +77,16 @@ static void ls_pcie_fix_class(struct ls_pcie *pcie) iowrite16(PCI_CLASS_BRIDGE_PCI, pcie->dbi + PCI_CLASS_DEVICE); } +/* Drop MSG TLP except for Vendor MSG */ +static void ls_pcie_drop_msg_tlp(struct ls_pcie *pcie) +{ + u32 val; + + val = ioread32(pcie->dbi + PCIE_STRFMR1); + val &= 0xDFFFFFFF; + iowrite32(val, pcie->dbi + PCIE_STRFMR1); +} + static int ls1021_pcie_link_up(struct pcie_port *pp) { u32 state; @@ -97,7 +107,7 @@ static int ls1021_pcie_link_up(struct pcie_port *pp) static void ls1021_pcie_host_init(struct pcie_port *pp) { struct ls_pcie *pcie = to_ls_pcie(pp); - u32 val, index[2]; + u32 index[2]; pcie->scfg = syscon_regmap_lookup_by_phandle(pp->dev->of_node, "fsl,pcie-scfg"); @@ -116,13 +126,7 @@ static void ls1021_pcie_host_init(struct pcie_port *pp) dw_pcie_setup_rc(pp); - /* - * LS1021A Workaround for internal TKT228622 - * to fix the INTx hang issue - */ - val = ioread32(pcie->dbi + PCIE_STRFMR1); - val &= 0xffff; - iowrite32(val, pcie->dbi + PCIE_STRFMR1); + ls_pcie_drop_msg_tlp(pcie); } static int ls_pcie_link_up(struct pcie_port *pp) @@ -147,6 +151,7 @@ static void ls_pcie_host_init(struct pcie_port *pp) iowrite32(1, pcie->dbi + PCIE_DBI_RO_WR_EN); ls_pcie_fix_class(pcie); ls_pcie_clear_multifunction(pcie); + ls_pcie_drop_msg_tlp(pcie); iowrite32(0, pcie->dbi + PCIE_DBI_RO_WR_EN); } -- cgit v0.10.2 From 5adebafb75bdfbbe4ec69f14c3613e70f6ed7f6f Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 21 Feb 2016 18:12:26 +0200 Subject: IB/core: Fix missed clean call in registration path In case of failure returned from query function in IB device registration, we need to clean IB cache which was missed. This change fixes it. Fixes: 3e153a93a1c1 ('IB/core: Save the device attributes on the device structure') Signed-off-by: Leon Romanovsky Reviewed-by: Ira Weiny Reviewed-by: Sagi Grimberg Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 00da80e..94b80a5 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -358,6 +358,7 @@ int ib_register_device(struct ib_device *device, ret = device->query_device(device, &device->attrs, &uhw); if (ret) { printk(KERN_WARNING "Couldn't query the device attributes\n"); + ib_cache_cleanup_one(device); goto out; } -- cgit v0.10.2 From ddd30ef4741c30945e43a2956bfbc3afe74a73b5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 29 Feb 2016 17:04:20 +0100 Subject: cpufreq: qoriq: allow building as module with THERMAL=m My previous patch to avoid link errors with the qoriq cpufreq driver disallowed all of the broken cases, but also prevented the driver from being built when CONFIG_THERMAL is a module. This changes the dependency to allow the cpufreq driver to also be a module in this case, just not built-in. Signed-off-by: Arnd Bergmann Fixes: 8ae1702a0df5 (cpufreq: qoriq: Register cooling device based on device tree) Acked-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 659879a..f935110 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -296,6 +296,7 @@ endif config QORIQ_CPUFREQ tristate "CPU frequency scaling driver for Freescale QorIQ SoCs" depends on OF && COMMON_CLK && (PPC_E500MC || ARM) + depends on !CPU_THERMAL || THERMAL select CLK_QORIQ help This adds the CPUFreq driver support for Freescale QorIQ SoCs -- cgit v0.10.2 From 3c2002aec3ee3a9fc86ff8f8618f1253c94ec919 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 29 Feb 2016 17:04:21 +0100 Subject: cpufreq: mediatek: allow building as a module The MT8173 cpufreq driver can currently only be built-in, but it has a Kconfig dependency on the thermal core. THERMAL can be a loadable module, which in turn makes this driver impossible to build. It is nicer to make the cpufreq driver a module as well, so this patch turns the option in to a 'tristate' and adapts the dependency accordingly. The driver has no module_exit() function, so it will continue to not support unloading, but it can be built as a module and loaded at runtime now. Signed-off-by: Arnd Bergmann Fixes: 5269e7067cd6 (cpufreq: Add ARM_MT8173_CPUFREQ dependency on THERMAL) Signed-off-by: Rafael J. Wysocki diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index 0031069..14b1f93 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -84,10 +84,10 @@ config ARM_KIRKWOOD_CPUFREQ SoCs. config ARM_MT8173_CPUFREQ - bool "Mediatek MT8173 CPUFreq support" + tristate "Mediatek MT8173 CPUFreq support" depends on ARCH_MEDIATEK && REGULATOR depends on ARM64 || (ARM_CPU_TOPOLOGY && COMPILE_TEST) - depends on !CPU_THERMAL || THERMAL=y + depends on !CPU_THERMAL || THERMAL select PM_OPP help This adds the CPUFreq driver support for Mediatek MT8173 SoC. diff --git a/drivers/cpufreq/mt8173-cpufreq.c b/drivers/cpufreq/mt8173-cpufreq.c index 1efba34..2058e6d 100644 --- a/drivers/cpufreq/mt8173-cpufreq.c +++ b/drivers/cpufreq/mt8173-cpufreq.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include -- cgit v0.10.2 From ead8f34c701ec7bf3234118b8c746227f30dfd1a Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 25 Feb 2016 21:10:28 +0000 Subject: drm/i915: Balance assert_rpm_wakelock_held() for !IS_ENABLED(CONFIG_PM) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 09731280028ce03e6a27e1998137f1775a2839f3 Author: Imre Deak Date: Wed Feb 17 14:17:42 2016 +0200 drm/i915: Add helper to get a display power ref if it was already enabled left the rpm wakelock assertions unbalanced if CONFIG_PM was disabled as intel_runtime_pm_get_if_in_use() would return true without incrementing the local bookkeeping required for the assertions. 
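In other words, the bookkeeping increment has to happen on both the CONFIG_PM and the !CONFIG_PM paths. A minimal sketch of that shape, using a hypothetical example_priv structure instead of drm_i915_private:

#include <linux/atomic.h>
#include <linux/kconfig.h>
#include <linux/pm_runtime.h>

struct example_priv {
	struct device *dev;
	atomic_t wakeref_count;		/* what the wakelock assertions inspect */
};

static bool example_get_if_in_use(struct example_priv *priv)
{
	if (IS_ENABLED(CONFIG_PM)) {
		int ret = pm_runtime_get_if_in_use(priv->dev);

		/* not currently in use, or the RPM core reported an error */
		if (ret <= 0)
			return false;
	}

	/* reached whether or not runtime PM is built in */
	atomic_inc(&priv->wakeref_count);
	return true;
}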
Signed-off-by: Chris Wilson CC: Mika Kuoppala CC: Joonas Lahtinen CC: Ville Syrjälä Reviewed-by: Imre Deak Signed-off-by: Dave Airlie diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c index 678ed34..92f8d96 100644 --- a/drivers/gpu/drm/i915/intel_runtime_pm.c +++ b/drivers/gpu/drm/i915/intel_runtime_pm.c @@ -2349,22 +2349,20 @@ bool intel_runtime_pm_get_if_in_use(struct drm_i915_private *dev_priv) { struct drm_device *dev = dev_priv->dev; struct device *device = &dev->pdev->dev; - int ret; - if (!IS_ENABLED(CONFIG_PM)) - return true; + if (IS_ENABLED(CONFIG_PM)) { + int ret = pm_runtime_get_if_in_use(device); - ret = pm_runtime_get_if_in_use(device); - - /* - * In cases runtime PM is disabled by the RPM core and we get an - * -EINVAL return value we are not supposed to call this function, - * since the power state is undefined. This applies atm to the - * late/early system suspend/resume handlers. - */ - WARN_ON_ONCE(ret < 0); - if (ret <= 0) - return false; + /* + * In cases runtime PM is disabled by the RPM core and we get + * an -EINVAL return value we are not supposed to call this + * function, since the power state is undefined. This applies + * atm to the late/early system suspend/resume handlers. + */ + WARN_ON_ONCE(ret < 0); + if (ret <= 0) + return false; + } atomic_inc(&dev_priv->pm.wakeref_count); assert_rpm_wakelock_held(dev_priv); -- cgit v0.10.2 From 741338f99f16dc24d2d01ac777b0798ae9d10a90 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 17:20:48 +0100 Subject: ASoC: dapm: Fix ctl value accesses in a wrong type snd_soc_dapm_dai_link_get() and _put() access the associated ctl values as value.integer.value[]. However, this is an enum ctl, and it has to be accessed via value.enumerated.item[]. The former is long while the latter is unsigned int, so they don't align. Fixes: c66150824b8a ('ASoC: dapm: add code to configure dai link parameters') Cc: Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index 5a2812f..335c7de 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -3573,7 +3573,7 @@ static int snd_soc_dapm_dai_link_get(struct snd_kcontrol *kcontrol, { struct snd_soc_dapm_widget *w = snd_kcontrol_chip(kcontrol); - ucontrol->value.integer.value[0] = w->params_select; + ucontrol->value.enumerated.item[0] = w->params_select; return 0; } @@ -3587,13 +3587,13 @@ static int snd_soc_dapm_dai_link_put(struct snd_kcontrol *kcontrol, if (w->power) return -EBUSY; - if (ucontrol->value.integer.value[0] == w->params_select) + if (ucontrol->value.enumerated.item[0] == w->params_select) return 0; - if (ucontrol->value.integer.value[0] >= w->num_params) + if (ucontrol->value.enumerated.item[0] >= w->num_params) return -EINVAL; - w->params_select = ucontrol->value.integer.value[0]; + w->params_select = ucontrol->value.enumerated.item[0]; return 0; } -- cgit v0.10.2 From 1457ad0e99dc5b1fe3fe3123d5135c95c00ed74b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 17:23:49 +0100 Subject: ASoC: pxa: corgi: Fix enum ctl accesses in a wrong type "Jack Function" and "Speaker Function" ctls in corgi are enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. 
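The same wrong-union access is what all of the following ASoC patches correct, so one hedged sketch of the intended pattern is shown here once (example_mode and the control wiring are hypothetical): an enum control reads and writes value.enumerated.item[], which is unsigned int, and bounds-checks against the enum's item count.

#include <sound/soc.h>

static unsigned int example_mode;	/* hypothetical driver state */

static int example_mode_get(struct snd_kcontrol *kcontrol,
			    struct snd_ctl_elem_value *ucontrol)
{
	ucontrol->value.enumerated.item[0] = example_mode;
	return 0;
}

static int example_mode_put(struct snd_kcontrol *kcontrol,
			    struct snd_ctl_elem_value *ucontrol)
{
	struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;
	unsigned int sel = ucontrol->value.enumerated.item[0];

	/* item[0] is unsigned, so only an upper bound needs checking */
	if (sel >= e->items)
		return -EINVAL;

	if (example_mode == sel)
		return 0;

	example_mode = sel;
	return 1;	/* value changed */
}

Each of the driver-specific patches below is an instance of this same substitution.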
Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/pxa/corgi.c b/sound/soc/pxa/corgi.c index c97dc13..dcbb7aa 100644 --- a/sound/soc/pxa/corgi.c +++ b/sound/soc/pxa/corgi.c @@ -163,7 +163,7 @@ static struct snd_soc_ops corgi_ops = { static int corgi_get_jack(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { - ucontrol->value.integer.value[0] = corgi_jack_func; + ucontrol->value.enumerated.item[0] = corgi_jack_func; return 0; } @@ -172,10 +172,10 @@ static int corgi_set_jack(struct snd_kcontrol *kcontrol, { struct snd_soc_card *card = snd_kcontrol_chip(kcontrol); - if (corgi_jack_func == ucontrol->value.integer.value[0]) + if (corgi_jack_func == ucontrol->value.enumerated.item[0]) return 0; - corgi_jack_func = ucontrol->value.integer.value[0]; + corgi_jack_func = ucontrol->value.enumerated.item[0]; corgi_ext_control(&card->dapm); return 1; } @@ -183,7 +183,7 @@ static int corgi_set_jack(struct snd_kcontrol *kcontrol, static int corgi_get_spk(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { - ucontrol->value.integer.value[0] = corgi_spk_func; + ucontrol->value.enumerated.item[0] = corgi_spk_func; return 0; } @@ -192,10 +192,10 @@ static int corgi_set_spk(struct snd_kcontrol *kcontrol, { struct snd_soc_card *card = snd_kcontrol_chip(kcontrol); - if (corgi_spk_func == ucontrol->value.integer.value[0]) + if (corgi_spk_func == ucontrol->value.enumerated.item[0]) return 0; - corgi_spk_func = ucontrol->value.integer.value[0]; + corgi_spk_func = ucontrol->value.enumerated.item[0]; corgi_ext_control(&card->dapm); return 1; } -- cgit v0.10.2 From 127ee199d5d232f29ad64bace3b441127fce6cd3 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 17:23:50 +0100 Subject: ASoC: pxa: magician: Fix enum ctl accesses in a wrong type "Input Select" ctl in magician driver is an enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. (Meanwhile "Headphone Switch" and "Speaker Switch" are boolean, so they should stay to access via value.integer.value[] as is.) Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/pxa/magician.c b/sound/soc/pxa/magician.c index 241d0be..62b8377 100644 --- a/sound/soc/pxa/magician.c +++ b/sound/soc/pxa/magician.c @@ -308,17 +308,17 @@ static int magician_set_spk(struct snd_kcontrol *kcontrol, static int magician_get_input(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { - ucontrol->value.integer.value[0] = magician_in_sel; + ucontrol->value.enumerated.item[0] = magician_in_sel; return 0; } static int magician_set_input(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { - if (magician_in_sel == ucontrol->value.integer.value[0]) + if (magician_in_sel == ucontrol->value.enumerated.item[0]) return 0; - magician_in_sel = ucontrol->value.integer.value[0]; + magician_in_sel = ucontrol->value.enumerated.item[0]; switch (magician_in_sel) { case MAGICIAN_MIC: -- cgit v0.10.2 From 7a3f4b488ce366de6ab2cf19d95353da9814bdd9 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 17:23:51 +0100 Subject: ASoC: pxa: poodle: Fix enum ctl accesses in a wrong type "Jack Function" and "Speaker Function" ctls in poodle are enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. 
Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/pxa/poodle.c b/sound/soc/pxa/poodle.c index 84d0e2e..4b3b714 100644 --- a/sound/soc/pxa/poodle.c +++ b/sound/soc/pxa/poodle.c @@ -138,7 +138,7 @@ static struct snd_soc_ops poodle_ops = { static int poodle_get_jack(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { - ucontrol->value.integer.value[0] = poodle_jack_func; + ucontrol->value.enumerated.item[0] = poodle_jack_func; return 0; } @@ -147,10 +147,10 @@ static int poodle_set_jack(struct snd_kcontrol *kcontrol, { struct snd_soc_card *card = snd_kcontrol_chip(kcontrol); - if (poodle_jack_func == ucontrol->value.integer.value[0]) + if (poodle_jack_func == ucontrol->value.enumerated.item[0]) return 0; - poodle_jack_func = ucontrol->value.integer.value[0]; + poodle_jack_func = ucontrol->value.enumerated.item[0]; poodle_ext_control(&card->dapm); return 1; } @@ -158,7 +158,7 @@ static int poodle_set_jack(struct snd_kcontrol *kcontrol, static int poodle_get_spk(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { - ucontrol->value.integer.value[0] = poodle_spk_func; + ucontrol->value.enumerated.item[0] = poodle_spk_func; return 0; } @@ -167,10 +167,10 @@ static int poodle_set_spk(struct snd_kcontrol *kcontrol, { struct snd_soc_card *card = snd_kcontrol_chip(kcontrol); - if (poodle_spk_func == ucontrol->value.integer.value[0]) + if (poodle_spk_func == ucontrol->value.enumerated.item[0]) return 0; - poodle_spk_func = ucontrol->value.integer.value[0]; + poodle_spk_func = ucontrol->value.enumerated.item[0]; poodle_ext_control(&card->dapm); return 1; } -- cgit v0.10.2 From 49a1ba16ab15b9a572c188fe0d9d79017d2fef99 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 17:23:52 +0100 Subject: ASoC: pxa: spitz: Fix enum ctl accesses in a wrong type "Jack Function" and "Speaker Function" ctls in spitz are enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. 
Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/pxa/spitz.c b/sound/soc/pxa/spitz.c index b002226..0e02634 100644 --- a/sound/soc/pxa/spitz.c +++ b/sound/soc/pxa/spitz.c @@ -164,7 +164,7 @@ static struct snd_soc_ops spitz_ops = { static int spitz_get_jack(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { - ucontrol->value.integer.value[0] = spitz_jack_func; + ucontrol->value.enumerated.item[0] = spitz_jack_func; return 0; } @@ -173,10 +173,10 @@ static int spitz_set_jack(struct snd_kcontrol *kcontrol, { struct snd_soc_card *card = snd_kcontrol_chip(kcontrol); - if (spitz_jack_func == ucontrol->value.integer.value[0]) + if (spitz_jack_func == ucontrol->value.enumerated.item[0]) return 0; - spitz_jack_func = ucontrol->value.integer.value[0]; + spitz_jack_func = ucontrol->value.enumerated.item[0]; spitz_ext_control(&card->dapm); return 1; } @@ -184,7 +184,7 @@ static int spitz_set_jack(struct snd_kcontrol *kcontrol, static int spitz_get_spk(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { - ucontrol->value.integer.value[0] = spitz_spk_func; + ucontrol->value.enumerated.item[0] = spitz_spk_func; return 0; } @@ -193,10 +193,10 @@ static int spitz_set_spk(struct snd_kcontrol *kcontrol, { struct snd_soc_card *card = snd_kcontrol_chip(kcontrol); - if (spitz_spk_func == ucontrol->value.integer.value[0]) + if (spitz_spk_func == ucontrol->value.enumerated.item[0]) return 0; - spitz_spk_func = ucontrol->value.integer.value[0]; + spitz_spk_func = ucontrol->value.enumerated.item[0]; spitz_ext_control(&card->dapm); return 1; } -- cgit v0.10.2 From 419396d5a1d12003a18a58785687697800d2f02a Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 17:23:53 +0100 Subject: ASoC: pxa: tosa: Fix enum ctl accesses in a wrong type "Jack Function" and "Speaker Function" ctls in tosa are enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. 
Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/pxa/tosa.c b/sound/soc/pxa/tosa.c index 49518dd..c508f02 100644 --- a/sound/soc/pxa/tosa.c +++ b/sound/soc/pxa/tosa.c @@ -95,7 +95,7 @@ static struct snd_soc_ops tosa_ops = { static int tosa_get_jack(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { - ucontrol->value.integer.value[0] = tosa_jack_func; + ucontrol->value.enumerated.item[0] = tosa_jack_func; return 0; } @@ -104,10 +104,10 @@ static int tosa_set_jack(struct snd_kcontrol *kcontrol, { struct snd_soc_card *card = snd_kcontrol_chip(kcontrol); - if (tosa_jack_func == ucontrol->value.integer.value[0]) + if (tosa_jack_func == ucontrol->value.enumerated.item[0]) return 0; - tosa_jack_func = ucontrol->value.integer.value[0]; + tosa_jack_func = ucontrol->value.enumerated.item[0]; tosa_ext_control(&card->dapm); return 1; } @@ -115,7 +115,7 @@ static int tosa_set_jack(struct snd_kcontrol *kcontrol, static int tosa_get_spk(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { - ucontrol->value.integer.value[0] = tosa_spk_func; + ucontrol->value.enumerated.item[0] = tosa_spk_func; return 0; } @@ -124,10 +124,10 @@ static int tosa_set_spk(struct snd_kcontrol *kcontrol, { struct snd_soc_card *card = snd_kcontrol_chip(kcontrol); - if (tosa_spk_func == ucontrol->value.integer.value[0]) + if (tosa_spk_func == ucontrol->value.enumerated.item[0]) return 0; - tosa_spk_func = ucontrol->value.integer.value[0]; + tosa_spk_func = ucontrol->value.enumerated.item[0]; tosa_ext_control(&card->dapm); return 1; } -- cgit v0.10.2 From dd90533cd6bbfe075a72ab789b6e9d6f280ba476 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 17:26:21 +0100 Subject: ASoC: omap: n810: Fix enum ctl accesses in a wrong type "Jack Function", "Speaker Function" and "Input Select" ctls in n810 driver are enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. 
Signed-off-by: Takashi Iwai Acked-by: Jarkko Nikula Signed-off-by: Mark Brown diff --git a/sound/soc/omap/n810.c b/sound/soc/omap/n810.c index 190f868..fdecb70 100644 --- a/sound/soc/omap/n810.c +++ b/sound/soc/omap/n810.c @@ -133,7 +133,7 @@ static struct snd_soc_ops n810_ops = { static int n810_get_spk(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { - ucontrol->value.integer.value[0] = n810_spk_func; + ucontrol->value.enumerated.item[0] = n810_spk_func; return 0; } @@ -143,10 +143,10 @@ static int n810_set_spk(struct snd_kcontrol *kcontrol, { struct snd_soc_card *card = snd_kcontrol_chip(kcontrol); - if (n810_spk_func == ucontrol->value.integer.value[0]) + if (n810_spk_func == ucontrol->value.enumerated.item[0]) return 0; - n810_spk_func = ucontrol->value.integer.value[0]; + n810_spk_func = ucontrol->value.enumerated.item[0]; n810_ext_control(&card->dapm); return 1; @@ -155,7 +155,7 @@ static int n810_set_spk(struct snd_kcontrol *kcontrol, static int n810_get_jack(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { - ucontrol->value.integer.value[0] = n810_jack_func; + ucontrol->value.enumerated.item[0] = n810_jack_func; return 0; } @@ -165,10 +165,10 @@ static int n810_set_jack(struct snd_kcontrol *kcontrol, { struct snd_soc_card *card = snd_kcontrol_chip(kcontrol); - if (n810_jack_func == ucontrol->value.integer.value[0]) + if (n810_jack_func == ucontrol->value.enumerated.item[0]) return 0; - n810_jack_func = ucontrol->value.integer.value[0]; + n810_jack_func = ucontrol->value.enumerated.item[0]; n810_ext_control(&card->dapm); return 1; @@ -177,7 +177,7 @@ static int n810_set_jack(struct snd_kcontrol *kcontrol, static int n810_get_input(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { - ucontrol->value.integer.value[0] = n810_dmic_func; + ucontrol->value.enumerated.item[0] = n810_dmic_func; return 0; } @@ -187,10 +187,10 @@ static int n810_set_input(struct snd_kcontrol *kcontrol, { struct snd_soc_card *card = snd_kcontrol_chip(kcontrol); - if (n810_dmic_func == ucontrol->value.integer.value[0]) + if (n810_dmic_func == ucontrol->value.enumerated.item[0]) return 0; - n810_dmic_func = ucontrol->value.integer.value[0]; + n810_dmic_func = ucontrol->value.enumerated.item[0]; n810_ext_control(&card->dapm); return 1; -- cgit v0.10.2 From f4d438eec7fe56d55998195e08dfaa5aa3e08f0c Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 17:26:22 +0100 Subject: ASoC: omap: rx51: Fix enum ctl accesses in a wrong type "Speaker Function", "Input Select" and "Jack Function" ctls in rx51 driver are enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. 
Signed-off-by: Takashi Iwai Acked-by: Jarkko Nikula Signed-off-by: Mark Brown diff --git a/sound/soc/omap/rx51.c b/sound/soc/omap/rx51.c index 5e21f08..5494924 100644 --- a/sound/soc/omap/rx51.c +++ b/sound/soc/omap/rx51.c @@ -132,7 +132,7 @@ static struct snd_soc_ops rx51_ops = { static int rx51_get_spk(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { - ucontrol->value.integer.value[0] = rx51_spk_func; + ucontrol->value.enumerated.item[0] = rx51_spk_func; return 0; } @@ -142,10 +142,10 @@ static int rx51_set_spk(struct snd_kcontrol *kcontrol, { struct snd_soc_card *card = snd_kcontrol_chip(kcontrol); - if (rx51_spk_func == ucontrol->value.integer.value[0]) + if (rx51_spk_func == ucontrol->value.enumerated.item[0]) return 0; - rx51_spk_func = ucontrol->value.integer.value[0]; + rx51_spk_func = ucontrol->value.enumerated.item[0]; rx51_ext_control(&card->dapm); return 1; @@ -180,7 +180,7 @@ static int rx51_hp_event(struct snd_soc_dapm_widget *w, static int rx51_get_input(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { - ucontrol->value.integer.value[0] = rx51_dmic_func; + ucontrol->value.enumerated.item[0] = rx51_dmic_func; return 0; } @@ -190,10 +190,10 @@ static int rx51_set_input(struct snd_kcontrol *kcontrol, { struct snd_soc_card *card = snd_kcontrol_chip(kcontrol); - if (rx51_dmic_func == ucontrol->value.integer.value[0]) + if (rx51_dmic_func == ucontrol->value.enumerated.item[0]) return 0; - rx51_dmic_func = ucontrol->value.integer.value[0]; + rx51_dmic_func = ucontrol->value.enumerated.item[0]; rx51_ext_control(&card->dapm); return 1; @@ -202,7 +202,7 @@ static int rx51_set_input(struct snd_kcontrol *kcontrol, static int rx51_get_jack(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { - ucontrol->value.integer.value[0] = rx51_jack_func; + ucontrol->value.enumerated.item[0] = rx51_jack_func; return 0; } @@ -212,10 +212,10 @@ static int rx51_set_jack(struct snd_kcontrol *kcontrol, { struct snd_soc_card *card = snd_kcontrol_chip(kcontrol); - if (rx51_jack_func == ucontrol->value.integer.value[0]) + if (rx51_jack_func == ucontrol->value.enumerated.item[0]) return 0; - rx51_jack_func = ucontrol->value.integer.value[0]; + rx51_jack_func = ucontrol->value.enumerated.item[0]; rx51_ext_control(&card->dapm); return 1; -- cgit v0.10.2 From 508ddfba37cad76d774da8f8d8e2387f0eca5fa4 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 17:29:25 +0100 Subject: ASoC: intel: mfld: Fix enum ctl accesses in a wrong type "Playback Switch" and "Lineout Mux" ctls in medfld machine driver are enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. 
Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/intel/boards/mfld_machine.c b/sound/soc/intel/boards/mfld_machine.c index 49c09a0..34f46c7 100644 --- a/sound/soc/intel/boards/mfld_machine.c +++ b/sound/soc/intel/boards/mfld_machine.c @@ -94,7 +94,7 @@ static const struct soc_enum lo_enum = static int headset_get_switch(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { - ucontrol->value.integer.value[0] = hs_switch; + ucontrol->value.enumerated.item[0] = hs_switch; return 0; } @@ -104,12 +104,12 @@ static int headset_set_switch(struct snd_kcontrol *kcontrol, struct snd_soc_card *card = snd_kcontrol_chip(kcontrol); struct snd_soc_dapm_context *dapm = &card->dapm; - if (ucontrol->value.integer.value[0] == hs_switch) + if (ucontrol->value.enumerated.item[0] == hs_switch) return 0; snd_soc_dapm_mutex_lock(dapm); - if (ucontrol->value.integer.value[0]) { + if (ucontrol->value.enumerated.item[0]) { pr_debug("hs_set HS path\n"); snd_soc_dapm_enable_pin_unlocked(dapm, "Headphones"); snd_soc_dapm_disable_pin_unlocked(dapm, "EPOUT"); @@ -123,7 +123,7 @@ static int headset_set_switch(struct snd_kcontrol *kcontrol, snd_soc_dapm_mutex_unlock(dapm); - hs_switch = ucontrol->value.integer.value[0]; + hs_switch = ucontrol->value.enumerated.item[0]; return 0; } @@ -148,7 +148,7 @@ static void lo_enable_out_pins(struct snd_soc_dapm_context *dapm) static int lo_get_switch(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { - ucontrol->value.integer.value[0] = lo_dac; + ucontrol->value.enumerated.item[0] = lo_dac; return 0; } @@ -158,7 +158,7 @@ static int lo_set_switch(struct snd_kcontrol *kcontrol, struct snd_soc_card *card = snd_kcontrol_chip(kcontrol); struct snd_soc_dapm_context *dapm = &card->dapm; - if (ucontrol->value.integer.value[0] == lo_dac) + if (ucontrol->value.enumerated.item[0] == lo_dac) return 0; snd_soc_dapm_mutex_lock(dapm); @@ -168,7 +168,7 @@ static int lo_set_switch(struct snd_kcontrol *kcontrol, */ lo_enable_out_pins(dapm); - switch (ucontrol->value.integer.value[0]) { + switch (ucontrol->value.enumerated.item[0]) { case 0: pr_debug("set vibra path\n"); snd_soc_dapm_disable_pin_unlocked(dapm, "VIB1OUT"); @@ -202,7 +202,7 @@ static int lo_set_switch(struct snd_kcontrol *kcontrol, snd_soc_dapm_mutex_unlock(dapm); - lo_dac = ucontrol->value.integer.value[0]; + lo_dac = ucontrol->value.enumerated.item[0]; return 0; } -- cgit v0.10.2 From 89300b4e5a5b5b5c145318f3b257291239522da6 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 18:02:59 +0100 Subject: ASoC: cs42l51: Fix enum ctl accesses in a wrong type "PCM channel mixer" ctl in cs42l51 codec driver is enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. 
Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/codecs/cs42l51.c b/sound/soc/codecs/cs42l51.c index b395152..35488f1 100644 --- a/sound/soc/codecs/cs42l51.c +++ b/sound/soc/codecs/cs42l51.c @@ -60,15 +60,15 @@ static int cs42l51_get_chan_mix(struct snd_kcontrol *kcontrol, switch (value) { default: case 0: - ucontrol->value.integer.value[0] = 0; + ucontrol->value.enumerated.item[0] = 0; break; /* same value : (L+R)/2 and (R+L)/2 */ case 1: case 2: - ucontrol->value.integer.value[0] = 1; + ucontrol->value.enumerated.item[0] = 1; break; case 3: - ucontrol->value.integer.value[0] = 2; + ucontrol->value.enumerated.item[0] = 2; break; } @@ -85,7 +85,7 @@ static int cs42l51_set_chan_mix(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol); unsigned char val; - switch (ucontrol->value.integer.value[0]) { + switch (ucontrol->value.enumerated.item[0]) { default: case 0: val = CHAN_MIX_NORMAL; -- cgit v0.10.2 From fe9aba13108ee8e15ae59009a26db02460dbb04f Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 18:04:42 +0100 Subject: ASoC: da732x: Fix enum ctl accesses in a wrong type "DAC1 High Pass Filter Mode" & co in da732x codec driver are enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/codecs/da732x.c b/sound/soc/codecs/da732x.c index 1d5a89c..461506a 100644 --- a/sound/soc/codecs/da732x.c +++ b/sound/soc/codecs/da732x.c @@ -334,7 +334,7 @@ static int da732x_hpf_set(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol); struct soc_enum *enum_ctrl = (struct soc_enum *)kcontrol->private_value; unsigned int reg = enum_ctrl->reg; - unsigned int sel = ucontrol->value.integer.value[0]; + unsigned int sel = ucontrol->value.enumerated.item[0]; unsigned int bits; switch (sel) { @@ -368,13 +368,13 @@ static int da732x_hpf_get(struct snd_kcontrol *kcontrol, switch (val) { case DA732X_HPF_VOICE_EN: - ucontrol->value.integer.value[0] = DA732X_HPF_VOICE; + ucontrol->value.enumerated.item[0] = DA732X_HPF_VOICE; break; case DA732X_HPF_MUSIC_EN: - ucontrol->value.integer.value[0] = DA732X_HPF_MUSIC; + ucontrol->value.enumerated.item[0] = DA732X_HPF_MUSIC; break; default: - ucontrol->value.integer.value[0] = DA732X_HPF_DISABLED; + ucontrol->value.enumerated.item[0] = DA732X_HPF_DISABLED; break; } -- cgit v0.10.2 From 4b606316129571c46381430852c149855ac50477 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 18:08:00 +0100 Subject: ASoC: ab8500: Fix enum ctl accesses in a wrong type "Sidetone Status" and "ANC Status" ctls in ab8500 codec driver are enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. 
Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/codecs/ab8500-codec.c b/sound/soc/codecs/ab8500-codec.c index affb192..faae693 100644 --- a/sound/soc/codecs/ab8500-codec.c +++ b/sound/soc/codecs/ab8500-codec.c @@ -1130,7 +1130,7 @@ static int sid_status_control_get(struct snd_kcontrol *kcontrol, struct ab8500_codec_drvdata *drvdata = dev_get_drvdata(codec->dev); mutex_lock(&drvdata->ctrl_lock); - ucontrol->value.integer.value[0] = drvdata->sid_status; + ucontrol->value.enumerated.item[0] = drvdata->sid_status; mutex_unlock(&drvdata->ctrl_lock); return 0; @@ -1147,7 +1147,7 @@ static int sid_status_control_put(struct snd_kcontrol *kcontrol, dev_dbg(codec->dev, "%s: Enter\n", __func__); - if (ucontrol->value.integer.value[0] != SID_APPLY_FIR) { + if (ucontrol->value.enumerated.item[0] != SID_APPLY_FIR) { dev_err(codec->dev, "%s: ERROR: This control supports '%s' only!\n", __func__, enum_sid_state[SID_APPLY_FIR]); @@ -1199,7 +1199,7 @@ static int anc_status_control_get(struct snd_kcontrol *kcontrol, struct ab8500_codec_drvdata *drvdata = dev_get_drvdata(codec->dev); mutex_lock(&drvdata->ctrl_lock); - ucontrol->value.integer.value[0] = drvdata->anc_status; + ucontrol->value.enumerated.item[0] = drvdata->anc_status; mutex_unlock(&drvdata->ctrl_lock); return 0; @@ -1220,7 +1220,7 @@ static int anc_status_control_put(struct snd_kcontrol *kcontrol, mutex_lock(&drvdata->ctrl_lock); - req = ucontrol->value.integer.value[0]; + req = ucontrol->value.enumerated.item[0]; if (req >= ARRAY_SIZE(enum_anc_state)) { status = -EINVAL; goto cleanup; -- cgit v0.10.2 From 9af39044753d7280fa795528598636fe9c58a54e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 18:08:01 +0100 Subject: ASoC: max98088: Fix enum ctl accesses in a wrong type "EQ1 Mode" and "EQ2 Mode" ctls in max98088 codec driver are enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/codecs/max98088.c b/sound/soc/codecs/max98088.c index 20dcc49..fc22804 100644 --- a/sound/soc/codecs/max98088.c +++ b/sound/soc/codecs/max98088.c @@ -1496,7 +1496,7 @@ static int max98088_put_eq_enum(struct snd_kcontrol *kcontrol, struct max98088_pdata *pdata = max98088->pdata; int channel = max98088_get_channel(codec, kcontrol->id.name); struct max98088_cdata *cdata; - int sel = ucontrol->value.integer.value[0]; + int sel = ucontrol->value.enumerated.item[0]; if (channel < 0) return channel; -- cgit v0.10.2 From 58c0213872816bf24d14364989c82dc4acd58103 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 18:08:02 +0100 Subject: ASoC: max98095: Fix enum ctl accesses in a wrong type "Biquad1 Mode" and "Biquad2 Mode" ctls in max98095 codec driver are enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. 
Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/codecs/max98095.c b/sound/soc/codecs/max98095.c index 1fedac5..3577003 100644 --- a/sound/soc/codecs/max98095.c +++ b/sound/soc/codecs/max98095.c @@ -1499,7 +1499,7 @@ static int max98095_put_eq_enum(struct snd_kcontrol *kcontrol, struct max98095_pdata *pdata = max98095->pdata; int channel = max98095_get_eq_channel(kcontrol->id.name); struct max98095_cdata *cdata; - unsigned int sel = ucontrol->value.integer.value[0]; + unsigned int sel = ucontrol->value.enumerated.item[0]; struct max98095_eq_cfg *coef_set; int fs, best, best_val, i; int regmask, regsave; @@ -1653,7 +1653,7 @@ static int max98095_put_bq_enum(struct snd_kcontrol *kcontrol, struct max98095_pdata *pdata = max98095->pdata; int channel = max98095_get_bq_channel(codec, kcontrol->id.name); struct max98095_cdata *cdata; - unsigned int sel = ucontrol->value.integer.value[0]; + unsigned int sel = ucontrol->value.enumerated.item[0]; struct max98095_biquad_cfg *coef_set; int fs, best, best_val, i; int regmask, regsave; -- cgit v0.10.2 From 8733f99c23726532918034f4ae599d9e6d27bd1e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 18:08:03 +0100 Subject: ASoC: tlv320dac33: Fix enum ctl accesses in a wrong type "FIFO Mode" ctl in tlv320dac33 codec driver is enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/codecs/tlv320dac33.c b/sound/soc/codecs/tlv320dac33.c index 781398fb..f7a6ce7 100644 --- a/sound/soc/codecs/tlv320dac33.c +++ b/sound/soc/codecs/tlv320dac33.c @@ -446,7 +446,7 @@ static int dac33_get_fifo_mode(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol); struct tlv320dac33_priv *dac33 = snd_soc_codec_get_drvdata(codec); - ucontrol->value.integer.value[0] = dac33->fifo_mode; + ucontrol->value.enumerated.item[0] = dac33->fifo_mode; return 0; } @@ -458,17 +458,16 @@ static int dac33_set_fifo_mode(struct snd_kcontrol *kcontrol, struct tlv320dac33_priv *dac33 = snd_soc_codec_get_drvdata(codec); int ret = 0; - if (dac33->fifo_mode == ucontrol->value.integer.value[0]) + if (dac33->fifo_mode == ucontrol->value.enumerated.item[0]) return 0; /* Do not allow changes while stream is running*/ if (snd_soc_codec_is_active(codec)) return -EPERM; - if (ucontrol->value.integer.value[0] < 0 || - ucontrol->value.integer.value[0] >= DAC33_FIFO_LAST_MODE) + if (ucontrol->value.enumerated.item[0] >= DAC33_FIFO_LAST_MODE) ret = -EINVAL; else - dac33->fifo_mode = ucontrol->value.integer.value[0]; + dac33->fifo_mode = ucontrol->value.enumerated.item[0]; return ret; } -- cgit v0.10.2 From 154a0fddd4275aa93f4683705579a7d8ec3d177b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 18:08:04 +0100 Subject: ASoC: wl1273: Fix enum ctl accesses in a wrong type "Codec Mode" and "Audio Switch" ctls in wl1273 codec driver are enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. 
Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/codecs/wl1273.c b/sound/soc/codecs/wl1273.c index 7693c11..1b79778 100644 --- a/sound/soc/codecs/wl1273.c +++ b/sound/soc/codecs/wl1273.c @@ -175,7 +175,7 @@ static int snd_wl1273_get_audio_route(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol); struct wl1273_priv *wl1273 = snd_soc_codec_get_drvdata(codec); - ucontrol->value.integer.value[0] = wl1273->mode; + ucontrol->value.enumerated.item[0] = wl1273->mode; return 0; } @@ -193,18 +193,17 @@ static int snd_wl1273_set_audio_route(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol); struct wl1273_priv *wl1273 = snd_soc_codec_get_drvdata(codec); - if (wl1273->mode == ucontrol->value.integer.value[0]) + if (wl1273->mode == ucontrol->value.enumerated.item[0]) return 0; /* Do not allow changes while stream is running */ if (snd_soc_codec_is_active(codec)) return -EPERM; - if (ucontrol->value.integer.value[0] < 0 || - ucontrol->value.integer.value[0] >= ARRAY_SIZE(wl1273_audio_route)) + if (ucontrol->value.enumerated.item[0] >= ARRAY_SIZE(wl1273_audio_route)) return -EINVAL; - wl1273->mode = ucontrol->value.integer.value[0]; + wl1273->mode = ucontrol->value.enumerated.item[0]; return 1; } @@ -219,7 +218,7 @@ static int snd_wl1273_fm_audio_get(struct snd_kcontrol *kcontrol, dev_dbg(codec->dev, "%s: enter.\n", __func__); - ucontrol->value.integer.value[0] = wl1273->core->audio_mode; + ucontrol->value.enumerated.item[0] = wl1273->core->audio_mode; return 0; } @@ -233,7 +232,7 @@ static int snd_wl1273_fm_audio_put(struct snd_kcontrol *kcontrol, dev_dbg(codec->dev, "%s: enter.\n", __func__); - val = ucontrol->value.integer.value[0]; + val = ucontrol->value.enumerated.item[0]; if (wl1273->core->audio_mode == val) return 0; -- cgit v0.10.2 From 0cad10539b281e88335afd765afaf9885dcfc3ef Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 18:01:10 +0100 Subject: ASoC: wm8753: Fix enum ctl accesses in a wrong type "DAI Mode" ctl in wm8753 codec driver is enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. 
Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/codecs/wm8753.c b/sound/soc/codecs/wm8753.c index 61299ca..6f1024f 100644 --- a/sound/soc/codecs/wm8753.c +++ b/sound/soc/codecs/wm8753.c @@ -233,7 +233,7 @@ static int wm8753_get_dai(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol); struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec); - ucontrol->value.integer.value[0] = wm8753->dai_func; + ucontrol->value.enumerated.item[0] = wm8753->dai_func; return 0; } @@ -244,7 +244,7 @@ static int wm8753_set_dai(struct snd_kcontrol *kcontrol, struct wm8753_priv *wm8753 = snd_soc_codec_get_drvdata(codec); u16 ioctl; - if (wm8753->dai_func == ucontrol->value.integer.value[0]) + if (wm8753->dai_func == ucontrol->value.enumerated.item[0]) return 0; if (snd_soc_codec_is_active(codec)) @@ -252,7 +252,7 @@ static int wm8753_set_dai(struct snd_kcontrol *kcontrol, ioctl = snd_soc_read(codec, WM8753_IOCTL); - wm8753->dai_func = ucontrol->value.integer.value[0]; + wm8753->dai_func = ucontrol->value.enumerated.item[0]; if (((ioctl >> 2) & 0x3) == wm8753->dai_func) return 1; -- cgit v0.10.2 From c41a024c4e770fff999f4164cc4d1696e5f17437 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 18:01:11 +0100 Subject: ASoC: wm8904: Fix enum ctl accesses in a wrong type "DRC Mode" and "EQ Mode" ctls in wm8904 codec driver are enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/codecs/wm8904.c b/sound/soc/codecs/wm8904.c index 8172e49..edd7a77 100644 --- a/sound/soc/codecs/wm8904.c +++ b/sound/soc/codecs/wm8904.c @@ -396,7 +396,7 @@ static int wm8904_put_drc_enum(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol); struct wm8904_priv *wm8904 = snd_soc_codec_get_drvdata(codec); struct wm8904_pdata *pdata = wm8904->pdata; - int value = ucontrol->value.integer.value[0]; + int value = ucontrol->value.enumerated.item[0]; if (value >= pdata->num_drc_cfgs) return -EINVAL; @@ -467,7 +467,7 @@ static int wm8904_put_retune_mobile_enum(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol); struct wm8904_priv *wm8904 = snd_soc_codec_get_drvdata(codec); struct wm8904_pdata *pdata = wm8904->pdata; - int value = ucontrol->value.integer.value[0]; + int value = ucontrol->value.enumerated.item[0]; if (value >= pdata->num_retune_mobile_cfgs) return -EINVAL; -- cgit v0.10.2 From d0784829ae3b0beeb69b476f017d5c8a2eb95198 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 18:01:12 +0100 Subject: ASoC: wm8958: Fix enum ctl accesses in a wrong type "MBC Mode", "VSS Mode", "VSS HPF Mode" and "Enhanced EQ Mode" ctls in wm8958 codec driver are enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. 
Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown Cc: stable@vger.kernel.org diff --git a/sound/soc/codecs/wm8958-dsp2.c b/sound/soc/codecs/wm8958-dsp2.c index c799cca..6b864c0 100644 --- a/sound/soc/codecs/wm8958-dsp2.c +++ b/sound/soc/codecs/wm8958-dsp2.c @@ -459,7 +459,7 @@ static int wm8958_put_mbc_enum(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol); struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec); struct wm8994 *control = wm8994->wm8994; - int value = ucontrol->value.integer.value[0]; + int value = ucontrol->value.enumerated.item[0]; int reg; /* Don't allow on the fly reconfiguration */ @@ -549,7 +549,7 @@ static int wm8958_put_vss_enum(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol); struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec); struct wm8994 *control = wm8994->wm8994; - int value = ucontrol->value.integer.value[0]; + int value = ucontrol->value.enumerated.item[0]; int reg; /* Don't allow on the fly reconfiguration */ @@ -582,7 +582,7 @@ static int wm8958_put_vss_hpf_enum(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol); struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec); struct wm8994 *control = wm8994->wm8994; - int value = ucontrol->value.integer.value[0]; + int value = ucontrol->value.enumerated.item[0]; int reg; /* Don't allow on the fly reconfiguration */ @@ -749,7 +749,7 @@ static int wm8958_put_enh_eq_enum(struct snd_kcontrol *kcontrol, struct snd_soc_codec *codec = snd_soc_kcontrol_codec(kcontrol); struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec); struct wm8994 *control = wm8994->wm8994; - int value = ucontrol->value.integer.value[0]; + int value = ucontrol->value.enumerated.item[0]; int reg; /* Don't allow on the fly reconfiguration */ -- cgit v0.10.2 From b5ab265905b3e07ad9dc7d553a074404b25e9200 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 18:01:13 +0100 Subject: ASoC: wm8983: Fix enum ctl accesses in a wrong type "Equalizer Function" ctl in wm8983 codec driver is enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. 
Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/codecs/wm8983.c b/sound/soc/codecs/wm8983.c index 7350ff6..0c002a5 100644 --- a/sound/soc/codecs/wm8983.c +++ b/sound/soc/codecs/wm8983.c @@ -497,9 +497,9 @@ static int eqmode_get(struct snd_kcontrol *kcontrol, reg = snd_soc_read(codec, WM8983_EQ1_LOW_SHELF); if (reg & WM8983_EQ3DMODE) - ucontrol->value.integer.value[0] = 1; + ucontrol->value.enumerated.item[0] = 1; else - ucontrol->value.integer.value[0] = 0; + ucontrol->value.enumerated.item[0] = 0; return 0; } @@ -511,18 +511,18 @@ static int eqmode_put(struct snd_kcontrol *kcontrol, unsigned int regpwr2, regpwr3; unsigned int reg_eq; - if (ucontrol->value.integer.value[0] != 0 - && ucontrol->value.integer.value[0] != 1) + if (ucontrol->value.enumerated.item[0] != 0 + && ucontrol->value.enumerated.item[0] != 1) return -EINVAL; reg_eq = snd_soc_read(codec, WM8983_EQ1_LOW_SHELF); switch ((reg_eq & WM8983_EQ3DMODE) >> WM8983_EQ3DMODE_SHIFT) { case 0: - if (!ucontrol->value.integer.value[0]) + if (!ucontrol->value.enumerated.item[0]) return 0; break; case 1: - if (ucontrol->value.integer.value[0]) + if (ucontrol->value.enumerated.item[0]) return 0; break; } @@ -537,7 +537,7 @@ static int eqmode_put(struct snd_kcontrol *kcontrol, /* set the desired eqmode */ snd_soc_update_bits(codec, WM8983_EQ1_LOW_SHELF, WM8983_EQ3DMODE_MASK, - ucontrol->value.integer.value[0] + ucontrol->value.enumerated.item[0] << WM8983_EQ3DMODE_SHIFT); /* restore DAC/ADC configuration */ snd_soc_write(codec, WM8983_POWER_MANAGEMENT_2, regpwr2); -- cgit v0.10.2 From 251d604778f1f7bbf29672a79cccc4663a7efd62 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 18:01:14 +0100 Subject: ASoC: wm8985: Fix enum ctl accesses in a wrong type "Equalizer Function" ctl in wm8985 codec driver is enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. 
Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/codecs/wm8985.c b/sound/soc/codecs/wm8985.c index 9918152..6ac76fe 100644 --- a/sound/soc/codecs/wm8985.c +++ b/sound/soc/codecs/wm8985.c @@ -531,9 +531,9 @@ static int eqmode_get(struct snd_kcontrol *kcontrol, reg = snd_soc_read(codec, WM8985_EQ1_LOW_SHELF); if (reg & WM8985_EQ3DMODE) - ucontrol->value.integer.value[0] = 1; + ucontrol->value.enumerated.item[0] = 1; else - ucontrol->value.integer.value[0] = 0; + ucontrol->value.enumerated.item[0] = 0; return 0; } @@ -545,18 +545,18 @@ static int eqmode_put(struct snd_kcontrol *kcontrol, unsigned int regpwr2, regpwr3; unsigned int reg_eq; - if (ucontrol->value.integer.value[0] != 0 - && ucontrol->value.integer.value[0] != 1) + if (ucontrol->value.enumerated.item[0] != 0 + && ucontrol->value.enumerated.item[0] != 1) return -EINVAL; reg_eq = snd_soc_read(codec, WM8985_EQ1_LOW_SHELF); switch ((reg_eq & WM8985_EQ3DMODE) >> WM8985_EQ3DMODE_SHIFT) { case 0: - if (!ucontrol->value.integer.value[0]) + if (!ucontrol->value.enumerated.item[0]) return 0; break; case 1: - if (ucontrol->value.integer.value[0]) + if (ucontrol->value.enumerated.item[0]) return 0; break; } @@ -573,7 +573,7 @@ static int eqmode_put(struct snd_kcontrol *kcontrol, /* set the desired eqmode */ snd_soc_update_bits(codec, WM8985_EQ1_LOW_SHELF, WM8985_EQ3DMODE_MASK, - ucontrol->value.integer.value[0] + ucontrol->value.enumerated.item[0] << WM8985_EQ3DMODE_SHIFT); /* restore DAC/ADC configuration */ snd_soc_write(codec, WM8985_POWER_MANAGEMENT_2, regpwr2); -- cgit v0.10.2 From 8019c0b37cd5a87107808300a496388b777225bf Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 18:01:15 +0100 Subject: ASoC: wm8994: Fix enum ctl accesses in a wrong type The DRC Mode like "AIF1DRC1 Mode" and EQ Mode like "AIF1.1 EQ Mode" in wm8994 codec driver are enum ctls, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown Cc: stable@vger.kernel.org diff --git a/sound/soc/codecs/wm8994.c b/sound/soc/codecs/wm8994.c index 2ccbb32..a18aecb 100644 --- a/sound/soc/codecs/wm8994.c +++ b/sound/soc/codecs/wm8994.c @@ -362,7 +362,7 @@ static int wm8994_put_drc_enum(struct snd_kcontrol *kcontrol, struct wm8994 *control = wm8994->wm8994; struct wm8994_pdata *pdata = &control->pdata; int drc = wm8994_get_drc(kcontrol->id.name); - int value = ucontrol->value.integer.value[0]; + int value = ucontrol->value.enumerated.item[0]; if (drc < 0) return drc; @@ -469,7 +469,7 @@ static int wm8994_put_retune_mobile_enum(struct snd_kcontrol *kcontrol, struct wm8994 *control = wm8994->wm8994; struct wm8994_pdata *pdata = &control->pdata; int block = wm8994_get_retune_mobile_block(kcontrol->id.name); - int value = ucontrol->value.integer.value[0]; + int value = ucontrol->value.enumerated.item[0]; if (block < 0) return block; -- cgit v0.10.2 From 8293004c81dd33e6e91afe2f9a773fe6796893cc Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 18:01:16 +0100 Subject: ASoC: wm8996: Fix enum ctl accesses in a wrong type "DSP1 EQ Mode" and "DSP2 EQ Mode" ctls in wm8996 codec driver are enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. 
Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/codecs/wm8996.c b/sound/soc/codecs/wm8996.c index 8d7d6c0..f99b34f 100644 --- a/sound/soc/codecs/wm8996.c +++ b/sound/soc/codecs/wm8996.c @@ -416,7 +416,7 @@ static int wm8996_put_retune_mobile_enum(struct snd_kcontrol *kcontrol, struct wm8996_priv *wm8996 = snd_soc_codec_get_drvdata(codec); struct wm8996_pdata *pdata = &wm8996->pdata; int block = wm8996_get_retune_mobile_block(kcontrol->id.name); - int value = ucontrol->value.integer.value[0]; + int value = ucontrol->value.enumerated.item[0]; if (block < 0) return block; -- cgit v0.10.2 From 39a79fe10482572ce76fd724b7915b3ee4cbd81e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 18:01:17 +0100 Subject: ASoC: wm9081: Fix enum ctl accesses in a wrong type "Speaker Mode "ctl in wm9081 codec driver is enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown diff --git a/sound/soc/codecs/wm9081.c b/sound/soc/codecs/wm9081.c index ccb3b15..363b3b6 100644 --- a/sound/soc/codecs/wm9081.c +++ b/sound/soc/codecs/wm9081.c @@ -344,9 +344,9 @@ static int speaker_mode_get(struct snd_kcontrol *kcontrol, reg = snd_soc_read(codec, WM9081_ANALOGUE_SPEAKER_2); if (reg & WM9081_SPK_MODE) - ucontrol->value.integer.value[0] = 1; + ucontrol->value.enumerated.item[0] = 1; else - ucontrol->value.integer.value[0] = 0; + ucontrol->value.enumerated.item[0] = 0; return 0; } @@ -365,7 +365,7 @@ static int speaker_mode_put(struct snd_kcontrol *kcontrol, unsigned int reg2 = snd_soc_read(codec, WM9081_ANALOGUE_SPEAKER_2); /* Are we changing anything? */ - if (ucontrol->value.integer.value[0] == + if (ucontrol->value.enumerated.item[0] == ((reg2 & WM9081_SPK_MODE) != 0)) return 0; @@ -373,7 +373,7 @@ static int speaker_mode_put(struct snd_kcontrol *kcontrol, if (reg_pwr & WM9081_SPK_ENA) return -EINVAL; - if (ucontrol->value.integer.value[0]) { + if (ucontrol->value.enumerated.item[0]) { /* Class AB */ reg2 &= ~(WM9081_SPK_INV_MUTE | WM9081_OUT_SPK_CTRL); reg2 |= WM9081_SPK_MODE; -- cgit v0.10.2 From 15c665700bf6f4543f003ac0fbb1e9ec692e93f2 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 29 Feb 2016 18:01:18 +0100 Subject: ASoC: wm_adsp: Fix enum ctl accesses in a wrong type The firmware ctls like "DSP1 Firmware" in wm_adsp codec driver are enum, while the current driver accesses wrongly via value.integer.value[]. They have to be via value.enumerated.item[] instead. 
Signed-off-by: Takashi Iwai Signed-off-by: Mark Brown Cc: stable@vger.kernel.org diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c index 33806d4..b9195b9 100644 --- a/sound/soc/codecs/wm_adsp.c +++ b/sound/soc/codecs/wm_adsp.c @@ -586,7 +586,7 @@ static int wm_adsp_fw_get(struct snd_kcontrol *kcontrol, struct soc_enum *e = (struct soc_enum *)kcontrol->private_value; struct wm_adsp *dsp = snd_soc_codec_get_drvdata(codec); - ucontrol->value.integer.value[0] = dsp[e->shift_l].fw; + ucontrol->value.enumerated.item[0] = dsp[e->shift_l].fw; return 0; } @@ -599,10 +599,10 @@ static int wm_adsp_fw_put(struct snd_kcontrol *kcontrol, struct wm_adsp *dsp = snd_soc_codec_get_drvdata(codec); int ret = 0; - if (ucontrol->value.integer.value[0] == dsp[e->shift_l].fw) + if (ucontrol->value.enumerated.item[0] == dsp[e->shift_l].fw) return 0; - if (ucontrol->value.integer.value[0] >= WM_ADSP_NUM_FW) + if (ucontrol->value.enumerated.item[0] >= WM_ADSP_NUM_FW) return -EINVAL; mutex_lock(&dsp[e->shift_l].pwr_lock); @@ -610,7 +610,7 @@ static int wm_adsp_fw_put(struct snd_kcontrol *kcontrol, if (dsp[e->shift_l].running || dsp[e->shift_l].compr) ret = -EBUSY; else - dsp[e->shift_l].fw = ucontrol->value.integer.value[0]; + dsp[e->shift_l].fw = ucontrol->value.enumerated.item[0]; mutex_unlock(&dsp[e->shift_l].pwr_lock); -- cgit v0.10.2 From 22be3b105597002ad3499bdb9c0aba49c9ab05f7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 17 Jan 2016 11:47:29 -0500 Subject: sparc32: Add -Wa,-Av8 to KBUILD_CFLAGS. Binutils used to be (erroneously) extremely permissive about instruction usage. But that got fixed and if you don't properly tell it to accept classes of instructions it will fail. This uncovered a specs bug on sparc in gcc where it wouldn't pass the proper options to binutils options. Deal with this in the kernel build by adding -Wa,-Av8 to KBUILD_CFLAGS. Reported-by: Al Viro Signed-off-by: David S. Miller diff --git a/arch/sparc/Makefile b/arch/sparc/Makefile index eaee146..8496a07 100644 --- a/arch/sparc/Makefile +++ b/arch/sparc/Makefile @@ -24,7 +24,13 @@ LDFLAGS := -m elf32_sparc export BITS := 32 UTS_MACHINE := sparc +# We are adding -Wa,-Av8 to KBUILD_CFLAGS to deal with a specs bug in some +# versions of gcc. Some gcc versions won't pass -Av8 to binutils when you +# give -mcpu=v8. This silently worked with older bintutils versions but +# does not any more. KBUILD_CFLAGS += -m32 -mcpu=v8 -pipe -mno-fpu -fcall-used-g5 -fcall-used-g7 +KBUILD_CFLAGS += -Wa,-Av8 + KBUILD_AFLAGS += -m32 -Wa,-Av8 else -- cgit v0.10.2 From 397d1533b6cce0ccb5379542e2e6d079f6936c46 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 1 Mar 2016 00:25:32 -0500 Subject: sparc64: Fix sparc64_set_context stack handling. Like a signal return, we should use synchronize_user_stack() rather than flush_user_windows(). Reported-by: Ilya Malakhov Signed-off-by: David S. 
Miller diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index d88beff4..39aaec1 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -52,7 +52,7 @@ asmlinkage void sparc64_set_context(struct pt_regs *regs) unsigned char fenab; int err; - flush_user_windows(); + synchronize_user_stack(); if (get_thread_wsaved() || (((unsigned long)ucp) & (sizeof(unsigned long)-1)) || (!__access_ok(ucp, sizeof(*ucp)))) -- cgit v0.10.2 From 5deef5551c77e488922cc4bf4bc76df63be650d0 Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Mon, 29 Feb 2016 15:36:11 +0100 Subject: USB: serial: option: add support for Telit LE922 PID 0x1045 This patch adds support for 0x1045 PID of Telit LE922. Signed-off-by: Daniele Palmas Cc: stable Signed-off-by: Johan Hovold diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 8849439a..8cea03f 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -270,6 +270,7 @@ static void option_instat_callback(struct urb *urb); #define TELIT_PRODUCT_UE910_V2 0x1012 #define TELIT_PRODUCT_LE922_USBCFG0 0x1042 #define TELIT_PRODUCT_LE922_USBCFG3 0x1043 +#define TELIT_PRODUCT_LE922_USBCFG5 0x1045 #define TELIT_PRODUCT_LE920 0x1200 #define TELIT_PRODUCT_LE910 0x1201 @@ -1183,6 +1184,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg0 }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE922_USBCFG3), .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg3 }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, TELIT_PRODUCT_LE922_USBCFG5, 0xff), + .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg0 }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910), .driver_info = (kernel_ulong_t)&telit_le910_blacklist }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920), -- cgit v0.10.2 From c0992d0f54847d0d1d85c60fcaa054f175ab1ccd Mon Sep 17 00:00:00 2001 From: Yegor Yefremov Date: Mon, 29 Feb 2016 16:39:57 +0100 Subject: USB: serial: option: add support for Quectel UC20 Add support for Quectel UC20 and blacklist the QMI interface. Signed-off-by: Yegor Yefremov Cc: stable [johan: amend commit message ] Signed-off-by: Johan Hovold diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 8cea03f..348e198 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1133,6 +1133,8 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE(QUALCOMM_VENDOR_ID, 0x6613)}, /* Onda H600/ZTE MF330 */ { USB_DEVICE(QUALCOMM_VENDOR_ID, 0x0023)}, /* ONYX 3G device */ { USB_DEVICE(QUALCOMM_VENDOR_ID, 0x9000)}, /* SIMCom SIM5218 */ + { USB_DEVICE(QUALCOMM_VENDOR_ID, 0x9003), /* Quectel UC20 */ + .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6001) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CMU_300) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6003), -- cgit v0.10.2 From 537b8a8695ea0fbc857fe68f99cd5294f8fca5e3 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 29 Feb 2016 18:56:02 +0100 Subject: Revert "USB: serial: add Moxa UPORT 11x0 driver" This reverts commit 0b2b093ad405b56a9e6f4f20a25da77ebfa9549c. Turns out the MOXA vendor driver was basically just a copy of the ti_usb_3410_5052 driver. We don't want two drivers for the same chip even if mxu11x0 had gotten some much needed clean up before merge. So let's remove the mxu11x0 driver, add support for these Moxa devices to the TI driver, and then clean that driver up instead. 
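For reference, a small sketch of how option-style ID tables are matched, with made-up vendor and product IDs; the real entries above additionally carry a pointer to a driver-private blacklist in driver_info so that the QMI/network interface is left to the network driver:

#include <linux/module.h>
#include <linux/usb.h>

#define EXAMPLE_VENDOR_ID	0x1234	/* hypothetical IDs, for illustration only */
#define EXAMPLE_PRODUCT_ID	0x5678

static const struct usb_device_id example_ids[] = {
	/* bind to every interface of the device */
	{ USB_DEVICE(EXAMPLE_VENDOR_ID, EXAMPLE_PRODUCT_ID) },
	/* bind only to vendor-specific (0xff) interfaces, as the Telit
	 * LE922 0x1045 entry does, so other interface classes are skipped */
	{ USB_DEVICE_INTERFACE_CLASS(EXAMPLE_VENDOR_ID, EXAMPLE_PRODUCT_ID, 0xff) },
	{ }	/* terminating entry */
};
MODULE_DEVICE_TABLE(usb, example_ids);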
Signed-off-by: Johan Hovold Acked-by: Greg Kroah-Hartman diff --git a/drivers/usb/serial/Kconfig b/drivers/usb/serial/Kconfig index f612dda..56ecb8b 100644 --- a/drivers/usb/serial/Kconfig +++ b/drivers/usb/serial/Kconfig @@ -475,22 +475,6 @@ config USB_SERIAL_MOS7840 To compile this driver as a module, choose M here: the module will be called mos7840. If unsure, choose N. -config USB_SERIAL_MXUPORT11 - tristate "USB Moxa UPORT 11x0 Serial Driver" - ---help--- - Say Y here if you want to use a MOXA UPort 11x0 Serial hub. - - This driver supports: - - - UPort 1110 : 1 port RS-232 USB to Serial Hub. - - UPort 1130 : 1 port RS-422/485 USB to Serial Hub. - - UPort 1130I : 1 port RS-422/485 USB to Serial Hub with Isolation. - - UPort 1150 : 1 port RS-232/422/485 USB to Serial Hub. - - UPort 1150I : 1 port RS-232/422/485 USB to Serial Hub with Isolation. - - To compile this driver as a module, choose M here: the - module will be called mxu11x0. - config USB_SERIAL_MXUPORT tristate "USB Moxa UPORT Serial Driver" ---help--- diff --git a/drivers/usb/serial/Makefile b/drivers/usb/serial/Makefile index f3fa5e5..349d9df 100644 --- a/drivers/usb/serial/Makefile +++ b/drivers/usb/serial/Makefile @@ -38,7 +38,6 @@ obj-$(CONFIG_USB_SERIAL_METRO) += metro-usb.o obj-$(CONFIG_USB_SERIAL_MOS7720) += mos7720.o obj-$(CONFIG_USB_SERIAL_MOS7840) += mos7840.o obj-$(CONFIG_USB_SERIAL_MXUPORT) += mxuport.o -obj-$(CONFIG_USB_SERIAL_MXUPORT11) += mxu11x0.o obj-$(CONFIG_USB_SERIAL_NAVMAN) += navman.o obj-$(CONFIG_USB_SERIAL_OMNINET) += omninet.o obj-$(CONFIG_USB_SERIAL_OPTICON) += opticon.o diff --git a/drivers/usb/serial/mxu11x0.c b/drivers/usb/serial/mxu11x0.c deleted file mode 100644 index 6196073..0000000 --- a/drivers/usb/serial/mxu11x0.c +++ /dev/null @@ -1,1006 +0,0 @@ -/* - * USB Moxa UPORT 11x0 Serial Driver - * - * Copyright (C) 2007 MOXA Technologies Co., Ltd. - * Copyright (C) 2015 Mathieu Othacehe - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * - * Supports the following Moxa USB to serial converters: - * UPort 1110, 1 port RS-232 USB to Serial Hub. - * UPort 1130, 1 port RS-422/485 USB to Serial Hub. - * UPort 1130I, 1 port RS-422/485 USB to Serial Hub with isolation - * protection. - * UPort 1150, 1 port RS-232/422/485 USB to Serial Hub. - * UPort 1150I, 1 port RS-232/422/485 USB to Serial Hub with isolation - * protection. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Vendor and product ids */ -#define MXU1_VENDOR_ID 0x110a -#define MXU1_1110_PRODUCT_ID 0x1110 -#define MXU1_1130_PRODUCT_ID 0x1130 -#define MXU1_1150_PRODUCT_ID 0x1150 -#define MXU1_1151_PRODUCT_ID 0x1151 -#define MXU1_1131_PRODUCT_ID 0x1131 - -/* Commands */ -#define MXU1_GET_VERSION 0x01 -#define MXU1_GET_PORT_STATUS 0x02 -#define MXU1_GET_PORT_DEV_INFO 0x03 -#define MXU1_GET_CONFIG 0x04 -#define MXU1_SET_CONFIG 0x05 -#define MXU1_OPEN_PORT 0x06 -#define MXU1_CLOSE_PORT 0x07 -#define MXU1_START_PORT 0x08 -#define MXU1_STOP_PORT 0x09 -#define MXU1_TEST_PORT 0x0A -#define MXU1_PURGE_PORT 0x0B -#define MXU1_RESET_EXT_DEVICE 0x0C -#define MXU1_GET_OUTQUEUE 0x0D -#define MXU1_WRITE_DATA 0x80 -#define MXU1_READ_DATA 0x81 -#define MXU1_REQ_TYPE_CLASS 0x82 - -/* Module identifiers */ -#define MXU1_I2C_PORT 0x01 -#define MXU1_IEEE1284_PORT 0x02 -#define MXU1_UART1_PORT 0x03 -#define MXU1_UART2_PORT 0x04 -#define MXU1_RAM_PORT 0x05 - -/* Modem status */ -#define MXU1_MSR_DELTA_CTS 0x01 -#define MXU1_MSR_DELTA_DSR 0x02 -#define MXU1_MSR_DELTA_RI 0x04 -#define MXU1_MSR_DELTA_CD 0x08 -#define MXU1_MSR_CTS 0x10 -#define MXU1_MSR_DSR 0x20 -#define MXU1_MSR_RI 0x40 -#define MXU1_MSR_CD 0x80 -#define MXU1_MSR_DELTA_MASK 0x0F -#define MXU1_MSR_MASK 0xF0 - -/* Line status */ -#define MXU1_LSR_OVERRUN_ERROR 0x01 -#define MXU1_LSR_PARITY_ERROR 0x02 -#define MXU1_LSR_FRAMING_ERROR 0x04 -#define MXU1_LSR_BREAK 0x08 -#define MXU1_LSR_ERROR 0x0F -#define MXU1_LSR_RX_FULL 0x10 -#define MXU1_LSR_TX_EMPTY 0x20 - -/* Modem control */ -#define MXU1_MCR_LOOP 0x04 -#define MXU1_MCR_DTR 0x10 -#define MXU1_MCR_RTS 0x20 - -/* Mask settings */ -#define MXU1_UART_ENABLE_RTS_IN 0x0001 -#define MXU1_UART_DISABLE_RTS 0x0002 -#define MXU1_UART_ENABLE_PARITY_CHECKING 0x0008 -#define MXU1_UART_ENABLE_DSR_OUT 0x0010 -#define MXU1_UART_ENABLE_CTS_OUT 0x0020 -#define MXU1_UART_ENABLE_X_OUT 0x0040 -#define MXU1_UART_ENABLE_XA_OUT 0x0080 -#define MXU1_UART_ENABLE_X_IN 0x0100 -#define MXU1_UART_ENABLE_DTR_IN 0x0800 -#define MXU1_UART_DISABLE_DTR 0x1000 -#define MXU1_UART_ENABLE_MS_INTS 0x2000 -#define MXU1_UART_ENABLE_AUTO_START_DMA 0x4000 -#define MXU1_UART_SEND_BREAK_SIGNAL 0x8000 - -/* Parity */ -#define MXU1_UART_NO_PARITY 0x00 -#define MXU1_UART_ODD_PARITY 0x01 -#define MXU1_UART_EVEN_PARITY 0x02 -#define MXU1_UART_MARK_PARITY 0x03 -#define MXU1_UART_SPACE_PARITY 0x04 - -/* Stop bits */ -#define MXU1_UART_1_STOP_BITS 0x00 -#define MXU1_UART_1_5_STOP_BITS 0x01 -#define MXU1_UART_2_STOP_BITS 0x02 - -/* Bits per character */ -#define MXU1_UART_5_DATA_BITS 0x00 -#define MXU1_UART_6_DATA_BITS 0x01 -#define MXU1_UART_7_DATA_BITS 0x02 -#define MXU1_UART_8_DATA_BITS 0x03 - -/* Operation modes */ -#define MXU1_UART_232 0x00 -#define MXU1_UART_485_RECEIVER_DISABLED 0x01 -#define MXU1_UART_485_RECEIVER_ENABLED 0x02 - -/* Pipe transfer mode and timeout */ -#define MXU1_PIPE_MODE_CONTINUOUS 0x01 -#define MXU1_PIPE_MODE_MASK 0x03 -#define MXU1_PIPE_TIMEOUT_MASK 0x7C -#define MXU1_PIPE_TIMEOUT_ENABLE 0x80 - -/* Config struct */ -struct mxu1_uart_config { - __be16 wBaudRate; - __be16 wFlags; - u8 bDataBits; - u8 bParity; - u8 bStopBits; - char cXon; - char cXoff; - u8 bUartMode; -} __packed; - -/* Purge modes */ -#define MXU1_PURGE_OUTPUT 0x00 -#define MXU1_PURGE_INPUT 0x80 - -/* Read/Write data */ -#define MXU1_RW_DATA_ADDR_SFR 0x10 -#define MXU1_RW_DATA_ADDR_IDATA 0x20 -#define 
MXU1_RW_DATA_ADDR_XDATA 0x30 -#define MXU1_RW_DATA_ADDR_CODE 0x40 -#define MXU1_RW_DATA_ADDR_GPIO 0x50 -#define MXU1_RW_DATA_ADDR_I2C 0x60 -#define MXU1_RW_DATA_ADDR_FLASH 0x70 -#define MXU1_RW_DATA_ADDR_DSP 0x80 - -#define MXU1_RW_DATA_UNSPECIFIED 0x00 -#define MXU1_RW_DATA_BYTE 0x01 -#define MXU1_RW_DATA_WORD 0x02 -#define MXU1_RW_DATA_DOUBLE_WORD 0x04 - -struct mxu1_write_data_bytes { - u8 bAddrType; - u8 bDataType; - u8 bDataCounter; - __be16 wBaseAddrHi; - __be16 wBaseAddrLo; - u8 bData[0]; -} __packed; - -/* Interrupt codes */ -#define MXU1_CODE_HARDWARE_ERROR 0xFF -#define MXU1_CODE_DATA_ERROR 0x03 -#define MXU1_CODE_MODEM_STATUS 0x04 - -static inline int mxu1_get_func_from_code(unsigned char code) -{ - return code & 0x0f; -} - -/* Download firmware max packet size */ -#define MXU1_DOWNLOAD_MAX_PACKET_SIZE 64 - -/* Firmware image header */ -struct mxu1_firmware_header { - __le16 wLength; - u8 bCheckSum; -} __packed; - -#define MXU1_UART_BASE_ADDR 0xFFA0 -#define MXU1_UART_OFFSET_MCR 0x0004 - -#define MXU1_BAUD_BASE 923077 - -#define MXU1_TRANSFER_TIMEOUT 2 -#define MXU1_DOWNLOAD_TIMEOUT 1000 -#define MXU1_DEFAULT_CLOSING_WAIT 4000 /* in .01 secs */ - -struct mxu1_port { - u8 msr; - u8 mcr; - u8 uart_mode; - spinlock_t spinlock; /* Protects msr */ - struct mutex mutex; /* Protects mcr */ - bool send_break; -}; - -struct mxu1_device { - u16 mxd_model; -}; - -static const struct usb_device_id mxu1_idtable[] = { - { USB_DEVICE(MXU1_VENDOR_ID, MXU1_1110_PRODUCT_ID) }, - { USB_DEVICE(MXU1_VENDOR_ID, MXU1_1130_PRODUCT_ID) }, - { USB_DEVICE(MXU1_VENDOR_ID, MXU1_1150_PRODUCT_ID) }, - { USB_DEVICE(MXU1_VENDOR_ID, MXU1_1151_PRODUCT_ID) }, - { USB_DEVICE(MXU1_VENDOR_ID, MXU1_1131_PRODUCT_ID) }, - { } -}; - -MODULE_DEVICE_TABLE(usb, mxu1_idtable); - -/* Write the given buffer out to the control pipe. 
*/ -static int mxu1_send_ctrl_data_urb(struct usb_serial *serial, - u8 request, - u16 value, u16 index, - void *data, size_t size) -{ - int status; - - status = usb_control_msg(serial->dev, - usb_sndctrlpipe(serial->dev, 0), - request, - (USB_DIR_OUT | USB_TYPE_VENDOR | - USB_RECIP_DEVICE), value, index, - data, size, - USB_CTRL_SET_TIMEOUT); - if (status < 0) { - dev_err(&serial->interface->dev, - "%s - usb_control_msg failed: %d\n", - __func__, status); - return status; - } - - if (status != size) { - dev_err(&serial->interface->dev, - "%s - short write (%d / %zd)\n", - __func__, status, size); - return -EIO; - } - - return 0; -} - -/* Send a vendor request without any data */ -static int mxu1_send_ctrl_urb(struct usb_serial *serial, - u8 request, u16 value, u16 index) -{ - return mxu1_send_ctrl_data_urb(serial, request, value, index, - NULL, 0); -} - -static int mxu1_download_firmware(struct usb_serial *serial, - const struct firmware *fw_p) -{ - int status = 0; - int buffer_size; - int pos; - int len; - int done; - u8 cs = 0; - u8 *buffer; - struct usb_device *dev = serial->dev; - struct mxu1_firmware_header *header; - unsigned int pipe; - - pipe = usb_sndbulkpipe(dev, serial->port[0]->bulk_out_endpointAddress); - - buffer_size = fw_p->size + sizeof(*header); - buffer = kmalloc(buffer_size, GFP_KERNEL); - if (!buffer) - return -ENOMEM; - - memcpy(buffer, fw_p->data, fw_p->size); - memset(buffer + fw_p->size, 0xff, buffer_size - fw_p->size); - - for (pos = sizeof(*header); pos < buffer_size; pos++) - cs = (u8)(cs + buffer[pos]); - - header = (struct mxu1_firmware_header *)buffer; - header->wLength = cpu_to_le16(buffer_size - sizeof(*header)); - header->bCheckSum = cs; - - dev_dbg(&dev->dev, "%s - downloading firmware\n", __func__); - - for (pos = 0; pos < buffer_size; pos += done) { - len = min(buffer_size - pos, MXU1_DOWNLOAD_MAX_PACKET_SIZE); - - status = usb_bulk_msg(dev, pipe, buffer + pos, len, &done, - MXU1_DOWNLOAD_TIMEOUT); - if (status) - break; - } - - kfree(buffer); - - if (status) { - dev_err(&dev->dev, "failed to download firmware: %d\n", status); - return status; - } - - msleep_interruptible(100); - usb_reset_device(dev); - - dev_dbg(&dev->dev, "%s - download successful\n", __func__); - - return 0; -} - -static int mxu1_port_probe(struct usb_serial_port *port) -{ - struct mxu1_port *mxport; - struct mxu1_device *mxdev; - - if (!port->interrupt_in_urb) { - dev_err(&port->dev, "no interrupt urb\n"); - return -ENODEV; - } - - mxport = kzalloc(sizeof(struct mxu1_port), GFP_KERNEL); - if (!mxport) - return -ENOMEM; - - spin_lock_init(&mxport->spinlock); - mutex_init(&mxport->mutex); - - mxdev = usb_get_serial_data(port->serial); - - switch (mxdev->mxd_model) { - case MXU1_1110_PRODUCT_ID: - case MXU1_1150_PRODUCT_ID: - case MXU1_1151_PRODUCT_ID: - mxport->uart_mode = MXU1_UART_232; - break; - case MXU1_1130_PRODUCT_ID: - case MXU1_1131_PRODUCT_ID: - mxport->uart_mode = MXU1_UART_485_RECEIVER_DISABLED; - break; - } - - usb_set_serial_port_data(port, mxport); - - port->port.closing_wait = - msecs_to_jiffies(MXU1_DEFAULT_CLOSING_WAIT * 10); - port->port.drain_delay = 1; - - return 0; -} - -static int mxu1_port_remove(struct usb_serial_port *port) -{ - struct mxu1_port *mxport; - - mxport = usb_get_serial_port_data(port); - kfree(mxport); - - return 0; -} - -static int mxu1_startup(struct usb_serial *serial) -{ - struct mxu1_device *mxdev; - struct usb_device *dev = serial->dev; - struct usb_host_interface *cur_altsetting; - char fw_name[32]; - const struct firmware *fw_p = NULL; - 
int err; - - dev_dbg(&serial->interface->dev, "%s - product 0x%04X, num configurations %d, configuration value %d\n", - __func__, le16_to_cpu(dev->descriptor.idProduct), - dev->descriptor.bNumConfigurations, - dev->actconfig->desc.bConfigurationValue); - - /* create device structure */ - mxdev = kzalloc(sizeof(struct mxu1_device), GFP_KERNEL); - if (!mxdev) - return -ENOMEM; - - usb_set_serial_data(serial, mxdev); - - mxdev->mxd_model = le16_to_cpu(dev->descriptor.idProduct); - - cur_altsetting = serial->interface->cur_altsetting; - - /* if we have only 1 configuration, download firmware */ - if (cur_altsetting->desc.bNumEndpoints == 1) { - - snprintf(fw_name, - sizeof(fw_name), - "moxa/moxa-%04x.fw", - mxdev->mxd_model); - - err = request_firmware(&fw_p, fw_name, &serial->interface->dev); - if (err) { - dev_err(&serial->interface->dev, "failed to request firmware: %d\n", - err); - goto err_free_mxdev; - } - - err = mxu1_download_firmware(serial, fw_p); - if (err) - goto err_release_firmware; - - /* device is being reset */ - err = -ENODEV; - goto err_release_firmware; - } - - return 0; - -err_release_firmware: - release_firmware(fw_p); -err_free_mxdev: - kfree(mxdev); - - return err; -} - -static void mxu1_release(struct usb_serial *serial) -{ - struct mxu1_device *mxdev; - - mxdev = usb_get_serial_data(serial); - kfree(mxdev); -} - -static int mxu1_write_byte(struct usb_serial_port *port, u32 addr, - u8 mask, u8 byte) -{ - int status; - size_t size; - struct mxu1_write_data_bytes *data; - - dev_dbg(&port->dev, "%s - addr 0x%08X, mask 0x%02X, byte 0x%02X\n", - __func__, addr, mask, byte); - - size = sizeof(struct mxu1_write_data_bytes) + 2; - data = kzalloc(size, GFP_KERNEL); - if (!data) - return -ENOMEM; - - data->bAddrType = MXU1_RW_DATA_ADDR_XDATA; - data->bDataType = MXU1_RW_DATA_BYTE; - data->bDataCounter = 1; - data->wBaseAddrHi = cpu_to_be16(addr >> 16); - data->wBaseAddrLo = cpu_to_be16(addr); - data->bData[0] = mask; - data->bData[1] = byte; - - status = mxu1_send_ctrl_data_urb(port->serial, MXU1_WRITE_DATA, 0, - MXU1_RAM_PORT, data, size); - if (status < 0) - dev_err(&port->dev, "%s - failed: %d\n", __func__, status); - - kfree(data); - - return status; -} - -static int mxu1_set_mcr(struct usb_serial_port *port, unsigned int mcr) -{ - int status; - - status = mxu1_write_byte(port, - MXU1_UART_BASE_ADDR + MXU1_UART_OFFSET_MCR, - MXU1_MCR_RTS | MXU1_MCR_DTR | MXU1_MCR_LOOP, - mcr); - return status; -} - -static void mxu1_set_termios(struct tty_struct *tty, - struct usb_serial_port *port, - struct ktermios *old_termios) -{ - struct mxu1_port *mxport = usb_get_serial_port_data(port); - struct mxu1_uart_config *config; - tcflag_t cflag, iflag; - speed_t baud; - int status; - unsigned int mcr; - - cflag = tty->termios.c_cflag; - iflag = tty->termios.c_iflag; - - if (old_termios && - !tty_termios_hw_change(&tty->termios, old_termios) && - tty->termios.c_iflag == old_termios->c_iflag) { - dev_dbg(&port->dev, "%s - nothing to change\n", __func__); - return; - } - - dev_dbg(&port->dev, - "%s - cflag 0x%08x, iflag 0x%08x\n", __func__, cflag, iflag); - - if (old_termios) { - dev_dbg(&port->dev, "%s - old cflag 0x%08x, old iflag 0x%08x\n", - __func__, - old_termios->c_cflag, - old_termios->c_iflag); - } - - config = kzalloc(sizeof(*config), GFP_KERNEL); - if (!config) - return; - - /* these flags must be set */ - config->wFlags |= MXU1_UART_ENABLE_MS_INTS; - config->wFlags |= MXU1_UART_ENABLE_AUTO_START_DMA; - if (mxport->send_break) - config->wFlags |= MXU1_UART_SEND_BREAK_SIGNAL; - 
config->bUartMode = mxport->uart_mode; - - switch (C_CSIZE(tty)) { - case CS5: - config->bDataBits = MXU1_UART_5_DATA_BITS; - break; - case CS6: - config->bDataBits = MXU1_UART_6_DATA_BITS; - break; - case CS7: - config->bDataBits = MXU1_UART_7_DATA_BITS; - break; - default: - case CS8: - config->bDataBits = MXU1_UART_8_DATA_BITS; - break; - } - - if (C_PARENB(tty)) { - config->wFlags |= MXU1_UART_ENABLE_PARITY_CHECKING; - if (C_CMSPAR(tty)) { - if (C_PARODD(tty)) - config->bParity = MXU1_UART_MARK_PARITY; - else - config->bParity = MXU1_UART_SPACE_PARITY; - } else { - if (C_PARODD(tty)) - config->bParity = MXU1_UART_ODD_PARITY; - else - config->bParity = MXU1_UART_EVEN_PARITY; - } - } else { - config->bParity = MXU1_UART_NO_PARITY; - } - - if (C_CSTOPB(tty)) - config->bStopBits = MXU1_UART_2_STOP_BITS; - else - config->bStopBits = MXU1_UART_1_STOP_BITS; - - if (C_CRTSCTS(tty)) { - /* RTS flow control must be off to drop RTS for baud rate B0 */ - if (C_BAUD(tty) != B0) - config->wFlags |= MXU1_UART_ENABLE_RTS_IN; - config->wFlags |= MXU1_UART_ENABLE_CTS_OUT; - } - - if (I_IXOFF(tty) || I_IXON(tty)) { - config->cXon = START_CHAR(tty); - config->cXoff = STOP_CHAR(tty); - - if (I_IXOFF(tty)) - config->wFlags |= MXU1_UART_ENABLE_X_IN; - - if (I_IXON(tty)) - config->wFlags |= MXU1_UART_ENABLE_X_OUT; - } - - baud = tty_get_baud_rate(tty); - if (!baud) - baud = 9600; - config->wBaudRate = MXU1_BAUD_BASE / baud; - - dev_dbg(&port->dev, "%s - BaudRate=%d, wBaudRate=%d, wFlags=0x%04X, bDataBits=%d, bParity=%d, bStopBits=%d, cXon=%d, cXoff=%d, bUartMode=%d\n", - __func__, baud, config->wBaudRate, config->wFlags, - config->bDataBits, config->bParity, config->bStopBits, - config->cXon, config->cXoff, config->bUartMode); - - cpu_to_be16s(&config->wBaudRate); - cpu_to_be16s(&config->wFlags); - - status = mxu1_send_ctrl_data_urb(port->serial, MXU1_SET_CONFIG, 0, - MXU1_UART1_PORT, config, - sizeof(*config)); - if (status) - dev_err(&port->dev, "cannot set config: %d\n", status); - - mutex_lock(&mxport->mutex); - mcr = mxport->mcr; - - if (C_BAUD(tty) == B0) - mcr &= ~(MXU1_MCR_DTR | MXU1_MCR_RTS); - else if (old_termios && (old_termios->c_cflag & CBAUD) == B0) - mcr |= MXU1_MCR_DTR | MXU1_MCR_RTS; - - status = mxu1_set_mcr(port, mcr); - if (status) - dev_err(&port->dev, "cannot set modem control: %d\n", status); - else - mxport->mcr = mcr; - - mutex_unlock(&mxport->mutex); - - kfree(config); -} - -static int mxu1_get_serial_info(struct usb_serial_port *port, - struct serial_struct __user *ret_arg) -{ - struct serial_struct ret_serial; - unsigned cwait; - - if (!ret_arg) - return -EFAULT; - - cwait = port->port.closing_wait; - if (cwait != ASYNC_CLOSING_WAIT_NONE) - cwait = jiffies_to_msecs(cwait) / 10; - - memset(&ret_serial, 0, sizeof(ret_serial)); - - ret_serial.type = PORT_16550A; - ret_serial.line = port->minor; - ret_serial.port = 0; - ret_serial.xmit_fifo_size = port->bulk_out_size; - ret_serial.baud_base = MXU1_BAUD_BASE; - ret_serial.close_delay = 5*HZ; - ret_serial.closing_wait = cwait; - - if (copy_to_user(ret_arg, &ret_serial, sizeof(*ret_arg))) - return -EFAULT; - - return 0; -} - - -static int mxu1_set_serial_info(struct usb_serial_port *port, - struct serial_struct __user *new_arg) -{ - struct serial_struct new_serial; - unsigned cwait; - - if (copy_from_user(&new_serial, new_arg, sizeof(new_serial))) - return -EFAULT; - - cwait = new_serial.closing_wait; - if (cwait != ASYNC_CLOSING_WAIT_NONE) - cwait = msecs_to_jiffies(10 * new_serial.closing_wait); - - port->port.closing_wait = cwait; - - 
return 0; -} - -static int mxu1_ioctl(struct tty_struct *tty, - unsigned int cmd, unsigned long arg) -{ - struct usb_serial_port *port = tty->driver_data; - - switch (cmd) { - case TIOCGSERIAL: - return mxu1_get_serial_info(port, - (struct serial_struct __user *)arg); - case TIOCSSERIAL: - return mxu1_set_serial_info(port, - (struct serial_struct __user *)arg); - } - - return -ENOIOCTLCMD; -} - -static int mxu1_tiocmget(struct tty_struct *tty) -{ - struct usb_serial_port *port = tty->driver_data; - struct mxu1_port *mxport = usb_get_serial_port_data(port); - unsigned int result; - unsigned int msr; - unsigned int mcr; - unsigned long flags; - - mutex_lock(&mxport->mutex); - spin_lock_irqsave(&mxport->spinlock, flags); - - msr = mxport->msr; - mcr = mxport->mcr; - - spin_unlock_irqrestore(&mxport->spinlock, flags); - mutex_unlock(&mxport->mutex); - - result = ((mcr & MXU1_MCR_DTR) ? TIOCM_DTR : 0) | - ((mcr & MXU1_MCR_RTS) ? TIOCM_RTS : 0) | - ((mcr & MXU1_MCR_LOOP) ? TIOCM_LOOP : 0) | - ((msr & MXU1_MSR_CTS) ? TIOCM_CTS : 0) | - ((msr & MXU1_MSR_CD) ? TIOCM_CAR : 0) | - ((msr & MXU1_MSR_RI) ? TIOCM_RI : 0) | - ((msr & MXU1_MSR_DSR) ? TIOCM_DSR : 0); - - dev_dbg(&port->dev, "%s - 0x%04X\n", __func__, result); - - return result; -} - -static int mxu1_tiocmset(struct tty_struct *tty, - unsigned int set, unsigned int clear) -{ - struct usb_serial_port *port = tty->driver_data; - struct mxu1_port *mxport = usb_get_serial_port_data(port); - int err; - unsigned int mcr; - - mutex_lock(&mxport->mutex); - mcr = mxport->mcr; - - if (set & TIOCM_RTS) - mcr |= MXU1_MCR_RTS; - if (set & TIOCM_DTR) - mcr |= MXU1_MCR_DTR; - if (set & TIOCM_LOOP) - mcr |= MXU1_MCR_LOOP; - - if (clear & TIOCM_RTS) - mcr &= ~MXU1_MCR_RTS; - if (clear & TIOCM_DTR) - mcr &= ~MXU1_MCR_DTR; - if (clear & TIOCM_LOOP) - mcr &= ~MXU1_MCR_LOOP; - - err = mxu1_set_mcr(port, mcr); - if (!err) - mxport->mcr = mcr; - - mutex_unlock(&mxport->mutex); - - return err; -} - -static void mxu1_break(struct tty_struct *tty, int break_state) -{ - struct usb_serial_port *port = tty->driver_data; - struct mxu1_port *mxport = usb_get_serial_port_data(port); - - if (break_state == -1) - mxport->send_break = true; - else - mxport->send_break = false; - - mxu1_set_termios(tty, port, NULL); -} - -static int mxu1_open(struct tty_struct *tty, struct usb_serial_port *port) -{ - struct mxu1_port *mxport = usb_get_serial_port_data(port); - struct usb_serial *serial = port->serial; - int status; - u16 open_settings; - - open_settings = (MXU1_PIPE_MODE_CONTINUOUS | - MXU1_PIPE_TIMEOUT_ENABLE | - (MXU1_TRANSFER_TIMEOUT << 2)); - - mxport->msr = 0; - - status = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL); - if (status) { - dev_err(&port->dev, "failed to submit interrupt urb: %d\n", - status); - return status; - } - - if (tty) - mxu1_set_termios(tty, port, NULL); - - status = mxu1_send_ctrl_urb(serial, MXU1_OPEN_PORT, - open_settings, MXU1_UART1_PORT); - if (status) { - dev_err(&port->dev, "cannot send open command: %d\n", status); - goto unlink_int_urb; - } - - status = mxu1_send_ctrl_urb(serial, MXU1_START_PORT, - 0, MXU1_UART1_PORT); - if (status) { - dev_err(&port->dev, "cannot send start command: %d\n", status); - goto unlink_int_urb; - } - - status = mxu1_send_ctrl_urb(serial, MXU1_PURGE_PORT, - MXU1_PURGE_INPUT, MXU1_UART1_PORT); - if (status) { - dev_err(&port->dev, "cannot clear input buffers: %d\n", - status); - - goto unlink_int_urb; - } - - status = mxu1_send_ctrl_urb(serial, MXU1_PURGE_PORT, - MXU1_PURGE_OUTPUT, MXU1_UART1_PORT); - if 
(status) { - dev_err(&port->dev, "cannot clear output buffers: %d\n", - status); - - goto unlink_int_urb; - } - - /* - * reset the data toggle on the bulk endpoints to work around bug in - * host controllers where things get out of sync some times - */ - usb_clear_halt(serial->dev, port->write_urb->pipe); - usb_clear_halt(serial->dev, port->read_urb->pipe); - - if (tty) - mxu1_set_termios(tty, port, NULL); - - status = mxu1_send_ctrl_urb(serial, MXU1_OPEN_PORT, - open_settings, MXU1_UART1_PORT); - if (status) { - dev_err(&port->dev, "cannot send open command: %d\n", status); - goto unlink_int_urb; - } - - status = mxu1_send_ctrl_urb(serial, MXU1_START_PORT, - 0, MXU1_UART1_PORT); - if (status) { - dev_err(&port->dev, "cannot send start command: %d\n", status); - goto unlink_int_urb; - } - - status = usb_serial_generic_open(tty, port); - if (status) - goto unlink_int_urb; - - return 0; - -unlink_int_urb: - usb_kill_urb(port->interrupt_in_urb); - - return status; -} - -static void mxu1_close(struct usb_serial_port *port) -{ - int status; - - usb_serial_generic_close(port); - usb_kill_urb(port->interrupt_in_urb); - - status = mxu1_send_ctrl_urb(port->serial, MXU1_CLOSE_PORT, - 0, MXU1_UART1_PORT); - if (status) { - dev_err(&port->dev, "failed to send close port command: %d\n", - status); - } -} - -static void mxu1_handle_new_msr(struct usb_serial_port *port, u8 msr) -{ - struct mxu1_port *mxport = usb_get_serial_port_data(port); - struct async_icount *icount; - unsigned long flags; - - dev_dbg(&port->dev, "%s - msr 0x%02X\n", __func__, msr); - - spin_lock_irqsave(&mxport->spinlock, flags); - mxport->msr = msr & MXU1_MSR_MASK; - spin_unlock_irqrestore(&mxport->spinlock, flags); - - if (msr & MXU1_MSR_DELTA_MASK) { - icount = &port->icount; - if (msr & MXU1_MSR_DELTA_CTS) - icount->cts++; - if (msr & MXU1_MSR_DELTA_DSR) - icount->dsr++; - if (msr & MXU1_MSR_DELTA_CD) - icount->dcd++; - if (msr & MXU1_MSR_DELTA_RI) - icount->rng++; - - wake_up_interruptible(&port->port.delta_msr_wait); - } -} - -static void mxu1_interrupt_callback(struct urb *urb) -{ - struct usb_serial_port *port = urb->context; - unsigned char *data = urb->transfer_buffer; - int length = urb->actual_length; - int function; - int status; - u8 msr; - - switch (urb->status) { - case 0: - break; - case -ECONNRESET: - case -ENOENT: - case -ESHUTDOWN: - dev_dbg(&port->dev, "%s - urb shutting down: %d\n", - __func__, urb->status); - return; - default: - dev_dbg(&port->dev, "%s - nonzero urb status: %d\n", - __func__, urb->status); - goto exit; - } - - if (length != 2) { - dev_dbg(&port->dev, "%s - bad packet size: %d\n", - __func__, length); - goto exit; - } - - if (data[0] == MXU1_CODE_HARDWARE_ERROR) { - dev_err(&port->dev, "hardware error: %d\n", data[1]); - goto exit; - } - - function = mxu1_get_func_from_code(data[0]); - - dev_dbg(&port->dev, "%s - function %d, data 0x%02X\n", - __func__, function, data[1]); - - switch (function) { - case MXU1_CODE_DATA_ERROR: - dev_dbg(&port->dev, "%s - DATA ERROR, data 0x%02X\n", - __func__, data[1]); - break; - - case MXU1_CODE_MODEM_STATUS: - msr = data[1]; - mxu1_handle_new_msr(port, msr); - break; - - default: - dev_err(&port->dev, "unknown interrupt code: 0x%02X\n", - data[1]); - break; - } - -exit: - status = usb_submit_urb(urb, GFP_ATOMIC); - if (status) { - dev_err(&port->dev, "resubmit interrupt urb failed: %d\n", - status); - } -} - -static struct usb_serial_driver mxu11x0_device = { - .driver = { - .owner = THIS_MODULE, - .name = "mxu11x0", - }, - .description = "MOXA UPort 11x0", - 
.id_table = mxu1_idtable, - .num_ports = 1, - .port_probe = mxu1_port_probe, - .port_remove = mxu1_port_remove, - .attach = mxu1_startup, - .release = mxu1_release, - .open = mxu1_open, - .close = mxu1_close, - .ioctl = mxu1_ioctl, - .set_termios = mxu1_set_termios, - .tiocmget = mxu1_tiocmget, - .tiocmset = mxu1_tiocmset, - .tiocmiwait = usb_serial_generic_tiocmiwait, - .get_icount = usb_serial_generic_get_icount, - .break_ctl = mxu1_break, - .read_int_callback = mxu1_interrupt_callback, -}; - -static struct usb_serial_driver *const serial_drivers[] = { - &mxu11x0_device, NULL -}; - -module_usb_serial_driver(serial_drivers, mxu1_idtable); - -MODULE_AUTHOR("Mathieu Othacehe "); -MODULE_DESCRIPTION("MOXA UPort 11x0 USB to Serial Hub Driver"); -MODULE_LICENSE("GPL"); -MODULE_FIRMWARE("moxa/moxa-1110.fw"); -MODULE_FIRMWARE("moxa/moxa-1130.fw"); -MODULE_FIRMWARE("moxa/moxa-1131.fw"); -MODULE_FIRMWARE("moxa/moxa-1150.fw"); -MODULE_FIRMWARE("moxa/moxa-1151.fw"); -- cgit v0.10.2 From 17e2df4613be57d0fab68df749f6b8114e453152 Mon Sep 17 00:00:00 2001 From: Dennis Kadioglu Date: Tue, 1 Mar 2016 14:23:29 +0100 Subject: ALSA: usb-audio: Add a quirk for Plantronics DA45 Plantronics DA45 does not support reading the sample rate which leads to many lines of "cannot get freq at ep 0x4" and "cannot get freq at ep 0x84". This patch adds the USB ID of the DA45 to quirks.c and avoids those error messages. Signed-off-by: Dennis Kadioglu Cc: Signed-off-by: Takashi Iwai diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 4f6ce1c..c458d60 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1124,6 +1124,7 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip) case USB_ID(0x045E, 0x076F): /* MS Lifecam HD-6000 */ case USB_ID(0x045E, 0x0772): /* MS Lifecam Studio */ case USB_ID(0x045E, 0x0779): /* MS Lifecam HD-3000 */ + case USB_ID(0x047F, 0xAA05): /* Plantronics DA45 */ case USB_ID(0x04D8, 0xFEEA): /* Benchmark DAC1 Pre */ case USB_ID(0x074D, 0x3553): /* Outlaw RR2150 (Micronas UAC3553B) */ case USB_ID(0x21B4, 0x0081): /* AudioQuest DragonFly */ -- cgit v0.10.2 From 9acc54beb474c81148e2946603d141cf8716b19f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Feb 2016 22:13:40 +0100 Subject: mac80211: check PN correctly for GCMP-encrypted fragmented MPDUs Just like for CCMP we need to check that for GCMP the fragments have PNs that increment by one; the spec was updated to fix this security issue and now has the following text: The receiver shall discard MSDUs and MMPDUs whose constituent MPDU PN values are not incrementing in steps of 1. Adapt the code for CCMP to work for GCMP as well, luckily the relevant fields already alias each other so no code duplication is needed (just check the aliasing with BUILD_BUG_ON.) 
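For readers who have not seen the idiom, "check the aliasing" means proving at compile time that the CCMP and GCMP receive-PN fields sit at the same offset and have the same size, so shared code may read either one through a single name. Below is a minimal, self-contained sketch of that pattern; it uses standard C11 _Static_assert in place of the kernel's BUILD_BUG_ON, and the structure and field names are illustrative stand-ins, not the real mac80211 definitions.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical per-cipher key state; rx_pn must line up in both. */
struct cipher_a_state { uint8_t rx_pn[4][6]; };
struct cipher_b_state { uint8_t rx_pn[4][6]; };

struct key_state {
	union {
		struct cipher_a_state a;
		struct cipher_b_state b;
	} u;
};

/* Compile-time proof that the two fields alias: same offset, same size.
 * If a later change breaks the layout, the build fails instead of the
 * receive path silently mixing up packet numbers. */
_Static_assert(offsetof(struct key_state, u.a.rx_pn) ==
	       offsetof(struct key_state, u.b.rx_pn),
	       "rx_pn offsets must match");
_Static_assert(sizeof(((struct key_state *)0)->u.a.rx_pn) ==
	       sizeof(((struct key_state *)0)->u.b.rx_pn),
	       "rx_pn sizes must match");

Because both checks are constant expressions, they add no runtime cost; they only document and enforce the layout assumption the shared code depends on.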
Cc: stable@vger.kernel.org Signed-off-by: Johannes Berg diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index b84f6aa..f006f4a 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -92,7 +92,7 @@ struct ieee80211_fragment_entry { u16 extra_len; u16 last_frag; u8 rx_queue; - bool ccmp; /* Whether fragments were encrypted with CCMP */ + bool check_sequential_pn; /* needed for CCMP/GCMP */ u8 last_pn[6]; /* PN of the last fragment if CCMP was used */ }; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index bc08185..bf2c3f2 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1753,7 +1753,7 @@ ieee80211_reassemble_add(struct ieee80211_sub_if_data *sdata, entry->seq = seq; entry->rx_queue = rx_queue; entry->last_frag = frag; - entry->ccmp = 0; + entry->check_sequential_pn = false; entry->extra_len = 0; return entry; @@ -1849,15 +1849,27 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) rx->seqno_idx, &(rx->skb)); if (rx->key && (rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP || - rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256) && + rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256 || + rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP || + rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP_256) && ieee80211_has_protected(fc)) { int queue = rx->security_idx; - /* Store CCMP PN so that we can verify that the next - * fragment has a sequential PN value. */ - entry->ccmp = 1; + + /* Store CCMP/GCMP PN so that we can verify that the + * next fragment has a sequential PN value. + */ + entry->check_sequential_pn = true; memcpy(entry->last_pn, rx->key->u.ccmp.rx_pn[queue], IEEE80211_CCMP_PN_LEN); + BUILD_BUG_ON(offsetof(struct ieee80211_key, + u.ccmp.rx_pn) != + offsetof(struct ieee80211_key, + u.gcmp.rx_pn)); + BUILD_BUG_ON(sizeof(rx->key->u.ccmp.rx_pn[queue]) != + sizeof(rx->key->u.gcmp.rx_pn[queue])); + BUILD_BUG_ON(IEEE80211_CCMP_PN_LEN != + IEEE80211_GCMP_PN_LEN); } return RX_QUEUED; } @@ -1872,15 +1884,21 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) return RX_DROP_MONITOR; } - /* Verify that MPDUs within one MSDU have sequential PN values. - * (IEEE 802.11i, 8.3.3.4.5) */ - if (entry->ccmp) { + /* "The receiver shall discard MSDUs and MMPDUs whose constituent + * MPDU PN values are not incrementing in steps of 1." + * see IEEE P802.11-REVmc/D5.0, 12.5.3.4.4, item d (for CCMP) + * and IEEE P802.11-REVmc/D5.0, 12.5.5.4.4, item d (for GCMP) + */ + if (entry->check_sequential_pn) { int i; u8 pn[IEEE80211_CCMP_PN_LEN], *rpn; int queue; + if (!rx->key || (rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP && - rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256)) + rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256 && + rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP && + rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP_256)) return RX_DROP_UNUSABLE; memcpy(pn, entry->last_pn, IEEE80211_CCMP_PN_LEN); for (i = IEEE80211_CCMP_PN_LEN - 1; i >= 0; i--) { -- cgit v0.10.2 From 1ec7bae8bec9b72e347e01330c745ab5cdd66f0e Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Tue, 1 Mar 2016 00:29:00 +0200 Subject: mac80211: Fix Public Action frame RX in AP mode Public Action frames use special rules for how the BSSID field (Address 3) is set. A wildcard BSSID is used in cases where the transmitter and recipient are not members of the same BSS. As such, we need to accept Public Action frames with wildcard BSSID. 
Commit db8e17324553 ("mac80211: ignore frames between TDLS peers when operating as AP") added a rule that drops Action frames to TDLS-peers based on an Action frame having different DA (Address 1) and BSSID (Address 3) values. This is not correct since it misses the possibility of BSSID being a wildcard BSSID in which case the Address 1 would not necessarily match. Fix this by allowing mac80211 to accept wildcard BSSID in an Action frame when in AP mode. Fixes: db8e17324553 ("mac80211: ignore frames between TDLS peers when operating as AP") Cc: stable@vger.kernel.org Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index bf2c3f2..60d093f 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3384,6 +3384,7 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) return false; /* ignore action frames to TDLS-peers */ if (ieee80211_is_action(hdr->frame_control) && + !is_broadcast_ether_addr(bssid) && !ether_addr_equal(bssid, hdr->addr1)) return false; } -- cgit v0.10.2 From c36dd3eaf1a674a45b58b922258d6eaa8932e292 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 24 Feb 2016 12:07:17 +0100 Subject: mac80211: minstrel_ht: fix a logic error in RTS/CTS handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RTS/CTS needs to be enabled if the rate is a fallback rate *or* if it's a dual-stream rate and the sta is in dynamic SMPS mode. Cc: stable@vger.kernel.org Fixes: a3ebb4e1b763 ("mac80211: minstrel_ht: handle peers in dynamic SMPS") Reported-by: Matías Richart Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 409b364..370d677 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -872,7 +872,7 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, * - if station is in dynamic SMPS (and streams > 1) * - for fallback rates, to increase chances of getting through */ - if (offset > 0 && + if (offset > 0 || (mi->sta->smps_mode == IEEE80211_SMPS_DYNAMIC && group->streams > 1)) { ratetbl->rate[offset].count = ratetbl->rate[offset].count_rts; -- cgit v0.10.2 From 9589995e46d89c70c6fa4723c5f3e5ec04c3c3e3 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 29 Feb 2016 01:02:11 -0600 Subject: CIFS: Fix duplicate line introduced by clone_file_range patch Commit 04b38d601239b4 ("vfs: pull btrfs clone API to vfs layer") added a duplicated line (in cifsfs.c) which causes a sparse compile warning. Signed-off-by: Steve French Reviewed-by: Christoph Hellwig diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c48ca13..2eea403 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1013,7 +1013,6 @@ const struct file_operations cifs_file_strict_ops = { .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, .clone_file_range = cifs_clone_file_range, - .clone_file_range = cifs_clone_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; -- cgit v0.10.2 From 7314d22a2f5bd40468d57768be368c3d9b4bd726 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sun, 21 Feb 2016 15:16:48 +0100 Subject: i2c: brcmstb: allocate correct amount of memory for regmap We want the size of the struct, not of a pointer to it. To be future proof, just dereference the pointer to get the desired type. 
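The same mistake is easy to reproduce outside the kernel: sizeof applied to a pointer type yields the size of the pointer itself (8 bytes on most 64-bit targets), not the size of the object being allocated. A minimal sketch of the broken and the future-proof spelling, using plain malloc in place of devm_kzalloc and a hypothetical register-map structure:

#include <stdlib.h>
#include <string.h>

/* Hypothetical register map standing in for the driver's bsc_regs. */
struct reg_map {
	unsigned int ctrl;
	unsigned int status;
	unsigned int data[16];
};

int main(void)
{
	struct reg_map *map;

	/* Broken: reserves sizeof(struct reg_map *) bytes - one pointer -
	 * so every later store through *map scribbles past the allocation. */
	/* map = malloc(sizeof(struct reg_map *)); */

	/* Future proof: dereference the pointer so the size always tracks
	 * whatever type 'map' points to, even if that type changes later. */
	map = malloc(sizeof(*map));
	if (!map)
		return 1;

	memset(map, 0, sizeof(*map));
	free(map);
	return 0;
}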
Fixes: dd1aa2524bc5 ("i2c: brcmstb: Add Broadcom settop SoC i2c controller driver") Acked-by: Gregory Fong Acked-by: Florian Fainelli Reviewed-by: Kamal Dasu Signed-off-by: Wolfram Sang Cc: stable@kernel.org diff --git a/drivers/i2c/busses/i2c-brcmstb.c b/drivers/i2c/busses/i2c-brcmstb.c index 3711df1..4a45408 100644 --- a/drivers/i2c/busses/i2c-brcmstb.c +++ b/drivers/i2c/busses/i2c-brcmstb.c @@ -586,8 +586,7 @@ static int brcmstb_i2c_probe(struct platform_device *pdev) if (!dev) return -ENOMEM; - dev->bsc_regmap = devm_kzalloc(&pdev->dev, sizeof(struct bsc_regs *), - GFP_KERNEL); + dev->bsc_regmap = devm_kzalloc(&pdev->dev, sizeof(*dev->bsc_regmap), GFP_KERNEL); if (!dev->bsc_regmap) return -ENOMEM; -- cgit v0.10.2 From 197b958c1e76a575d77038cc98b4bebc2134279f Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 1 Mar 2016 18:30:18 +0100 Subject: ALSA: seq: oss: Don't drain at closing a client The OSS sequencer client tries to drain the pending events at releasing. Unfortunately, as spotted by syzkaller fuzzer, this may lead to an unkillable process state when the event has been queued at the far future. Since the process being released can't be signaled any longer, it remains and waits for the echo-back event in that far future. Back to history, the draining feature was implemented at the time we misinterpreted POSIX definition for blocking file operation. Actually, such a behavior is superfluous at release, and we should just release the device as is instead of keeping it up forever. This patch just removes the draining call that may block the release for too long time unexpectedly. BugLink: http://lkml.kernel.org/r/CACT4Y+Y4kD-aBGj37rf-xBw9bH3GMU6P+MYg4W1e-s-paVD2pg@mail.gmail.com Reported-by: Dmitry Vyukov Cc: Signed-off-by: Takashi Iwai diff --git a/sound/core/seq/oss/seq_oss.c b/sound/core/seq/oss/seq_oss.c index 8db156b..8cdf489 100644 --- a/sound/core/seq/oss/seq_oss.c +++ b/sound/core/seq/oss/seq_oss.c @@ -149,8 +149,6 @@ odev_release(struct inode *inode, struct file *file) if ((dp = file->private_data) == NULL) return 0; - snd_seq_oss_drain_write(dp); - mutex_lock(®ister_mutex); snd_seq_oss_release(dp); mutex_unlock(®ister_mutex); diff --git a/sound/core/seq/oss/seq_oss_device.h b/sound/core/seq/oss/seq_oss_device.h index b439243..d7b4d01 100644 --- a/sound/core/seq/oss/seq_oss_device.h +++ b/sound/core/seq/oss/seq_oss_device.h @@ -127,7 +127,6 @@ int snd_seq_oss_write(struct seq_oss_devinfo *dp, const char __user *buf, int co unsigned int snd_seq_oss_poll(struct seq_oss_devinfo *dp, struct file *file, poll_table * wait); void snd_seq_oss_reset(struct seq_oss_devinfo *dp); -void snd_seq_oss_drain_write(struct seq_oss_devinfo *dp); /* */ void snd_seq_oss_process_queue(struct seq_oss_devinfo *dp, abstime_t time); diff --git a/sound/core/seq/oss/seq_oss_init.c b/sound/core/seq/oss/seq_oss_init.c index 6779e82b..92c96a9 100644 --- a/sound/core/seq/oss/seq_oss_init.c +++ b/sound/core/seq/oss/seq_oss_init.c @@ -436,22 +436,6 @@ snd_seq_oss_release(struct seq_oss_devinfo *dp) /* - * Wait until the queue is empty (if we don't have nonblock) - */ -void -snd_seq_oss_drain_write(struct seq_oss_devinfo *dp) -{ - if (! 
dp->timer->running) - return; - if (is_write_mode(dp->file_mode) && !is_nonblock_mode(dp->file_mode) && - dp->writeq) { - while (snd_seq_oss_writeq_sync(dp->writeq)) - ; - } -} - - -/* * reset sequencer devices */ void -- cgit v0.10.2 From 090e77c391dd983c8945b8e2e16d09f378d2e334 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:23 +0000 Subject: cpu/hotplug: Restructure FROZEN state handling There are only a few callbacks which really care about FROZEN vs. !FROZEN. No need to have extra states for this. Publish the frozen state in an extra variable which is updated under the hotplug lock and let the users interested deal with it w/o imposing that extra state checks on everyone. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.334912357@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/include/linux/cpu.h b/include/linux/cpu.h index d2ca8c3..f2fb549 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -118,6 +118,7 @@ enum { #ifdef CONFIG_SMP +extern bool cpuhp_tasks_frozen; /* Need to know about CPUs going up/down? */ #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) #define cpu_notifier(fn, pri) { \ @@ -177,6 +178,7 @@ extern void cpu_maps_update_done(void); #define cpu_notifier_register_done cpu_maps_update_done #else /* CONFIG_SMP */ +#define cpuhp_tasks_frozen 0 #define cpu_notifier(fn, pri) do { (void)(fn); } while (0) #define __cpu_notifier(fn, pri) do { (void)(fn); } while (0) diff --git a/kernel/cpu.c b/kernel/cpu.c index 5b9d396..41a6cb8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -29,6 +29,8 @@ #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); +bool cpuhp_tasks_frozen; +EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen); /* * The following two APIs (cpu_maps_update_begin/done) must be used when @@ -207,27 +209,30 @@ int __register_cpu_notifier(struct notifier_block *nb) return raw_notifier_chain_register(&cpu_chain, nb); } -static int __cpu_notify(unsigned long val, void *v, int nr_to_call, +static int __cpu_notify(unsigned long val, unsigned int cpu, int nr_to_call, int *nr_calls) { + unsigned long mod = cpuhp_tasks_frozen ? CPU_TASKS_FROZEN : 0; + void *hcpu = (void *)(long)cpu; + int ret; - ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call, + ret = __raw_notifier_call_chain(&cpu_chain, val | mod, hcpu, nr_to_call, nr_calls); return notifier_to_errno(ret); } -static int cpu_notify(unsigned long val, void *v) +static int cpu_notify(unsigned long val, unsigned int cpu) { - return __cpu_notify(val, v, -1, NULL); + return __cpu_notify(val, cpu, -1, NULL); } #ifdef CONFIG_HOTPLUG_CPU -static void cpu_notify_nofail(unsigned long val, void *v) +static void cpu_notify_nofail(unsigned long val, unsigned int cpu) { - BUG_ON(cpu_notify(val, v)); + BUG_ON(cpu_notify(val, cpu)); } EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); @@ -311,27 +316,21 @@ static inline void check_for_tasks(int dead_cpu) read_unlock(&tasklist_lock); } -struct take_cpu_down_param { - unsigned long mod; - void *hcpu; -}; - /* Take this CPU down. 
*/ static int take_cpu_down(void *_param) { - struct take_cpu_down_param *param = _param; - int err; + int err, cpu = smp_processor_id(); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) return err; - cpu_notify(CPU_DYING | param->mod, param->hcpu); + cpu_notify(CPU_DYING, cpu); /* Give up timekeeping duties */ tick_handover_do_timer(); /* Park the stopper thread */ - stop_machine_park((long)param->hcpu); + stop_machine_park(cpu); return 0; } @@ -339,12 +338,6 @@ static int take_cpu_down(void *_param) static int _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - void *hcpu = (void *)(long)cpu; - unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; - struct take_cpu_down_param tcd_param = { - .mod = mod, - .hcpu = hcpu, - }; if (num_online_cpus() == 1) return -EBUSY; @@ -354,10 +347,12 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) cpu_hotplug_begin(); - err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); + cpuhp_tasks_frozen = tasks_frozen; + + err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls); if (err) { nr_calls--; - __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); + __cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL); pr_warn("%s: attempt to take down CPU %u failed\n", __func__, cpu); goto out_release; @@ -389,10 +384,10 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* * So now all preempt/rcu users must observe !cpu_active(). */ - err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); + err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ - cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); + cpu_notify_nofail(CPU_DOWN_FAILED, cpu); irq_unlock_sparse(); goto out_release; } @@ -419,14 +414,14 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* CPU is completely dead: tell everyone. Too late to complain. */ tick_cleanup_dead_cpu(cpu); - cpu_notify_nofail(CPU_DEAD | mod, hcpu); + cpu_notify_nofail(CPU_DEAD, cpu); check_for_tasks(cpu); out_release: cpu_hotplug_done(); if (!err) - cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); + cpu_notify_nofail(CPU_POST_DEAD, cpu); return err; } @@ -485,10 +480,8 @@ void smpboot_thread_init(void) /* Requires cpu_add_remove_lock to be held */ static int _cpu_up(unsigned int cpu, int tasks_frozen) { - int ret, nr_calls = 0; - void *hcpu = (void *)(long)cpu; - unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct task_struct *idle; + int ret, nr_calls = 0; cpu_hotplug_begin(); @@ -507,7 +500,9 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) if (ret) goto out; - ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); + cpuhp_tasks_frozen = tasks_frozen; + + ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls); if (ret) { nr_calls--; pr_warn("%s: attempt to bring up CPU %u failed\n", @@ -523,11 +518,11 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) BUG_ON(!cpu_online(cpu)); /* Now call notifier in preparation. 
*/ - cpu_notify(CPU_ONLINE | mod, hcpu); + cpu_notify(CPU_ONLINE, cpu); out_notify: if (ret != 0) - __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); + __cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL); out: cpu_hotplug_done(); @@ -719,13 +714,7 @@ core_initcall(cpu_hotplug_pm_sync_init); */ void notify_cpu_starting(unsigned int cpu) { - unsigned long val = CPU_STARTING; - -#ifdef CONFIG_PM_SLEEP_SMP - if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) - val = CPU_STARTING_FROZEN; -#endif /* CONFIG_PM_SLEEP_SMP */ - cpu_notify(val, (void *)(long)cpu); + cpu_notify(CPU_STARTING, cpu); } #endif /* CONFIG_SMP */ -- cgit v0.10.2 From ba997462435f48ad1501320e9da8770fd40c59b1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:24 +0000 Subject: cpu/hotplug: Restructure cpu_up code Split out into separate functions, so we can convert it to a state machine. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.429389195@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/kernel/cpu.c b/kernel/cpu.c index 41a6cb8..15a4136 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -228,6 +228,43 @@ static int cpu_notify(unsigned long val, unsigned int cpu) return __cpu_notify(val, cpu, -1, NULL); } +/* Notifier wrappers for transitioning to state machine */ +static int notify_prepare(unsigned int cpu) +{ + int nr_calls = 0; + int ret; + + ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls); + if (ret) { + nr_calls--; + printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", + __func__, cpu); + __cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL); + } + return ret; +} + +static int notify_online(unsigned int cpu) +{ + cpu_notify(CPU_ONLINE, cpu); + return 0; +} + +static int bringup_cpu(unsigned int cpu) +{ + struct task_struct *idle = idle_thread_get(cpu); + int ret; + + /* Arch-specific enabling code. */ + ret = __cpu_up(cpu, idle); + if (ret) { + cpu_notify(CPU_UP_CANCELED, cpu); + return ret; + } + BUG_ON(!cpu_online(cpu)); + return 0; +} + #ifdef CONFIG_HOTPLUG_CPU static void cpu_notify_nofail(unsigned long val, unsigned int cpu) @@ -481,7 +518,7 @@ void smpboot_thread_init(void) static int _cpu_up(unsigned int cpu, int tasks_frozen) { struct task_struct *idle; - int ret, nr_calls = 0; + int ret; cpu_hotplug_begin(); @@ -496,33 +533,21 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) goto out; } + cpuhp_tasks_frozen = tasks_frozen; + ret = smpboot_create_threads(cpu); if (ret) goto out; - cpuhp_tasks_frozen = tasks_frozen; - - ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls); - if (ret) { - nr_calls--; - pr_warn("%s: attempt to bring up CPU %u failed\n", - __func__, cpu); - goto out_notify; - } - - /* Arch-specific enabling code. */ - ret = __cpu_up(cpu, idle); - - if (ret != 0) - goto out_notify; - BUG_ON(!cpu_online(cpu)); + ret = notify_prepare(cpu); + if (ret) + goto out; - /* Now call notifier in preparation. 
*/ - cpu_notify(CPU_ONLINE, cpu); + ret = bringup_cpu(cpu); + if (ret) + goto out; -out_notify: - if (ret != 0) - __cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL); + notify_online(cpu); out: cpu_hotplug_done(); -- cgit v0.10.2 From 984581728eb4b2e10baed3d606f85a119795b207 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:25 +0000 Subject: cpu/hotplug: Split out cpu down functions Split cpu_down in separate functions in preparation for state machine conversion. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.511796562@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/kernel/cpu.c b/kernel/cpu.c index 15a4136..0b5d259 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -266,11 +266,6 @@ static int bringup_cpu(unsigned int cpu) } #ifdef CONFIG_HOTPLUG_CPU - -static void cpu_notify_nofail(unsigned long val, unsigned int cpu) -{ - BUG_ON(cpu_notify(val, cpu)); -} EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); @@ -353,6 +348,25 @@ static inline void check_for_tasks(int dead_cpu) read_unlock(&tasklist_lock); } +static void cpu_notify_nofail(unsigned long val, unsigned int cpu) +{ + BUG_ON(cpu_notify(val, cpu)); +} + +static int notify_down_prepare(unsigned int cpu) +{ + int err, nr_calls = 0; + + err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls); + if (err) { + nr_calls--; + __cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL); + pr_warn("%s: attempt to take down CPU %u failed\n", + __func__, cpu); + } + return err; +} + /* Take this CPU down. */ static int take_cpu_down(void *_param) { @@ -371,29 +385,9 @@ static int take_cpu_down(void *_param) return 0; } -/* Requires cpu_add_remove_lock to be held */ -static int _cpu_down(unsigned int cpu, int tasks_frozen) +static int takedown_cpu(unsigned int cpu) { - int err, nr_calls = 0; - - if (num_online_cpus() == 1) - return -EBUSY; - - if (!cpu_online(cpu)) - return -EINVAL; - - cpu_hotplug_begin(); - - cpuhp_tasks_frozen = tasks_frozen; - - err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls); - if (err) { - nr_calls--; - __cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL); - pr_warn("%s: attempt to take down CPU %u failed\n", - __func__, cpu); - goto out_release; - } + int err; /* * By now we've cleared cpu_active_mask, wait for all preempt-disabled @@ -426,7 +420,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* CPU didn't die: tell everyone. Can't complain. */ cpu_notify_nofail(CPU_DOWN_FAILED, cpu); irq_unlock_sparse(); - goto out_release; + return err; } BUG_ON(cpu_online(cpu)); @@ -449,11 +443,40 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* This actually kills the CPU. */ __cpu_die(cpu); - /* CPU is completely dead: tell everyone. Too late to complain. 
*/ tick_cleanup_dead_cpu(cpu); - cpu_notify_nofail(CPU_DEAD, cpu); + return 0; +} +static int notify_dead(unsigned int cpu) +{ + cpu_notify_nofail(CPU_DEAD, cpu); check_for_tasks(cpu); + return 0; +} + +/* Requires cpu_add_remove_lock to be held */ +static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) +{ + int err; + + if (num_online_cpus() == 1) + return -EBUSY; + + if (!cpu_online(cpu)) + return -EINVAL; + + cpu_hotplug_begin(); + + cpuhp_tasks_frozen = tasks_frozen; + + err = notify_down_prepare(cpu); + if (err) + goto out_release; + err = takedown_cpu(cpu); + if (err) + goto out_release; + + notify_dead(cpu); out_release: cpu_hotplug_done(); -- cgit v0.10.2 From 5ba9ac8e2c45ab165e5b4a246f4821d319656e9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:27 +0000 Subject: cpu/hotplug: Add tracepoints We want to trace the hotplug machinery. Add tracepoints to track the invocation of callbacks and their result. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.593563875@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/include/trace/events/cpuhp.h b/include/trace/events/cpuhp.h new file mode 100644 index 0000000..a72bd93 --- /dev/null +++ b/include/trace/events/cpuhp.h @@ -0,0 +1,66 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cpuhp + +#if !defined(_TRACE_CPUHP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CPUHP_H + +#include + +TRACE_EVENT(cpuhp_enter, + + TP_PROTO(unsigned int cpu, + int target, + int idx, + int (*fun)(unsigned int)), + + TP_ARGS(cpu, target, idx, fun), + + TP_STRUCT__entry( + __field( unsigned int, cpu ) + __field( int, target ) + __field( int, idx ) + __field( void *, fun ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->target = target; + __entry->idx = idx; + __entry->fun = fun; + ), + + TP_printk("cpu: %04u target: %3d step: %3d (%pf)", + __entry->cpu, __entry->target, __entry->idx, __entry->fun) +); + +TRACE_EVENT(cpuhp_exit, + + TP_PROTO(unsigned int cpu, + int state, + int idx, + int ret), + + TP_ARGS(cpu, state, idx, ret), + + TP_STRUCT__entry( + __field( unsigned int, cpu ) + __field( int, state ) + __field( int, idx ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->state = state; + __entry->idx = idx; + __entry->ret = ret; + ), + + TP_printk(" cpu: %04u state: %3d step: %3d ret: %d", + __entry->cpu, __entry->state, __entry->idx, __entry->ret) +); + +#endif + +/* This part must be outside protection */ +#include -- cgit v0.10.2 From cff7d378d3fdbb53db9b6e2578b14855f401cd41 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:28 +0000 Subject: cpu/hotplug: Convert to a state machine for the control processor Move the split out steps into a callback array and let the cpu_up/down code iterate through the array functions. For now most of the callbacks are asymmetric to resemble the current hotplug maze. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. 
Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.671816690@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/include/linux/cpu.h b/include/linux/cpu.h index f2fb549..78989f2 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -16,6 +16,7 @@ #include #include #include +#include struct device; struct device_node; @@ -27,6 +28,9 @@ struct cpu { struct device dev; }; +extern void boot_cpu_init(void); +extern void boot_cpu_state_init(void); + extern int register_cpu(struct cpu *cpu, int num); extern struct device *get_cpu_device(unsigned cpu); extern bool cpu_is_hotpluggable(unsigned cpu); @@ -267,11 +271,6 @@ static inline int disable_nonboot_cpus(void) { return 0; } static inline void enable_nonboot_cpus(void) {} #endif /* !CONFIG_PM_SLEEP_SMP */ -enum cpuhp_state { - CPUHP_OFFLINE, - CPUHP_ONLINE, -}; - void cpu_startup_entry(enum cpuhp_state state); void cpu_idle_poll_ctrl(bool enable); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h new file mode 100644 index 0000000..d55c9e6 --- /dev/null +++ b/include/linux/cpuhotplug.h @@ -0,0 +1,13 @@ +#ifndef __CPUHOTPLUG_H +#define __CPUHOTPLUG_H + +enum cpuhp_state { + CPUHP_OFFLINE, + CPUHP_CREATE_THREADS, + CPUHP_NOTIFY_PREPARE, + CPUHP_BRINGUP_CPU, + CPUHP_NOTIFY_ONLINE, + CPUHP_ONLINE, +}; + +#endif diff --git a/init/main.c b/init/main.c index 58c9e37..c2ea723 100644 --- a/init/main.c +++ b/init/main.c @@ -452,20 +452,6 @@ void __init parse_early_param(void) done = 1; } -/* - * Activate the first processor. - */ - -static void __init boot_cpu_init(void) -{ - int cpu = smp_processor_id(); - /* Mark the boot cpu "present", "online" etc for SMP and UP case */ - set_cpu_online(cpu, true); - set_cpu_active(cpu, true); - set_cpu_present(cpu, true); - set_cpu_possible(cpu, true); -} - void __init __weak smp_setup_processor_id(void) { } @@ -530,6 +516,7 @@ asmlinkage __visible void __init start_kernel(void) setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); + boot_cpu_state_init(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ build_all_zonelists(NULL, NULL); diff --git a/kernel/cpu.c b/kernel/cpu.c index 0b5d259..3018519 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -22,10 +22,64 @@ #include #include #include + #include +#define CREATE_TRACE_POINTS +#include #include "smpboot.h" +/** + * cpuhp_cpu_state - Per cpu hotplug state storage + * @state: The current cpu state + * @target: The target state + */ +struct cpuhp_cpu_state { + enum cpuhp_state state; + enum cpuhp_state target; +}; + +static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); + +/** + * cpuhp_step - Hotplug state machine step + * @name: Name of the step + * @startup: Startup function of the step + * @teardown: Teardown function of the step + * @skip_onerr: Do not invoke the functions on error rollback + * Will go away once the notifiers are gone + */ +struct cpuhp_step { + const char *name; + int (*startup)(unsigned int cpu); + int (*teardown)(unsigned int cpu); + bool skip_onerr; +}; + +static struct cpuhp_step cpuhp_bp_states[]; + +/** + * cpuhp_invoke_callback _ Invoke the callbacks for a given state + * @cpu: The cpu for which the callback should be invoked + * @step: The step in the state machine + * @cb: The callback function to invoke + * + * Called from cpu hotplug and from the state register 
machinery + */ +static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step, + int (*cb)(unsigned int)) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int ret = 0; + + if (cb) { + trace_cpuhp_enter(cpu, st->target, step, cb); + ret = cb(cpu); + trace_cpuhp_exit(cpu, st->state, step, ret); + } + return ret; +} + #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); @@ -454,10 +508,29 @@ static int notify_dead(unsigned int cpu) return 0; } +#else +#define notify_down_prepare NULL +#define takedown_cpu NULL +#define notify_dead NULL +#endif + +#ifdef CONFIG_HOTPLUG_CPU +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + for (st->state++; st->state < st->target; st->state++) { + struct cpuhp_step *step = cpuhp_bp_states + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->startup); + } +} + /* Requires cpu_add_remove_lock to be held */ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { - int err; + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int prev_state, ret = 0; + bool hasdied = false; if (num_online_cpus() == 1) return -EBUSY; @@ -469,20 +542,25 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpuhp_tasks_frozen = tasks_frozen; - err = notify_down_prepare(cpu); - if (err) - goto out_release; - err = takedown_cpu(cpu); - if (err) - goto out_release; + prev_state = st->state; + st->target = CPUHP_OFFLINE; + for (; st->state > st->target; st->state--) { + struct cpuhp_step *step = cpuhp_bp_states + st->state; - notify_dead(cpu); + ret = cpuhp_invoke_callback(cpu, st->state, step->teardown); + if (ret) { + st->target = prev_state; + undo_cpu_down(cpu, st); + break; + } + } + hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE; -out_release: cpu_hotplug_done(); - if (!err) + /* This post dead nonsense must die */ + if (!ret && hasdied) cpu_notify_nofail(CPU_POST_DEAD, cpu); - return err; + return ret; } int cpu_down(unsigned int cpu) @@ -537,11 +615,22 @@ void smpboot_thread_init(void) register_cpu_notifier(&smpboot_thread_notifier); } +static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + for (st->state--; st->state > st->target; st->state--) { + struct cpuhp_step *step = cpuhp_bp_states + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->teardown); + } +} + /* Requires cpu_add_remove_lock to be held */ static int _cpu_up(unsigned int cpu, int tasks_frozen) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle; - int ret; + int prev_state, ret = 0; cpu_hotplug_begin(); @@ -550,6 +639,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) goto out; } + /* Let it fail before we try to bring the cpu up */ idle = idle_thread_get(cpu); if (IS_ERR(idle)) { ret = PTR_ERR(idle); @@ -558,22 +648,22 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) cpuhp_tasks_frozen = tasks_frozen; - ret = smpboot_create_threads(cpu); - if (ret) - goto out; - - ret = notify_prepare(cpu); - if (ret) - goto out; - - ret = bringup_cpu(cpu); - if (ret) - goto out; - - notify_online(cpu); + prev_state = st->state; + st->target = CPUHP_ONLINE; + while (st->state < st->target) { + struct cpuhp_step *step; + + st->state++; + step = cpuhp_bp_states + st->state; + ret = cpuhp_invoke_callback(cpu, st->state, step->startup); + if (ret) { + st->target = prev_state; + undo_cpu_up(cpu, st); + break; + } + } 
out: cpu_hotplug_done(); - return ret; } @@ -767,6 +857,44 @@ void notify_cpu_starting(unsigned int cpu) #endif /* CONFIG_SMP */ +/* Boot processor state steps */ +static struct cpuhp_step cpuhp_bp_states[] = { + [CPUHP_OFFLINE] = { + .name = "offline", + .startup = NULL, + .teardown = NULL, + }, +#ifdef CONFIG_SMP + [CPUHP_CREATE_THREADS]= { + .name = "threads:create", + .startup = smpboot_create_threads, + .teardown = NULL, + }, + [CPUHP_NOTIFY_PREPARE] = { + .name = "notify:prepare", + .startup = notify_prepare, + .teardown = notify_dead, + .skip_onerr = true, + }, + [CPUHP_BRINGUP_CPU] = { + .name = "cpu:bringup", + .startup = bringup_cpu, + .teardown = takedown_cpu, + .skip_onerr = true, + }, + [CPUHP_NOTIFY_ONLINE] = { + .name = "notify:online", + .startup = notify_online, + .teardown = notify_down_prepare, + }, +#endif + [CPUHP_ONLINE] = { + .name = "online", + .startup = NULL, + .teardown = NULL, + }, +}; + /* * cpu_bit_bitmap[] is a special, "compressed" data structure that * represents all NR_CPUS bits binary values of 1<state = CPUHP_ONLINE; +} -- cgit v0.10.2 From 4baa0afc6719cbf36a1e08551484a641926b3fd1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:29 +0000 Subject: cpu/hotplug: Convert the hotplugged cpu work to a state machine Move the functions which need to run on the hotplugged processor into a state machine array and let the code iterate through these functions. In a later state, this will grow synchronization points between the control processor and the hotplugged processor, so we can move the various architecture implementations of the synchronizations to the core. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.770651526@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index d55c9e6..d9303cc 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -6,6 +6,10 @@ enum cpuhp_state { CPUHP_CREATE_THREADS, CPUHP_NOTIFY_PREPARE, CPUHP_BRINGUP_CPU, + CPUHP_AP_OFFLINE, + CPUHP_AP_NOTIFY_STARTING, + CPUHP_AP_ONLINE, + CPUHP_TEARDOWN_CPU, CPUHP_NOTIFY_ONLINE, CPUHP_ONLINE, }; diff --git a/kernel/cpu.c b/kernel/cpu.c index 3018519..797723e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -57,6 +57,7 @@ struct cpuhp_step { }; static struct cpuhp_step cpuhp_bp_states[]; +static struct cpuhp_step cpuhp_ap_states[]; /** * cpuhp_invoke_callback _ Invoke the callbacks for a given state @@ -304,6 +305,12 @@ static int notify_online(unsigned int cpu) return 0; } +static int notify_starting(unsigned int cpu) +{ + cpu_notify(CPU_STARTING, cpu); + return 0; +} + static int bringup_cpu(unsigned int cpu) { struct task_struct *idle = idle_thread_get(cpu); @@ -421,9 +428,17 @@ static int notify_down_prepare(unsigned int cpu) return err; } +static int notify_dying(unsigned int cpu) +{ + cpu_notify(CPU_DYING, cpu); + return 0; +} + /* Take this CPU down. */ static int take_cpu_down(void *_param) { + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); int err, cpu = smp_processor_id(); /* Ensure this CPU doesn't handle any more interrupts. 
*/ @@ -431,7 +446,12 @@ static int take_cpu_down(void *_param) if (err < 0) return err; - cpu_notify(CPU_DYING, cpu); + /* Invoke the former CPU_DYING callbacks */ + for (; st->state > target; st->state--) { + struct cpuhp_step *step = cpuhp_ap_states + st->state; + + cpuhp_invoke_callback(cpu, st->state, step->teardown); + } /* Give up timekeeping duties */ tick_handover_do_timer(); /* Park the stopper thread */ @@ -512,6 +532,7 @@ static int notify_dead(unsigned int cpu) #define notify_down_prepare NULL #define takedown_cpu NULL #define notify_dead NULL +#define notify_dying NULL #endif #ifdef CONFIG_HOTPLUG_CPU @@ -615,6 +636,28 @@ void smpboot_thread_init(void) register_cpu_notifier(&smpboot_thread_notifier); } +/** + * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * @cpu: cpu that just started + * + * This function calls the cpu_chain notifiers with CPU_STARTING. + * It must be called by the arch code on the new cpu, before the new cpu + * enables interrupts and before the "boot" cpu returns from __cpu_up(). + */ +void notify_cpu_starting(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); + + while (st->state < target) { + struct cpuhp_step *step; + + st->state++; + step = cpuhp_ap_states + st->state; + cpuhp_invoke_callback(cpu, st->state, step->startup); + } +} + static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) { for (st->state--; st->state > st->target; st->state--) { @@ -842,19 +885,6 @@ core_initcall(cpu_hotplug_pm_sync_init); #endif /* CONFIG_PM_SLEEP_SMP */ -/** - * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers - * @cpu: cpu that just started - * - * This function calls the cpu_chain notifiers with CPU_STARTING. - * It must be called by the arch code on the new cpu, before the new cpu - * enables interrupts and before the "boot" cpu returns from __cpu_up(). - */ -void notify_cpu_starting(unsigned int cpu) -{ - cpu_notify(CPU_STARTING, cpu); -} - #endif /* CONFIG_SMP */ /* Boot processor state steps */ @@ -879,8 +909,12 @@ static struct cpuhp_step cpuhp_bp_states[] = { [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup = bringup_cpu, + .teardown = NULL, + }, + [CPUHP_TEARDOWN_CPU] = { + .name = "cpu:teardown", + .startup = NULL, .teardown = takedown_cpu, - .skip_onerr = true, }, [CPUHP_NOTIFY_ONLINE] = { .name = "notify:online", @@ -895,6 +929,23 @@ static struct cpuhp_step cpuhp_bp_states[] = { }, }; +/* Application processor state steps */ +static struct cpuhp_step cpuhp_ap_states[] = { +#ifdef CONFIG_SMP + [CPUHP_AP_NOTIFY_STARTING] = { + .name = "notify:starting", + .startup = notify_starting, + .teardown = notify_dying, + .skip_onerr = true, + }, +#endif + [CPUHP_ONLINE] = { + .name = "online", + .startup = NULL, + .teardown = NULL, + }, +}; + /* * cpu_bit_bitmap[] is a special, "compressed" data structure that * represents all NR_CPUS bits binary values of 1< Date: Fri, 26 Feb 2016 18:43:30 +0000 Subject: cpu/hotplug: Hand in target state to _cpu_up/down We want to be able to bringup/teardown the cpu to a particular state. Add a target argument to _cpu_up/down. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. 
Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.862113133@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/kernel/cpu.c b/kernel/cpu.c index 797723e..a00f8f6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -547,7 +547,8 @@ static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) } /* Requires cpu_add_remove_lock to be held */ -static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) +static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, + enum cpuhp_state target) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int prev_state, ret = 0; @@ -564,7 +565,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpuhp_tasks_frozen = tasks_frozen; prev_state = st->state; - st->target = CPUHP_OFFLINE; + st->target = target; for (; st->state > st->target; st->state--) { struct cpuhp_step *step = cpuhp_bp_states + st->state; @@ -584,7 +585,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return ret; } -int cpu_down(unsigned int cpu) +static int do_cpu_down(unsigned int cpu, enum cpuhp_state target) { int err; @@ -595,12 +596,16 @@ int cpu_down(unsigned int cpu) goto out; } - err = _cpu_down(cpu, 0); + err = _cpu_down(cpu, 0, target); out: cpu_maps_update_done(); return err; } +int cpu_down(unsigned int cpu) +{ + return do_cpu_down(cpu, CPUHP_OFFLINE); +} EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ @@ -669,7 +674,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) } /* Requires cpu_add_remove_lock to be held */ -static int _cpu_up(unsigned int cpu, int tasks_frozen) +static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle; @@ -692,7 +697,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) cpuhp_tasks_frozen = tasks_frozen; prev_state = st->state; - st->target = CPUHP_ONLINE; + st->target = target; while (st->state < st->target) { struct cpuhp_step *step; @@ -710,7 +715,7 @@ out: return ret; } -int cpu_up(unsigned int cpu) +static int do_cpu_up(unsigned int cpu, enum cpuhp_state target) { int err = 0; @@ -734,12 +739,16 @@ int cpu_up(unsigned int cpu) goto out; } - err = _cpu_up(cpu, 0); - + err = _cpu_up(cpu, 0, target); out: cpu_maps_update_done(); return err; } + +int cpu_up(unsigned int cpu) +{ + return do_cpu_up(cpu, CPUHP_ONLINE); +} EXPORT_SYMBOL_GPL(cpu_up); #ifdef CONFIG_PM_SLEEP_SMP @@ -762,7 +771,7 @@ int disable_nonboot_cpus(void) if (cpu == first_cpu) continue; trace_suspend_resume(TPS("CPU_OFF"), cpu, true); - error = _cpu_down(cpu, 1); + error = _cpu_down(cpu, 1, CPUHP_OFFLINE); trace_suspend_resume(TPS("CPU_OFF"), cpu, false); if (!error) cpumask_set_cpu(cpu, frozen_cpus); @@ -812,7 +821,7 @@ void enable_nonboot_cpus(void) for_each_cpu(cpu, frozen_cpus) { trace_suspend_resume(TPS("CPU_ON"), cpu, true); - error = _cpu_up(cpu, 1); + error = _cpu_up(cpu, 1, CPUHP_ONLINE); trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { pr_info("CPU%d is up\n", cpu); -- cgit v0.10.2 From 98f8cdce1db580b99fce823a48eea2cb2bdb261e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:31 +0000 Subject: cpu/hotplug: Add sysfs state interface Add a sysfs interface so we can actually see in which state the cpus are in. 
Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182340.942257522@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/kernel/cpu.c b/kernel/cpu.c index a00f8f6..1979b89 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -56,6 +56,7 @@ struct cpuhp_step { bool skip_onerr; }; +static DEFINE_MUTEX(cpuhp_state_mutex); static struct cpuhp_step cpuhp_bp_states[]; static struct cpuhp_step cpuhp_ap_states[]; @@ -955,6 +956,105 @@ static struct cpuhp_step cpuhp_ap_states[] = { }, }; +static bool cpuhp_is_ap_state(enum cpuhp_state state) +{ + return (state > CPUHP_AP_OFFLINE && state < CPUHP_AP_ONLINE); +} + +static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) +{ + struct cpuhp_step *sp; + + sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; + return sp + state; +} + +#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU) +static ssize_t show_cpuhp_state(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->state); +} +static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL); + +static ssize_t show_cpuhp_target(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->target); +} +static DEVICE_ATTR(target, 0444, show_cpuhp_target, NULL); + +static struct attribute *cpuhp_cpu_attrs[] = { + &dev_attr_state.attr, + &dev_attr_target.attr, + NULL +}; + +static struct attribute_group cpuhp_cpu_attr_group = { + .attrs = cpuhp_cpu_attrs, + .name = "hotplug", + NULL +}; + +static ssize_t show_cpuhp_states(struct device *dev, + struct device_attribute *attr, char *buf) +{ + ssize_t cur, res = 0; + int i; + + mutex_lock(&cpuhp_state_mutex); + for (i = 0; i <= CPUHP_ONLINE; i++) { + struct cpuhp_step *sp = cpuhp_get_step(i); + + if (sp->name) { + cur = sprintf(buf, "%3d: %s\n", i, sp->name); + buf += cur; + res += cur; + } + } + mutex_unlock(&cpuhp_state_mutex); + return res; +} +static DEVICE_ATTR(states, 0444, show_cpuhp_states, NULL); + +static struct attribute *cpuhp_cpu_root_attrs[] = { + &dev_attr_states.attr, + NULL +}; + +static struct attribute_group cpuhp_cpu_root_attr_group = { + .attrs = cpuhp_cpu_root_attrs, + .name = "hotplug", + NULL +}; + +static int __init cpuhp_sysfs_init(void) +{ + int cpu, ret; + + ret = sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpuhp_cpu_root_attr_group); + if (ret) + return ret; + + for_each_possible_cpu(cpu) { + struct device *dev = get_cpu_device(cpu); + + if (!dev) + continue; + ret = sysfs_create_group(&dev->kobj, &cpuhp_cpu_attr_group); + if (ret) + return ret; + } + return 0; +} +device_initcall(cpuhp_sysfs_init); +#endif + /* * cpu_bit_bitmap[] is a special, "compressed" data structure that * represents all NR_CPUS bits binary values of 1< Date: Fri, 26 Feb 2016 18:43:32 +0000 Subject: cpu/hotplug: Make target state writeable Make it possible to write a target state to the per cpu state file, so we can switch between states. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. 
Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.022814799@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/kernel/cpu.c b/kernel/cpu.c index 1979b89..be9335d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -48,12 +48,14 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); * @teardown: Teardown function of the step * @skip_onerr: Do not invoke the functions on error rollback * Will go away once the notifiers are gone + * @cant_stop: Bringup/teardown can't be stopped at this step */ struct cpuhp_step { const char *name; int (*startup)(unsigned int cpu); int (*teardown)(unsigned int cpu); bool skip_onerr; + bool cant_stop; }; static DEFINE_MUTEX(cpuhp_state_mutex); @@ -558,7 +560,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, if (num_online_cpus() == 1) return -EBUSY; - if (!cpu_online(cpu)) + if (!cpu_present(cpu)) return -EINVAL; cpu_hotplug_begin(); @@ -683,16 +685,25 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) cpu_hotplug_begin(); - if (cpu_online(cpu) || !cpu_present(cpu)) { + if (!cpu_present(cpu)) { ret = -EINVAL; goto out; } - /* Let it fail before we try to bring the cpu up */ - idle = idle_thread_get(cpu); - if (IS_ERR(idle)) { - ret = PTR_ERR(idle); + /* + * The caller of do_cpu_up might have raced with another + * caller. Ignore it for now. + */ + if (st->state >= target) goto out; + + if (st->state == CPUHP_OFFLINE) { + /* Let it fail before we try to bring the cpu up */ + idle = idle_thread_get(cpu); + if (IS_ERR(idle)) { + ret = PTR_ERR(idle); + goto out; + } } cpuhp_tasks_frozen = tasks_frozen; @@ -909,27 +920,32 @@ static struct cpuhp_step cpuhp_bp_states[] = { .name = "threads:create", .startup = smpboot_create_threads, .teardown = NULL, + .cant_stop = true, }, [CPUHP_NOTIFY_PREPARE] = { .name = "notify:prepare", .startup = notify_prepare, .teardown = notify_dead, .skip_onerr = true, + .cant_stop = true, }, [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup = bringup_cpu, .teardown = NULL, + .cant_stop = true, }, [CPUHP_TEARDOWN_CPU] = { .name = "cpu:teardown", .startup = NULL, .teardown = takedown_cpu, + .cant_stop = true, }, [CPUHP_NOTIFY_ONLINE] = { .name = "notify:online", .startup = notify_online, .teardown = notify_down_prepare, + .cant_stop = true, }, #endif [CPUHP_ONLINE] = { @@ -947,6 +963,7 @@ static struct cpuhp_step cpuhp_ap_states[] = { .startup = notify_starting, .teardown = notify_dying, .skip_onerr = true, + .cant_stop = true, }, #endif [CPUHP_ONLINE] = { @@ -979,6 +996,46 @@ static ssize_t show_cpuhp_state(struct device *dev, } static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL); +static ssize_t write_cpuhp_target(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + struct cpuhp_step *sp; + int target, ret; + + ret = kstrtoint(buf, 10, &target); + if (ret) + return ret; + +#ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL + if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE) + return -EINVAL; +#else + if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE) + return -EINVAL; +#endif + + ret = lock_device_hotplug_sysfs(); + if (ret) + return ret; + + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(target); + ret = !sp->name || sp->cant_stop ? 
-EINVAL : 0; + mutex_unlock(&cpuhp_state_mutex); + if (ret) + return ret; + + if (st->state < target) + ret = do_cpu_up(dev->id, target); + else + ret = do_cpu_down(dev->id, target); + + unlock_device_hotplug(); + return ret ? ret : count; +} + static ssize_t show_cpuhp_target(struct device *dev, struct device_attribute *attr, char *buf) { @@ -986,7 +1043,7 @@ static ssize_t show_cpuhp_target(struct device *dev, return sprintf(buf, "%d\n", st->target); } -static DEVICE_ATTR(target, 0444, show_cpuhp_target, NULL); +static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target); static struct attribute *cpuhp_cpu_attrs[] = { &dev_attr_state.attr, @@ -1007,7 +1064,7 @@ static ssize_t show_cpuhp_states(struct device *dev, int i; mutex_lock(&cpuhp_state_mutex); - for (i = 0; i <= CPUHP_ONLINE; i++) { + for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) { struct cpuhp_step *sp = cpuhp_get_step(i); if (sp->name) { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8bfd1ac..f28f7fa 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1442,6 +1442,19 @@ config DEBUG_BLOCK_EXT_DEVT Say N if you are unsure. +config CPU_HOTPLUG_STATE_CONTROL + bool "Enable CPU hotplug state control" + depends on DEBUG_KERNEL + depends on HOTPLUG_CPU + default n + help + Allows to write steps between "offline" and "online" to the CPUs + sysfs target file so states can be stepped granular. This is a debug + option for now as the hotplug machinery cannot be stopped and + restarted at arbitrary points yet. + + Say N if your are unsure. + config NOTIFIER_ERROR_INJECTION tristate "Notifier error injection" depends on DEBUG_KERNEL -- cgit v0.10.2 From 5b7aa87e0482be768486e0c2277aa4122487eb9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:33 +0000 Subject: cpu/hotplug: Implement setup/removal interface Implement function which allow to setup/remove hotplug state callbacks. The default behaviour for setup is to call the startup function for this state for (or on) all cpus which have a hotplug state >= the installed state. The default behaviour for removal is to call the teardown function for this state for (or on) all cpus which have a hotplug state >= the installed state. This includes rollback to the previous state in case of failure. A special state is CPUHP_ONLINE_DYN. Its for dynamically registering a hotplug callback pair. This is for drivers which have no dependencies to avoid that we need to allocate CPUHP states for each of them For both setup and remove helper functions are provided, which prevent the core to issue the callbacks. This simplifies the conversion of existing hotplug notifiers. [ Dynamic registering implemented by Sebastian Siewior ] Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. 
Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.103464877@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index d9303cc..2993526 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -11,7 +11,74 @@ enum cpuhp_state { CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, CPUHP_NOTIFY_ONLINE, + CPUHP_ONLINE_DYN, + CPUHP_ONLINE_DYN_END = CPUHP_ONLINE_DYN + 30, CPUHP_ONLINE, }; +int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)); + +/** + * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks + * @state: The state for which the calls are installed + * @name: Name of the callback (will be used in debug output) + * @startup: startup callback function + * @teardown: teardown callback function + * + * Installs the callback functions and invokes the startup callback on + * the present cpus which have already reached the @state. + */ +static inline int cpuhp_setup_state(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + return __cpuhp_setup_state(state, name, true, startup, teardown); +} + +/** + * cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the + * callbacks + * @state: The state for which the calls are installed + * @name: Name of the callback. + * @startup: startup callback function + * @teardown: teardown callback function + * + * Same as @cpuhp_setup_state except that no calls are executed are invoked + * during installation of this callback. NOP if SMP=n or HOTPLUG_CPU=n. + */ +static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + return __cpuhp_setup_state(state, name, false, startup, teardown); +} + +void __cpuhp_remove_state(enum cpuhp_state state, bool invoke); + +/** + * cpuhp_remove_state - Remove hotplug state callbacks and invoke the teardown + * @state: The state for which the calls are removed + * + * Removes the callback functions and invokes the teardown callback on + * the present cpus which have already reached the @state. 
+ */ +static inline void cpuhp_remove_state(enum cpuhp_state state) +{ + __cpuhp_remove_state(state, true); +} + +/** + * cpuhp_remove_state_nocalls - Remove hotplug state callbacks without invoking + * teardown + * @state: The state for which the calls are removed + */ +static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state) +{ + __cpuhp_remove_state(state, false); +} + #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index be9335d..b5eacb9 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -973,6 +973,14 @@ static struct cpuhp_step cpuhp_ap_states[] = { }, }; +/* Sanity check for callbacks */ +static int cpuhp_cb_check(enum cpuhp_state state) +{ + if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE) + return -EINVAL; + return 0; +} + static bool cpuhp_is_ap_state(enum cpuhp_state state) { return (state > CPUHP_AP_OFFLINE && state < CPUHP_AP_ONLINE); @@ -986,6 +994,222 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) return sp + state; } +static void cpuhp_store_callbacks(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + /* (Un)Install the callbacks for further cpu hotplug operations */ + struct cpuhp_step *sp; + + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(state); + sp->startup = startup; + sp->teardown = teardown; + sp->name = name; + mutex_unlock(&cpuhp_state_mutex); +} + +static void *cpuhp_get_teardown_cb(enum cpuhp_state state) +{ + return cpuhp_get_step(state)->teardown; +} + +/* Helper function to run callback on the target cpu */ +static void cpuhp_on_cpu_cb(void *__cb) +{ + int (*cb)(unsigned int cpu) = __cb; + + BUG_ON(cb(smp_processor_id())); +} + +/* + * Call the startup/teardown function for a step either on the AP or + * on the current CPU. + */ +static int cpuhp_issue_call(int cpu, enum cpuhp_state state, + int (*cb)(unsigned int), bool bringup) +{ + int ret; + + if (!cb) + return 0; + + /* + * This invokes the callback directly for now. In a later step we + * convert that to use cpuhp_invoke_callback(). + */ + if (cpuhp_is_ap_state(state)) { + /* + * Note, that a function called on the AP is not + * allowed to fail. + */ + if (cpu_online(cpu)) + smp_call_function_single(cpu, cpuhp_on_cpu_cb, cb, 1); + return 0; + } + + /* + * The non AP bound callbacks can fail on bringup. On teardown + * e.g. module removal we crash for now. + */ + ret = cb(cpu); + BUG_ON(ret && !bringup); + return ret; +} + +/* + * Called from __cpuhp_setup_state on a recoverable failure. + * + * Note: The teardown callbacks for rollback are not allowed to fail! + */ +static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, + int (*teardown)(unsigned int cpu)) +{ + int cpu; + + if (!teardown) + return; + + /* Roll back the already executed steps on the other cpus */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpu >= failedcpu) + break; + + /* Did we invoke the startup call on that cpu ? */ + if (cpustate >= state) + cpuhp_issue_call(cpu, state, teardown, false); + } +} + +/* + * Returns a free for dynamic slot assignment of the Online state. The states + * are protected by the cpuhp_slot_states mutex and an empty slot is identified + * by having no name assigned. 
+ */ +static int cpuhp_reserve_state(enum cpuhp_state state) +{ + enum cpuhp_state i; + + mutex_lock(&cpuhp_state_mutex); + for (i = CPUHP_ONLINE_DYN; i <= CPUHP_ONLINE_DYN_END; i++) { + if (cpuhp_bp_states[i].name) + continue; + + cpuhp_bp_states[i].name = "Reserved"; + mutex_unlock(&cpuhp_state_mutex); + return i; + } + mutex_unlock(&cpuhp_state_mutex); + WARN(1, "No more dynamic states available for CPU hotplug\n"); + return -ENOSPC; +} + +/** + * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state + * @state: The state to setup + * @invoke: If true, the startup function is invoked for cpus where + * cpu state >= @state + * @startup: startup callback function + * @teardown: teardown callback function + * + * Returns 0 if successful, otherwise a proper error code + */ +int __cpuhp_setup_state(enum cpuhp_state state, + const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + int cpu, ret = 0; + int dyn_state = 0; + + if (cpuhp_cb_check(state) || !name) + return -EINVAL; + + get_online_cpus(); + + /* currently assignments for the ONLINE state are possible */ + if (state == CPUHP_ONLINE_DYN) { + dyn_state = 1; + ret = cpuhp_reserve_state(state); + if (ret < 0) + goto out; + state = ret; + } + + cpuhp_store_callbacks(state, name, startup, teardown); + + if (!invoke || !startup) + goto out; + + /* + * Try to call the startup callback for each present cpu + * depending on the hotplug state of the cpu. + */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate < state) + continue; + + ret = cpuhp_issue_call(cpu, state, startup, true); + if (ret) { + cpuhp_rollback_install(cpu, state, teardown); + cpuhp_store_callbacks(state, NULL, NULL, NULL); + goto out; + } + } +out: + put_online_cpus(); + if (!ret && dyn_state) + return state; + return ret; +} +EXPORT_SYMBOL(__cpuhp_setup_state); + +/** + * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state + * @state: The state to remove + * @invoke: If true, the teardown function is invoked for cpus where + * cpu state >= @state + * + * The teardown callback is currently not allowed to fail. Think + * about module removal! + */ +void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) +{ + int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state); + int cpu; + + BUG_ON(cpuhp_cb_check(state)); + + get_online_cpus(); + + if (!invoke || !teardown) + goto remove; + + /* + * Call the teardown callback for each present cpu depending + * on the hotplug state of the cpu. This function is not + * allowed to fail currently! + */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate >= state) + cpuhp_issue_call(cpu, state, teardown, false); + } +remove: + cpuhp_store_callbacks(state, NULL, NULL, NULL); + put_online_cpus(); +} +EXPORT_SYMBOL(__cpuhp_remove_state); + #if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU) static ssize_t show_cpuhp_state(struct device *dev, struct device_attribute *attr, char *buf) -- cgit v0.10.2 From 949338e35131c551f7bf54f48a2e3a227af6721b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:35 +0000 Subject: cpu/hotplug: Move scheduler cpu_online notifier to hotplug core Move the scheduler cpu online notifier part to the hotplug core. This is anyway the highest priority callback and we need that functionality right now for the next changes. 
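As context for conversions like this one, a hedged sketch of how a driver with no ordering constraints might use the setup/removal interface added in the previous patch; the mydrv_* names and callbacks are hypothetical, and with the current core the teardown callback is not allowed to fail:

#include <linux/cpuhotplug.h>
#include <linux/module.h>

/* Hypothetical bringup callback: invoked for every present cpu that has
 * already reached the state at registration time, and for each later
 * cpu online. */
static int mydrv_cpu_online(unsigned int cpu)
{
	return 0;
}

/* Hypothetical teardown callback: invoked on cpu offline and on
 * cpuhp_remove_state(); must return 0 for now. */
static int mydrv_cpu_dead(unsigned int cpu)
{
	return 0;
}

static int mydrv_hp_state;

static int __init mydrv_init(void)
{
	int ret;

	/* CPUHP_ONLINE_DYN hands back a dynamically allocated state
	 * number on success, a negative error code otherwise. */
	ret = cpuhp_setup_state(CPUHP_ONLINE_DYN, "mydrv:online",
				mydrv_cpu_online, mydrv_cpu_dead);
	if (ret < 0)
		return ret;
	mydrv_hp_state = ret;
	return 0;
}

static void __exit mydrv_exit(void)
{
	/* Invokes mydrv_cpu_dead() on all cpus that reached the state. */
	cpuhp_remove_state(mydrv_hp_state);
}

module_init(mydrv_init);
module_exit(mydrv_exit);
MODULE_LICENSE("GPL");
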
Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.200791046@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 2993526..2f2e5d9 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -10,6 +10,7 @@ enum cpuhp_state { CPUHP_AP_NOTIFY_STARTING, CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, + CPUHP_CPU_SET_ACTIVE, CPUHP_NOTIFY_ONLINE, CPUHP_ONLINE_DYN, CPUHP_ONLINE_DYN_END = CPUHP_ONLINE_DYN + 30, diff --git a/kernel/cpu.c b/kernel/cpu.c index b5eacb9..65e34d3 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -666,6 +666,19 @@ void notify_cpu_starting(unsigned int cpu) } } +/* + * Called from the idle task. We need to set active here, so we can kick off + * the stopper thread. + */ +static int cpuhp_set_cpu_active(unsigned int cpu) +{ + /* The cpu is marked online, set it active now */ + set_cpu_active(cpu, true); + /* Unpark the stopper thread */ + stop_machine_unpark(cpu); + return 0; +} + static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) { for (st->state--; st->state > st->target; st->state--) { @@ -941,6 +954,11 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown = takedown_cpu, .cant_stop = true, }, + [CPUHP_CPU_SET_ACTIVE] = { + .name = "cpu:active", + .startup = cpuhp_set_cpu_active, + .teardown = NULL, + }, [CPUHP_NOTIFY_ONLINE] = { .name = "notify:online", .startup = notify_online, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9503d59..6266463 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5692,16 +5692,6 @@ static int sched_cpu_active(struct notifier_block *nfb, set_cpu_rq_start_time(); return NOTIFY_OK; - case CPU_ONLINE: - /* - * At this point a starting CPU has marked itself as online via - * set_cpu_online(). But it might not yet have marked itself - * as active, which is essential from here on. - */ - set_cpu_active(cpu, true); - stop_machine_unpark(cpu); - return NOTIFY_OK; - case CPU_DOWN_FAILED: set_cpu_active(cpu, true); return NOTIFY_OK; -- cgit v0.10.2 From 931ef163309ee955611f287dc65248b39a65fc9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:36 +0000 Subject: cpu/hotplug: Unpark smpboot threads from the state machine Handle the smpboot threads in the state machine. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. 
Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.295777684@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 78989f2..83f3576 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -78,7 +78,7 @@ enum { /* migration should happen before other stuff but after perf */ CPU_PRI_PERF = 20, CPU_PRI_MIGRATION = 10, - CPU_PRI_SMPBOOT = 9, + /* bring up workqueues before normal notifiers and down after */ CPU_PRI_WORKQUEUE_UP = 5, CPU_PRI_WORKQUEUE_DOWN = -5, @@ -172,7 +172,6 @@ static inline void __unregister_cpu_notifier(struct notifier_block *nb) } #endif -void smpboot_thread_init(void); int cpu_up(unsigned int cpu); void notify_cpu_starting(unsigned int cpu); extern void cpu_maps_update_begin(void); @@ -221,10 +220,6 @@ static inline void cpu_notifier_register_done(void) { } -static inline void smpboot_thread_init(void) -{ -} - #endif /* CONFIG_SMP */ extern struct bus_type cpu_subsys; diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 2f2e5d9..3867910 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -11,6 +11,7 @@ enum cpuhp_state { CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, CPUHP_CPU_SET_ACTIVE, + CPUHP_SMPBOOT_THREADS, CPUHP_NOTIFY_ONLINE, CPUHP_ONLINE_DYN, CPUHP_ONLINE_DYN_END = CPUHP_ONLINE_DYN + 30, diff --git a/init/main.c b/init/main.c index c2ea723..55563fd 100644 --- a/init/main.c +++ b/init/main.c @@ -388,7 +388,6 @@ static noinline void __init_refok rest_init(void) int pid; rcu_scheduler_starting(); - smpboot_thread_init(); /* * We need to spawn init first so that it obtains pid 1, however * the init task will end up wanting to create kthreads, which, if diff --git a/kernel/cpu.c b/kernel/cpu.c index 65e34d3..3ec86bc 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -481,8 +481,6 @@ static int takedown_cpu(unsigned int cpu) else synchronize_rcu(); - smpboot_park_threads(cpu); - /* * Prevent irq alloc/free while the dying cpu reorganizes the * interrupt affinities. @@ -612,38 +610,6 @@ int cpu_down(unsigned int cpu) EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ -/* - * Unpark per-CPU smpboot kthreads at CPU-online time. 
- */ -static int smpboot_thread_call(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - - case CPU_DOWN_FAILED: - case CPU_ONLINE: - smpboot_unpark_threads(cpu); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block smpboot_thread_notifier = { - .notifier_call = smpboot_thread_call, - .priority = CPU_PRI_SMPBOOT, -}; - -void smpboot_thread_init(void) -{ - register_cpu_notifier(&smpboot_thread_notifier); -} - /** * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers * @cpu: cpu that just started @@ -959,6 +925,11 @@ static struct cpuhp_step cpuhp_bp_states[] = { .startup = cpuhp_set_cpu_active, .teardown = NULL, }, + [CPUHP_SMPBOOT_THREADS] = { + .name = "smpboot:threads", + .startup = smpboot_unpark_threads, + .teardown = smpboot_park_threads, + }, [CPUHP_NOTIFY_ONLINE] = { .name = "notify:online", .startup = notify_online, diff --git a/kernel/smpboot.c b/kernel/smpboot.c index d264f59..13bc43d 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -226,7 +226,7 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp kthread_unpark(tsk); } -void smpboot_unpark_threads(unsigned int cpu) +int smpboot_unpark_threads(unsigned int cpu) { struct smp_hotplug_thread *cur; @@ -235,6 +235,7 @@ void smpboot_unpark_threads(unsigned int cpu) if (cpumask_test_cpu(cpu, cur->cpumask)) smpboot_unpark_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); + return 0; } static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) @@ -245,7 +246,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) kthread_park(tsk); } -void smpboot_park_threads(unsigned int cpu) +int smpboot_park_threads(unsigned int cpu) { struct smp_hotplug_thread *cur; @@ -253,6 +254,7 @@ void smpboot_park_threads(unsigned int cpu) list_for_each_entry_reverse(cur, &hotplug_threads, list) smpboot_park_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); + return 0; } static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 72415a0..6b5f020 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -14,7 +14,7 @@ static inline void idle_threads_init(void) { } #endif int smpboot_create_threads(unsigned int cpu); -void smpboot_park_threads(unsigned int cpu); -void smpboot_unpark_threads(unsigned int cpu); +int smpboot_park_threads(unsigned int cpu); +int smpboot_unpark_threads(unsigned int cpu); #endif -- cgit v0.10.2 From 2e1a3483ce74d197876e58281925dd4e378a7f28 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:37 +0000 Subject: cpu/hotplug: Split out the state walk into functions We need that for running callbacks on the AP and the BP. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. 
Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.374946234@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/kernel/cpu.c b/kernel/cpu.c index 3ec86bc..9572ca0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -329,10 +329,74 @@ static int bringup_cpu(unsigned int cpu) return 0; } +/* + * Hotplug state machine related functions + */ +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps) +{ + for (st->state++; st->state < st->target; st->state++) { + struct cpuhp_step *step = steps + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->startup); + } +} + +static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + for (; st->state > target; st->state--) { + struct cpuhp_step *step = steps + st->state; + + ret = cpuhp_invoke_callback(cpu, st->state, step->teardown); + if (ret) { + st->target = prev_state; + undo_cpu_down(cpu, st, steps); + break; + } + } + return ret; +} + +static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps) +{ + for (st->state--; st->state > st->target; st->state--) { + struct cpuhp_step *step = steps + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->teardown); + } +} + +static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + while (st->state < target) { + struct cpuhp_step *step; + + st->state++; + step = steps + st->state; + ret = cpuhp_invoke_callback(cpu, st->state, step->startup); + if (ret) { + st->target = prev_state; + undo_cpu_up(cpu, st, steps); + break; + } + } + return ret; +} + #ifdef CONFIG_HOTPLUG_CPU EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); - void unregister_cpu_notifier(struct notifier_block *nb) { cpu_maps_update_begin(); @@ -537,15 +601,6 @@ static int notify_dead(unsigned int cpu) #endif #ifdef CONFIG_HOTPLUG_CPU -static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - for (st->state++; st->state < st->target; st->state++) { - struct cpuhp_step *step = cpuhp_bp_states + st->state; - - if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, step->startup); - } -} /* Requires cpu_add_remove_lock to be held */ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, @@ -567,16 +622,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, prev_state = st->state; st->target = target; - for (; st->state > st->target; st->state--) { - struct cpuhp_step *step = cpuhp_bp_states + st->state; + ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); - ret = cpuhp_invoke_callback(cpu, st->state, step->teardown); - if (ret) { - st->target = prev_state; - undo_cpu_down(cpu, st); - break; - } - } hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE; cpu_hotplug_done(); @@ -645,22 +692,12 @@ static int cpuhp_set_cpu_active(unsigned int cpu) return 0; } -static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) -{ - for (st->state--; st->state > st->target; st->state--) { - struct cpuhp_step *step = cpuhp_bp_states + st->state; - 
- if (!step->skip_onerr) - cpuhp_invoke_callback(cpu, st->state, step->teardown); - } -} - /* Requires cpu_add_remove_lock to be held */ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle; - int prev_state, ret = 0; + int ret = 0; cpu_hotplug_begin(); @@ -687,20 +724,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) cpuhp_tasks_frozen = tasks_frozen; - prev_state = st->state; st->target = target; - while (st->state < st->target) { - struct cpuhp_step *step; - - st->state++; - step = cpuhp_bp_states + st->state; - ret = cpuhp_invoke_callback(cpu, st->state, step->startup); - if (ret) { - st->target = prev_state; - undo_cpu_up(cpu, st); - break; - } - } + ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); out: cpu_hotplug_done(); return ret; -- cgit v0.10.2 From 4cb28ced23c4f222ff4e3f39898017e52161a9c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:38 +0000 Subject: cpu/hotplug: Create hotplug threads In order to let the hotplugged cpu take care of the setup/teardown, we need a seperate hotplug thread. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.454541272@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/kernel/cpu.c b/kernel/cpu.c index 9572ca0..9048c33 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -33,10 +34,24 @@ * cpuhp_cpu_state - Per cpu hotplug state storage * @state: The current cpu state * @target: The target state + * @thread: Pointer to the hotplug thread + * @should_run: Thread should execute + * @cb_stat: The state for a single callback (install/uninstall) + * @cb: Single callback function (install/uninstall) + * @result: Result of the operation + * @done: Signal completion to the issuer of the task */ struct cpuhp_cpu_state { enum cpuhp_state state; enum cpuhp_state target; +#ifdef CONFIG_SMP + struct task_struct *thread; + bool should_run; + enum cpuhp_state cb_state; + int (*cb)(unsigned int cpu); + int result; + struct completion done; +#endif }; static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); @@ -394,6 +409,134 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, return ret; } +/* + * The cpu hotplug threads manage the bringup and teardown of the cpus + */ +static void cpuhp_create(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + init_completion(&st->done); +} + +static int cpuhp_should_run(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + + return st->should_run; +} + +/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */ +static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + enum cpuhp_state target = max((int)st->target, CPUHP_AP_ONLINE); + + return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target); +} + +/* Execute the online startup callbacks. 
Used to be CPU_ONLINE */ +static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target); +} + +/* + * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke + * callbacks when a state gets [un]installed at runtime. + */ +static void cpuhp_thread_fun(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + int ret = 0; + + /* + * Paired with the mb() in cpuhp_kick_ap_work and + * cpuhp_invoke_ap_callback, so the work set is consistent visible. + */ + smp_mb(); + if (!st->should_run) + return; + + st->should_run = false; + + /* Single callback invocation for [un]install ? */ + if (st->cb) { + if (st->cb_state < CPUHP_AP_ONLINE) { + local_irq_disable(); + ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + local_irq_enable(); + } else { + ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + } + } else { + /* Regular hotplug work */ + if (st->state < st->target) + ret = cpuhp_ap_online(cpu, st); + else if (st->state > st->target) + ret = cpuhp_ap_offline(cpu, st); + } + st->result = ret; + complete(&st->done); +} + +/* Invoke a single callback on a remote cpu */ +static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, + int (*cb)(unsigned int)) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + if (!cpu_online(cpu)) + return 0; + + st->cb_state = state; + st->cb = cb; + /* + * Make sure the above stores are visible before should_run becomes + * true. Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); + wait_for_completion(&st->done); + return st->result; +} + +/* Regular hotplug invocation of the AP hotplug thread */ +static int cpuhp_kick_ap_work(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + enum cpuhp_state state = st->state; + + trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); + st->result = 0; + st->cb = NULL; + /* + * Make sure the above stores are visible before should_run becomes + * true. 
Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); + wait_for_completion(&st->done); + trace_cpuhp_exit(cpu, st->state, state, st->result); + return st->result; +} + +static struct smp_hotplug_thread cpuhp_threads = { + .store = &cpuhp_state.thread, + .create = &cpuhp_create, + .thread_should_run = cpuhp_should_run, + .thread_fn = cpuhp_thread_fun, + .thread_comm = "cpuhp/%u", + .selfparking = true, +}; + +void __init cpuhp_threads_init(void) +{ + BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads)); + kthread_unpark(this_cpu_read(cpuhp_state.thread)); +} + #ifdef CONFIG_HOTPLUG_CPU EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); @@ -997,7 +1140,7 @@ static int cpuhp_cb_check(enum cpuhp_state state) static bool cpuhp_is_ap_state(enum cpuhp_state state) { - return (state > CPUHP_AP_OFFLINE && state < CPUHP_AP_ONLINE); + return (state >= CPUHP_AP_OFFLINE && state <= CPUHP_AP_ONLINE); } static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) diff --git a/kernel/smp.c b/kernel/smp.c index d903c02..822ffb1 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -569,6 +569,7 @@ void __init smp_init(void) unsigned int cpu; idle_threads_init(); + cpuhp_threads_init(); /* FIXME: This should be done in userspace --RR */ for_each_present_cpu(cpu) { diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 6b5f020..485b81c 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -17,4 +17,6 @@ int smpboot_create_threads(unsigned int cpu); int smpboot_park_threads(unsigned int cpu); int smpboot_unpark_threads(unsigned int cpu); +void __init cpuhp_threads_init(void); + #endif -- cgit v0.10.2 From 1cf4f629d9d246519a1e76c021806f2a51ddba4d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:39 +0000 Subject: cpu/hotplug: Move online calls to hotplugged cpu Let the hotplugged cpu invoke the setup/teardown callbacks (CPU_ONLINE/CPU_DOWN_PREPARE) itself. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.536364371@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 3867910..8a715bb 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -11,10 +11,12 @@ enum cpuhp_state { CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, CPUHP_CPU_SET_ACTIVE, - CPUHP_SMPBOOT_THREADS, - CPUHP_NOTIFY_ONLINE, - CPUHP_ONLINE_DYN, - CPUHP_ONLINE_DYN_END = CPUHP_ONLINE_DYN + 30, + CPUHP_KICK_AP_THREAD, + CPUHP_BP_ONLINE, + CPUHP_AP_SMPBOOT_THREADS, + CPUHP_AP_NOTIFY_ONLINE, + CPUHP_AP_ONLINE_DYN, + CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, CPUHP_ONLINE, }; diff --git a/kernel/cpu.c b/kernel/cpu.c index 9048c33..e220e56 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -429,7 +429,7 @@ static int cpuhp_should_run(unsigned int cpu) /* Execute the teardown callbacks. 
Used to be CPU_DOWN_PREPARE */ static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) { - enum cpuhp_state target = max((int)st->target, CPUHP_AP_ONLINE); + enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target); } @@ -469,6 +469,9 @@ static void cpuhp_thread_fun(unsigned int cpu) ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); } } else { + /* Cannot happen .... */ + BUG_ON(st->state < CPUHP_KICK_AP_THREAD); + /* Regular hotplug work */ if (st->state < st->target) ret = cpuhp_ap_online(cpu, st); @@ -502,12 +505,8 @@ static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, } /* Regular hotplug invocation of the AP hotplug thread */ -static int cpuhp_kick_ap_work(unsigned int cpu) +static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) { - struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); - enum cpuhp_state state = st->state; - - trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); st->result = 0; st->cb = NULL; /* @@ -517,6 +516,15 @@ static int cpuhp_kick_ap_work(unsigned int cpu) smp_mb(); st->should_run = true; wake_up_process(st->thread); +} + +static int cpuhp_kick_ap_work(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + enum cpuhp_state state = st->state; + + trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); + __cpuhp_kick_ap_work(st); wait_for_completion(&st->done); trace_cpuhp_exit(cpu, st->state, state, st->result); return st->result; @@ -688,6 +696,9 @@ static int takedown_cpu(unsigned int cpu) else synchronize_rcu(); + /* Park the hotplug thread */ + kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread); + /* * Prevent irq alloc/free while the dying cpu reorganizes the * interrupt affinities. @@ -765,10 +776,34 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, prev_state = st->state; st->target = target; + /* + * If the current CPU state is in the range of the AP hotplug thread, + * then we need to kick the thread. + */ + if (st->state >= CPUHP_KICK_AP_THREAD) { + ret = cpuhp_kick_ap_work(cpu); + /* + * The AP side has done the error rollback already. Just + * return the error code.. + */ + if (ret) + goto out; + + /* + * We might have stopped still in the range of the AP hotplug + * thread. Nothing to do anymore. + */ + if (st->state >= CPUHP_KICK_AP_THREAD) + goto out; + } + /* + * The AP brought itself down below CPUHP_KICK_AP_THREAD. So we need + * to do the further cleanups. + */ ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE; - +out: cpu_hotplug_done(); /* This post dead nonsense must die */ if (!ret && hasdied) @@ -828,10 +863,13 @@ void notify_cpu_starting(unsigned int cpu) */ static int cpuhp_set_cpu_active(unsigned int cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + /* The cpu is marked online, set it active now */ set_cpu_active(cpu, true); - /* Unpark the stopper thread */ + /* Unpark the stopper thread and the hotplug thread */ stop_machine_unpark(cpu); + kthread_unpark(st->thread); return 0; } @@ -868,6 +906,26 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) cpuhp_tasks_frozen = tasks_frozen; st->target = target; + /* + * If the current CPU state is in the range of the AP hotplug thread, + * then we need to kick the thread once more. 
+ */ + if (st->state >= CPUHP_KICK_AP_THREAD) { + ret = cpuhp_kick_ap_work(cpu); + /* + * The AP side has done the error rollback already. Just + * return the error code.. + */ + if (ret) + goto out; + } + + /* + * Try to reach the target state. We max out on the BP at + * CPUHP_KICK_AP_THREAD. After that the AP hotplug thread is + * responsible for bringing it up to the target state. + */ + target = min((int)target, CPUHP_KICK_AP_THREAD); ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); out: cpu_hotplug_done(); @@ -1093,19 +1151,13 @@ static struct cpuhp_step cpuhp_bp_states[] = { .startup = cpuhp_set_cpu_active, .teardown = NULL, }, - [CPUHP_SMPBOOT_THREADS] = { - .name = "smpboot:threads", - .startup = smpboot_unpark_threads, - .teardown = smpboot_park_threads, - }, - [CPUHP_NOTIFY_ONLINE] = { - .name = "notify:online", - .startup = notify_online, - .teardown = notify_down_prepare, - .cant_stop = true, + [CPUHP_KICK_AP_THREAD] = { + .name = "cpuhp:kickthread", + .startup = cpuhp_kick_ap_work, + .teardown = cpuhp_kick_ap_work, }, #endif - [CPUHP_ONLINE] = { + [CPUHP_BP_ONLINE] = { .name = "online", .startup = NULL, .teardown = NULL, @@ -1122,6 +1174,16 @@ static struct cpuhp_step cpuhp_ap_states[] = { .skip_onerr = true, .cant_stop = true, }, + [CPUHP_AP_SMPBOOT_THREADS] = { + .name = "smpboot:threads", + .startup = smpboot_unpark_threads, + .teardown = smpboot_park_threads, + }, + [CPUHP_AP_NOTIFY_ONLINE] = { + .name = "notify:online", + .startup = notify_online, + .teardown = notify_down_prepare, + }, #endif [CPUHP_ONLINE] = { .name = "online", @@ -1140,7 +1202,9 @@ static int cpuhp_cb_check(enum cpuhp_state state) static bool cpuhp_is_ap_state(enum cpuhp_state state) { - return (state >= CPUHP_AP_OFFLINE && state <= CPUHP_AP_ONLINE); + if (state >= CPUHP_AP_OFFLINE && state <= CPUHP_AP_ONLINE) + return true; + return state > CPUHP_BP_ONLINE; } static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) @@ -1172,14 +1236,6 @@ static void *cpuhp_get_teardown_cb(enum cpuhp_state state) return cpuhp_get_step(state)->teardown; } -/* Helper function to run callback on the target cpu */ -static void cpuhp_on_cpu_cb(void *__cb) -{ - int (*cb)(unsigned int cpu) = __cb; - - BUG_ON(cb(smp_processor_id())); -} - /* * Call the startup/teardown function for a step either on the AP or * on the current CPU. @@ -1191,26 +1247,18 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, if (!cb) return 0; - - /* - * This invokes the callback directly for now. In a later step we - * convert that to use cpuhp_invoke_callback(). - */ - if (cpuhp_is_ap_state(state)) { - /* - * Note, that a function called on the AP is not - * allowed to fail. - */ - if (cpu_online(cpu)) - smp_call_function_single(cpu, cpuhp_on_cpu_cb, cb, 1); - return 0; - } - /* * The non AP bound callbacks can fail on bringup. On teardown * e.g. module removal we crash for now. 
*/ - ret = cb(cpu); +#ifdef CONFIG_SMP + if (cpuhp_is_ap_state(state)) + ret = cpuhp_invoke_ap_callback(cpu, state, cb); + else + ret = cpuhp_invoke_callback(cpu, state, cb); +#else + ret = cpuhp_invoke_callback(cpu, state, cb); +#endif BUG_ON(ret && !bringup); return ret; } @@ -1252,11 +1300,11 @@ static int cpuhp_reserve_state(enum cpuhp_state state) enum cpuhp_state i; mutex_lock(&cpuhp_state_mutex); - for (i = CPUHP_ONLINE_DYN; i <= CPUHP_ONLINE_DYN_END; i++) { - if (cpuhp_bp_states[i].name) + for (i = CPUHP_AP_ONLINE_DYN; i <= CPUHP_AP_ONLINE_DYN_END; i++) { + if (cpuhp_ap_states[i].name) continue; - cpuhp_bp_states[i].name = "Reserved"; + cpuhp_ap_states[i].name = "Reserved"; mutex_unlock(&cpuhp_state_mutex); return i; } @@ -1289,7 +1337,7 @@ int __cpuhp_setup_state(enum cpuhp_state state, get_online_cpus(); /* currently assignments for the ONLINE state are possible */ - if (state == CPUHP_ONLINE_DYN) { + if (state == CPUHP_AP_ONLINE_DYN) { dyn_state = 1; ret = cpuhp_reserve_state(state); if (ret < 0) -- cgit v0.10.2 From fc6d73d67436e7784758a831227bd019547a3f73 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:40 +0000 Subject: arch/hotplug: Call into idle with a proper state Let the non boot cpus call into idle with the corresponding hotplug state, so the hotplug core can handle the further bringup. That's a first step to convert the boot side of the hotplugged cpus to do all the synchronization with the other side through the state machine. For now it'll only start the hotplug thread and kick the full bringup of the cpu. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.614102639@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 2f24447f..46bf263 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -168,7 +168,7 @@ smp_callin(void) cpuid, current, current->active_mm)); preempt_disable(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } /* Wait until hwrpb->txrdy is clear for cpu. Return -1 on timeout. 
*/ diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c index 424e937..4cb3add 100644 --- a/arch/arc/kernel/smp.c +++ b/arch/arc/kernel/smp.c @@ -142,7 +142,7 @@ void start_kernel_secondary(void) local_irq_enable(); preempt_disable(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } /* diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 37312f6..baee702 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -409,7 +409,7 @@ asmlinkage void secondary_start_kernel(void) /* * OK, it's off to the idle thread for us */ - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } void __init smp_cpus_done(unsigned int max_cpus) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index b1adc51..4607657 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -195,7 +195,7 @@ asmlinkage void secondary_start_kernel(void) /* * OK, it's off to the idle thread for us */ - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c index 0030e21..23c4ef5 100644 --- a/arch/blackfin/mach-common/smp.c +++ b/arch/blackfin/mach-common/smp.c @@ -333,7 +333,7 @@ void secondary_start_kernel(void) /* We are done with local CPU inits, unblock the boot CPU. */ set_cpu_online(cpu, true); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } void __init smp_prepare_boot_cpu(void) diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c index ff759f2..983bae7 100644 --- a/arch/hexagon/kernel/smp.c +++ b/arch/hexagon/kernel/smp.c @@ -180,7 +180,7 @@ void start_secondary(void) local_irq_enable(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 0e76fad..74fe317 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -454,7 +454,7 @@ start_secondary (void *unused) preempt_disable(); smp_callin(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); return 0; } diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c index a468467..f98d2f6 100644 --- a/arch/m32r/kernel/smpboot.c +++ b/arch/m32r/kernel/smpboot.c @@ -432,7 +432,7 @@ int __init start_secondary(void *unused) */ local_flush_tlb_all(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); return 0; } diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c index c3c6f08..bad1323 100644 --- a/arch/metag/kernel/smp.c +++ b/arch/metag/kernel/smp.c @@ -396,7 +396,7 @@ asmlinkage void secondary_start_kernel(void) /* * OK, it's off to the idle thread for us */ - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } void __init smp_cpus_done(unsigned int max_cpus) diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index bd4385a..f2112a8 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -191,7 +191,7 @@ asmlinkage void start_secondary(void) WARN_ON_ONCE(!irqs_disabled()); mp_ops->smp_finish(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } static void stop_this_cpu(void *dummy) diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c index f984193..426173c 100644 --- a/arch/mn10300/kernel/smp.c +++ b/arch/mn10300/kernel/smp.c @@ -675,7 +675,7 @@ int __init start_secondary(void *unused) #ifdef CONFIG_GENERIC_CLOCKEVENTS 
init_clockevents(); #endif - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); return 0; } diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 52e8597..c2a9cc5 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -305,7 +305,7 @@ void __init smp_callin(void) local_irq_enable(); /* Interrupts have been off until now */ - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); /* NOTREACHED */ panic("smp_callin() AAAAaaaaahhhh....\n"); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index ec9ec20..cc13d4c 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -727,7 +727,7 @@ void start_secondary(void *unused) local_irq_enable(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); BUG(); } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 3c65a8e..40a6b4f 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -798,7 +798,7 @@ static void smp_start_secondary(void *cpuvoid) set_cpu_online(smp_processor_id(), true); inc_irq_stat(CPU_RST); local_irq_enable(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } /* Upping and downing of CPUs */ diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index de6be00..13f633a 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -203,7 +203,7 @@ asmlinkage void start_secondary(void) set_cpu_online(cpu, true); per_cpu(cpu_state, cpu) = CPU_ONLINE; - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } extern struct { diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index b3a5d81..fb30e7c 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -364,7 +364,7 @@ static void sparc_start_secondary(void *arg) local_irq_enable(); wmb(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); /* We should never reach here! 
*/ BUG(); diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 19cd08d..8a6151a 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -134,7 +134,7 @@ void smp_callin(void) local_irq_enable(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } void cpu_panic(void) diff --git a/arch/tile/kernel/smpboot.c b/arch/tile/kernel/smpboot.c index 20d52a9..6c0abaa 100644 --- a/arch/tile/kernel/smpboot.c +++ b/arch/tile/kernel/smpboot.c @@ -208,7 +208,7 @@ void online_secondary(void) /* Set up tile-timer clock-event device on this cpu */ setup_tile_timer(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } int __cpu_up(unsigned int cpu, struct task_struct *tidle) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 24d57f7..293b22a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -248,7 +248,7 @@ static void notrace start_secondary(void *unused) x86_cpuinit.setup_percpu_clockev(); wmb(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } void __init smp_store_boot_cpu_info(void) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 3f4ebf0..3c6d17f 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -112,7 +112,7 @@ asmlinkage __visible void cpu_bringup_and_idle(int cpu) xen_pvh_secondary_vcpu_init(cpu); #endif cpu_bringup(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } static void xen_smp_intr_free(unsigned int cpu) diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c index 4d02e38..fc4ad21 100644 --- a/arch/xtensa/kernel/smp.c +++ b/arch/xtensa/kernel/smp.c @@ -157,7 +157,7 @@ void secondary_start_kernel(void) complete(&cpu_running); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } static void mx_cpu_start(void *p) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 8a715bb..4aa263a 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -13,6 +13,7 @@ enum cpuhp_state { CPUHP_CPU_SET_ACTIVE, CPUHP_KICK_AP_THREAD, CPUHP_BP_ONLINE, + CPUHP_AP_ONLINE_IDLE, CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_NOTIFY_ONLINE, CPUHP_AP_ONLINE_DYN, -- cgit v0.10.2 From 8df3e07e7f21f2ed8d001e6fabf9505946b438aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:41 +0000 Subject: cpu/hotplug: Let upcoming cpu bring itself fully up Let the upcoming cpu kick the hotplug thread and let itself complete the bringup. That way the controll side can just wait for the completion or later when we made the hotplug machinery async not care at all. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. 
Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.697655464@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 4aa263a..ad5d7fc 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -10,9 +10,6 @@ enum cpuhp_state { CPUHP_AP_NOTIFY_STARTING, CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, - CPUHP_CPU_SET_ACTIVE, - CPUHP_KICK_AP_THREAD, - CPUHP_BP_ONLINE, CPUHP_AP_ONLINE_IDLE, CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_NOTIFY_ONLINE, @@ -86,4 +83,10 @@ static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state) __cpuhp_remove_state(state, false); } +#ifdef CONFIG_SMP +void cpuhp_online_idle(enum cpuhp_state state); +#else +static inline void cpuhp_online_idle(enum cpuhp_state state) { } +#endif + #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index e220e56..f1f880f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -329,6 +329,14 @@ static int notify_starting(unsigned int cpu) return 0; } +static int bringup_wait_for_ap(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + wait_for_completion(&st->done); + return st->result; +} + static int bringup_cpu(unsigned int cpu) { struct task_struct *idle = idle_thread_get(cpu); @@ -340,8 +348,9 @@ static int bringup_cpu(unsigned int cpu) cpu_notify(CPU_UP_CANCELED, cpu); return ret; } + ret = bringup_wait_for_ap(cpu); BUG_ON(!cpu_online(cpu)); - return 0; + return ret; } /* @@ -470,7 +479,7 @@ static void cpuhp_thread_fun(unsigned int cpu) } } else { /* Cannot happen .... */ - BUG_ON(st->state < CPUHP_KICK_AP_THREAD); + BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); /* Regular hotplug work */ if (st->state < st->target) @@ -780,7 +789,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread. */ - if (st->state >= CPUHP_KICK_AP_THREAD) { + if (st->state > CPUHP_TEARDOWN_CPU) { ret = cpuhp_kick_ap_work(cpu); /* * The AP side has done the error rollback already. Just @@ -793,11 +802,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, * We might have stopped still in the range of the AP hotplug * thread. Nothing to do anymore. */ - if (st->state >= CPUHP_KICK_AP_THREAD) + if (st->state > CPUHP_TEARDOWN_CPU) goto out; } /* - * The AP brought itself down below CPUHP_KICK_AP_THREAD. So we need + * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need * to do the further cleanups. */ ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); @@ -859,18 +868,32 @@ void notify_cpu_starting(unsigned int cpu) /* * Called from the idle task. We need to set active here, so we can kick off - * the stopper thread. + * the stopper thread and unpark the smpboot threads. If the target state is + * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the + * cpu further. 
*/ -static int cpuhp_set_cpu_active(unsigned int cpu) +void cpuhp_online_idle(enum cpuhp_state state) { - struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + unsigned int cpu = smp_processor_id(); + + /* Happens for the boot cpu */ + if (state != CPUHP_AP_ONLINE_IDLE) + return; + + st->state = CPUHP_AP_ONLINE_IDLE; /* The cpu is marked online, set it active now */ set_cpu_active(cpu, true); - /* Unpark the stopper thread and the hotplug thread */ + /* Unpark the stopper thread and the hotplug thread of this cpu */ stop_machine_unpark(cpu); kthread_unpark(st->thread); - return 0; + + /* Should we go further up ? */ + if (st->target > CPUHP_AP_ONLINE_IDLE) + __cpuhp_kick_ap_work(st); + else + complete(&st->done); } /* Requires cpu_add_remove_lock to be held */ @@ -910,7 +933,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread once more. */ - if (st->state >= CPUHP_KICK_AP_THREAD) { + if (st->state > CPUHP_BRINGUP_CPU) { ret = cpuhp_kick_ap_work(cpu); /* * The AP side has done the error rollback already. Just @@ -922,10 +945,10 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) /* * Try to reach the target state. We max out on the BP at - * CPUHP_KICK_AP_THREAD. After that the AP hotplug thread is + * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is * responsible for bringing it up to the target state. */ - target = min((int)target, CPUHP_KICK_AP_THREAD); + target = min((int)target, CPUHP_BRINGUP_CPU); ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); out: cpu_hotplug_done(); @@ -1146,22 +1169,7 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown = takedown_cpu, .cant_stop = true, }, - [CPUHP_CPU_SET_ACTIVE] = { - .name = "cpu:active", - .startup = cpuhp_set_cpu_active, - .teardown = NULL, - }, - [CPUHP_KICK_AP_THREAD] = { - .name = "cpuhp:kickthread", - .startup = cpuhp_kick_ap_work, - .teardown = cpuhp_kick_ap_work, - }, #endif - [CPUHP_BP_ONLINE] = { - .name = "online", - .startup = NULL, - .teardown = NULL, - }, }; /* Application processor state steps */ @@ -1204,7 +1212,7 @@ static bool cpuhp_is_ap_state(enum cpuhp_state state) { if (state >= CPUHP_AP_OFFLINE && state <= CPUHP_AP_ONLINE) return true; - return state > CPUHP_BP_ONLINE; + return state > CPUHP_BRINGUP_CPU; } static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 544a713..a4b9813 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -291,5 +292,6 @@ void cpu_startup_entry(enum cpuhp_state state) boot_init_stack_canary(); #endif arch_cpu_idle_prepare(); + cpuhp_online_idle(state); cpu_idle_loop(); } -- cgit v0.10.2 From e69aab13117efc1987620090e539b4ebeb33a04c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:43 +0000 Subject: cpu/hotplug: Make wait for dead cpu completion based Kill the busy spinning on the control side and just wait for the hotplugged cpu to tell that it reached the dead state. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. 
Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.776157858@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 83f3576..91a48d1 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -276,14 +276,15 @@ void arch_cpu_idle_enter(void); void arch_cpu_idle_exit(void); void arch_cpu_idle_dead(void); -DECLARE_PER_CPU(bool, cpu_dead_idle); - int cpu_report_state(int cpu); int cpu_check_up_prepare(int cpu); void cpu_set_state_online(int cpu); #ifdef CONFIG_HOTPLUG_CPU bool cpu_wait_death(unsigned int cpu, int seconds); bool cpu_report_death(void); +void cpuhp_report_idle_dead(void); +#else +static inline void cpuhp_report_idle_dead(void) { } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ #endif /* _LINUX_CPU_H_ */ diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index ad5d7fc..5d68e15 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -6,6 +6,7 @@ enum cpuhp_state { CPUHP_CREATE_THREADS, CPUHP_NOTIFY_PREPARE, CPUHP_BRINGUP_CPU, + CPUHP_AP_IDLE_DEAD, CPUHP_AP_OFFLINE, CPUHP_AP_NOTIFY_STARTING, CPUHP_AP_ONLINE, diff --git a/kernel/cpu.c b/kernel/cpu.c index f1f880f..0e8c07f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -688,6 +688,7 @@ static int take_cpu_down(void *_param) static int takedown_cpu(unsigned int cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int err; /* @@ -733,10 +734,8 @@ static int takedown_cpu(unsigned int cpu) * * Wait for the stop thread to go away. */ - while (!per_cpu(cpu_dead_idle, cpu)) - cpu_relax(); - smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */ - per_cpu(cpu_dead_idle, cpu) = false; + wait_for_completion(&st->done); + BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); /* Interrupts are moved away from the dying cpu, reenable alloc/free */ irq_unlock_sparse(); @@ -756,6 +755,15 @@ static int notify_dead(unsigned int cpu) return 0; } +void cpuhp_report_idle_dead(void) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + + BUG_ON(st->state != CPUHP_AP_OFFLINE); + st->state = CPUHP_AP_IDLE_DEAD; + complete(&st->done); +} + #else #define notify_down_prepare NULL #define takedown_cpu NULL diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index a4b9813..8abbe89 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -194,8 +194,6 @@ exit_idle: rcu_idle_exit(); } -DEFINE_PER_CPU(bool, cpu_dead_idle); - /* * Generic idle loop implementation * @@ -224,8 +222,7 @@ static void cpu_idle_loop(void) if (cpu_is_offline(smp_processor_id())) { rcu_cpu_notify(NULL, CPU_DYING_IDLE, (void *)(long)smp_processor_id()); - smp_mb(); /* all activity before dead. */ - this_cpu_write(cpu_dead_idle, true); + cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } -- cgit v0.10.2 From 27d50c7eeb0f03c3d3ca72aac4d2dd487ca1f3f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Feb 2016 18:43:44 +0000 Subject: rcu: Make CPU_DYING_IDLE an explicit call Make the RCU CPU_DYING_IDLE callback an explicit function call, so it gets invoked at the proper place. Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Rik van Riel Cc: Rafael Wysocki Cc: "Srivatsa S. 
Bhat" Cc: Peter Zijlstra Cc: Arjan van de Ven Cc: Sebastian Siewior Cc: Rusty Russell Cc: Steven Rostedt Cc: Oleg Nesterov Cc: Tejun Heo Cc: Andrew Morton Cc: Paul McKenney Cc: Linus Torvalds Cc: Paul Turner Link: http://lkml.kernel.org/r/20160226182341.870167933@linutronix.de Signed-off-by: Thomas Gleixner diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 91a48d1..f9b1fab 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -101,9 +101,7 @@ enum { * Called on the new cpu, just before * enabling interrupts. Must not sleep, * must not fail */ -#define CPU_DYING_IDLE 0x000B /* CPU (unsigned)v dying, reached - * idle loop. */ -#define CPU_BROKEN 0x000C /* CPU (unsigned)v did not die properly, +#define CPU_BROKEN 0x000B /* CPU (unsigned)v did not die properly, * perhaps due to preemption. */ /* Used for CPU hotplug events occurring while tasks are frozen due to a suspend diff --git a/include/linux/notifier.h b/include/linux/notifier.h index d14a4c3..4149868 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -47,6 +47,8 @@ * runtime initialization. */ +struct notifier_block; + typedef int (*notifier_fn_t)(struct notifier_block *nb, unsigned long action, void *data); diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 14e6f47..fc46fe3 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -332,9 +332,7 @@ void rcu_init(void); void rcu_sched_qs(void); void rcu_bh_qs(void); void rcu_check_callbacks(int user); -struct notifier_block; -int rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu); +void rcu_report_dead(unsigned int cpu); #ifndef CONFIG_TINY_RCU void rcu_end_inkernel_boot(void); diff --git a/kernel/cpu.c b/kernel/cpu.c index 0e8c07f..ff8059b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -762,6 +762,7 @@ void cpuhp_report_idle_dead(void) BUG_ON(st->state != CPUHP_AP_OFFLINE); st->state = CPUHP_AP_IDLE_DEAD; complete(&st->done); + rcu_report_dead(smp_processor_id()); } #else diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e41dd41..85b4134 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2607,28 +2607,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) } /* - * The CPU is exiting the idle loop into the arch_cpu_idle_dead() - * function. We now remove it from the rcu_node tree's ->qsmaskinit - * bit masks. - */ -static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) -{ - unsigned long flags; - unsigned long mask; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ - - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return; - - /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ - mask = rdp->grpmask; - raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ - rnp->qsmaskinitnext &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -/* * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. Do the remainder of the cleanup, * including orphaning the outgoing CPU's RCU callbacks, and also @@ -4247,6 +4225,43 @@ static void rcu_prepare_cpu(int cpu) rcu_init_percpu_data(cpu, rsp); } +#ifdef CONFIG_HOTPLUG_CPU +/* + * The CPU is exiting the idle loop into the arch_cpu_idle_dead() + * function. We now remove it from the rcu_node tree's ->qsmaskinit + * bit masks. 
+ */ +static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + unsigned long mask; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return; + + /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ + mask = rdp->grpmask; + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ + rnp->qsmaskinitnext &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +void rcu_report_dead(unsigned int cpu) +{ + struct rcu_state *rsp; + + /* QS for any half-done expedited RCU-sched GP. */ + preempt_disable(); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(rcu_sched_state.rda), true); + preempt_enable(); + for_each_rcu_flavor(rsp) + rcu_cleanup_dying_idle_cpu(cpu, rsp); +} +#endif + /* * Handle CPU online/offline notification events. */ @@ -4278,17 +4293,6 @@ int rcu_cpu_notify(struct notifier_block *self, for_each_rcu_flavor(rsp) rcu_cleanup_dying_cpu(rsp); break; - case CPU_DYING_IDLE: - /* QS for any half-done expedited RCU-sched GP. */ - preempt_disable(); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(rcu_sched_state.rda), true); - preempt_enable(); - - for_each_rcu_flavor(rsp) { - rcu_cleanup_dying_idle_cpu(cpu, rsp); - } - break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8abbe89..bd12c6c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -220,8 +220,6 @@ static void cpu_idle_loop(void) rmb(); if (cpu_is_offline(smp_processor_id())) { - rcu_cpu_notify(NULL, CPU_DYING_IDLE, - (void *)(long)smp_processor_id()); cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } -- cgit v0.10.2 From a7f4b98896f6aede26431a3017d32d722fcec5cb Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Fri, 26 Feb 2016 09:15:50 +0100 Subject: bna: fix list corruption Use list_move_tail() to move MAC address entry from list of pending to list of active entries. Simple list_add_tail() leaves the entry also in the first list, this leads to list corruption. Cc: Rasesh Mody Signed-off-by: Ivan Vecera Acked-by: Rasesh Mody Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/brocade/bna/bna_tx_rx.c b/drivers/net/ethernet/brocade/bna/bna_tx_rx.c index 04b0d16..95bc470 100644 --- a/drivers/net/ethernet/brocade/bna/bna_tx_rx.c +++ b/drivers/net/ethernet/brocade/bna/bna_tx_rx.c @@ -987,7 +987,7 @@ bna_rxf_ucast_cfg_apply(struct bna_rxf *rxf) if (!list_empty(&rxf->ucast_pending_add_q)) { mac = list_first_entry(&rxf->ucast_pending_add_q, struct bna_mac, qe); - list_add_tail(&mac->qe, &rxf->ucast_active_q); + list_move_tail(&mac->qe, &rxf->ucast_active_q); bna_bfi_ucast_req(rxf, mac, BFI_ENET_H2I_MAC_UCAST_ADD_REQ); return 1; } -- cgit v0.10.2 From edffc2178d4ba10a61e150b42ef5e7d797714eb3 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 26 Feb 2016 18:45:34 +0100 Subject: ppp: lock ppp->flags in ppp_read() and ppp_poll() ppp_read() and ppp_poll() can be called concurrently with ppp_ioctl(). In this case, ppp_ioctl() might call ppp_ccp_closed(), which may update ppp->flags while ppp_read() or ppp_poll() is reading it. The update done by ppp_ccp_closed() isn't atomic due to the bit mask operation ('ppp->flags &= ~(SC_CCP_OPEN | SC_CCP_UP)'), so concurrent readers might get transient values. 
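To make the race concrete, a minimal sketch of the two accesses involved (illustrative only, simplified from the functions named above; the actual reader-side change is in the diff below):

    /* writer: ppp_ioctl() -> ppp_ccp_closed() */
    ppp->flags &= ~(SC_CCP_OPEN | SC_CCP_UP);   /* plain load/modify/store */

    /* reader: ppp_read()/ppp_poll(), unlocked before this patch */
    bool ready = (ppp->n_channels == 0 &&
                  (ppp->flags & SC_LOOP_TRAFFIC) == 0);
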
Reading incorrect ppp->flags may disturb the 'ppp->flags & SC_LOOP_TRAFFIC' test in ppp_read() and ppp_poll(), which in turn can lead to improper decision on whether the PPP unit file is ready for reading or not. Since ppp_ccp_closed() is protected by the Rx and Tx locks (with ppp_lock()), taking the Rx lock is enough for ppp_read() and ppp_poll() to guarantee that ppp_ccp_closed() won't update ppp->flags concurrently. The same reasoning applies to ppp->n_channels. The 'n_channels' field can also be written to concurrently by ppp_ioctl() (through ppp_connect_channel() or ppp_disconnect_channel()). These writes aren't atomic (simple increment/decrement), but are protected by both the Rx and Tx locks (like in the ppp->flags case). So holding the Rx lock before reading ppp->n_channels also prevents concurrent writes. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index fc8ad00..e8a5936 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -443,9 +443,14 @@ static ssize_t ppp_read(struct file *file, char __user *buf, * network traffic (demand mode). */ struct ppp *ppp = PF_TO_PPP(pf); + + ppp_recv_lock(ppp); if (ppp->n_channels == 0 && - (ppp->flags & SC_LOOP_TRAFFIC) == 0) + (ppp->flags & SC_LOOP_TRAFFIC) == 0) { + ppp_recv_unlock(ppp); break; + } + ppp_recv_unlock(ppp); } ret = -EAGAIN; if (file->f_flags & O_NONBLOCK) @@ -532,9 +537,12 @@ static unsigned int ppp_poll(struct file *file, poll_table *wait) else if (pf->kind == INTERFACE) { /* see comment in ppp_read */ struct ppp *ppp = PF_TO_PPP(pf); + + ppp_recv_lock(ppp); if (ppp->n_channels == 0 && (ppp->flags & SC_LOOP_TRAFFIC) == 0) mask |= POLLIN | POLLRDNORM; + ppp_recv_unlock(ppp); } return mask; -- cgit v0.10.2 From 9a334d39daad92e2c46539b0bb3bd32b68df2af2 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 3 Feb 2016 20:54:56 +0100 Subject: parisc: Use parentheses around expression in floppy.h David Binderman reported a style issue in the floppy.h header file: arch/parisc/include/asm/floppy.h:221: (style) Boolean result is used in bitwise operation. Clarify expression with parentheses. Reported-by: David Binderman Cc: David Binderman Signed-off-by: Helge Deller diff --git a/arch/parisc/include/asm/floppy.h b/arch/parisc/include/asm/floppy.h index f84ff12..6d8276cd 100644 --- a/arch/parisc/include/asm/floppy.h +++ b/arch/parisc/include/asm/floppy.h @@ -33,7 +33,7 @@ * floppy accesses go through the track buffer. */ #define _CROSS_64KB(a,s,vdma) \ -(!vdma && ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64)) +(!(vdma) && ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64)) #define CROSS_64KB(a,s) _CROSS_64KB(a,s,use_virtual_dma & 1) -- cgit v0.10.2 From f5aba91d7f186cba84af966a741a0346de603cd4 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Fri, 26 Feb 2016 19:18:22 +0100 Subject: phy: micrel: Ensure interrupts are reenabled on resume At least on ksz8081, when getting back from power down, interrupts are disabled. ensure they are reenabled if they were previously enabled. This fixes resuming which is failing on the xplained boards from atmel since 321beec5047a (net: phy: Use interrupts when available in NOLINK state) Fixes: 321beec5047a (net: phy: Use interrupts when available in NOLINK state) Signed-off-by: Alexandre Belloni Signed-off-by: David S. 
Miller diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 03833db..a5e265b 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -635,6 +635,21 @@ static void kszphy_get_stats(struct phy_device *phydev, data[i] = kszphy_get_stat(phydev, i); } +static int kszphy_resume(struct phy_device *phydev) +{ + int value; + + mutex_lock(&phydev->lock); + + value = phy_read(phydev, MII_BMCR); + phy_write(phydev, MII_BMCR, value & ~BMCR_PDOWN); + + kszphy_config_intr(phydev); + mutex_unlock(&phydev->lock); + + return 0; +} + static int kszphy_probe(struct phy_device *phydev) { const struct kszphy_type *type = phydev->drv->driver_data; @@ -844,7 +859,7 @@ static struct phy_driver ksphy_driver[] = { .get_strings = kszphy_get_strings, .get_stats = kszphy_get_stats, .suspend = genphy_suspend, - .resume = genphy_resume, + .resume = kszphy_resume, }, { .phy_id = PHY_ID_KSZ8061, .name = "Micrel KSZ8061", -- cgit v0.10.2 From 99f81afc139c6edd14d77a91ee91685a414a1c66 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Fri, 26 Feb 2016 19:18:23 +0100 Subject: phy: micrel: Disable auto negotiation on startup Disable auto negotiation on init to properly detect an already plugged cable at boot. At boot, when the phy is started, it is in the PHY_UP state. However, if a cable is plugged at boot, because auto negociation is already enabled at the time we get the first interrupt, the phy is already running. But the state machine then switches from PHY_UP to PHY_AN and calls phy_start_aneg(). phy_start_aneg() will not do anything because aneg is already enabled on the phy. It will then wait for a interrupt before going further. This interrupt will never happen unless the cable is unplugged and then replugged. It was working properly before 321beec5047a (net: phy: Use interrupts when available in NOLINK state) because switching to NOLINK meant starting polling the phy, even if IRQ were enabled. Fixes: 321beec5047a (net: phy: Use interrupts when available in NOLINK state) Signed-off-by: Alexandre Belloni Signed-off-by: David S. Miller diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index a5e265b..dc85f70 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -297,6 +297,17 @@ static int kszphy_config_init(struct phy_device *phydev) if (priv->led_mode >= 0) kszphy_setup_led(phydev, type->led_mode_reg, priv->led_mode); + if (phy_interrupt_is_valid(phydev)) { + int ctl = phy_read(phydev, MII_BMCR); + + if (ctl < 0) + return ctl; + + ret = phy_write(phydev, MII_BMCR, ctl & ~BMCR_ANENABLE); + if (ret < 0) + return ret; + } + return 0; } -- cgit v0.10.2 From 98e8b6c9ac9d1b1e9d1122dfa6783d5d566bb8f7 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 19 Jan 2016 16:08:49 +0100 Subject: parisc: Fix ptrace syscall number and return value modification Mike Frysinger reported that his ptrace testcase showed strange behaviour on parisc: It was not possible to avoid a syscall and the return value of a syscall couldn't be changed. To modify a syscall number, we were missing to save the new syscall number to gr20 which is then picked up later in assembly again. The effect that the return value couldn't be changed is a side-effect of another bug in the assembly code. When a process is ptraced, userspace expects each syscall to report entrance and exit of a syscall. If a syscall number was given which doesn't exist, we jumped to the normal syscall exit code instead of informing userspace that the (non-existant) syscall exits. 
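For reference, the userspace side of such a ptrace testcase follows roughly this pattern (editor's sketch; set_syscall_nr() and set_return_value() are hypothetical stand-ins for the arch-specific register accesses -- on parisc the syscall number lives in gr20 and the return value in r28):

    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <errno.h>

    static void set_syscall_nr(pid_t pid, long nr);     /* hypothetical helpers */
    static void set_return_value(pid_t pid, long val);

    static void trace_syscalls(pid_t pid)
    {
        int status;

        waitpid(pid, &status, 0);                /* child stopped after PTRACE_TRACEME */
        while (!WIFEXITED(status)) {
            ptrace(PTRACE_SYSCALL, pid, 0, 0);   /* run to the syscall-entry stop */
            waitpid(pid, &status, 0);
            if (WIFEXITED(status))
                break;
            set_syscall_nr(pid, -1);             /* ask the kernel to skip the syscall */

            ptrace(PTRACE_SYSCALL, pid, 0, 0);   /* expect a matching syscall-exit stop */
            waitpid(pid, &status, 0);
            if (WIFEXITED(status))
                break;
            set_return_value(pid, -EPERM);       /* negative errno, as the ABI expects */
        }
    }

Before this patch the kernel skipped that second stop when an invalid syscall number was stored, so the entry/exit pairing above fell apart.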
This unexpected behaviour confuses userspace and thus the bug was misinterpreted as if we can't change the return value. This patch fixes both problems and was tested on 64bit kernel with 32bit userspace. Signed-off-by: Helge Deller Cc: Mike Frysinger Cc: stable@vger.kernel.org # v4.0+ Tested-by: Mike Frysinger diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c index 9585c81..ce0b2b4 100644 --- a/arch/parisc/kernel/ptrace.c +++ b/arch/parisc/kernel/ptrace.c @@ -269,14 +269,19 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, long do_syscall_trace_enter(struct pt_regs *regs) { - long ret = 0; - /* Do the secure computing check first. */ secure_computing_strict(regs->gr[20]); if (test_thread_flag(TIF_SYSCALL_TRACE) && - tracehook_report_syscall_entry(regs)) - ret = -1L; + tracehook_report_syscall_entry(regs)) { + /* + * Tracing decided this syscall should not happen or the + * debugger stored an invalid system call number. Skip + * the system call and the system call restart handling. + */ + regs->gr[20] = -1UL; + goto out; + } #ifdef CONFIG_64BIT if (!is_compat_task()) @@ -290,7 +295,8 @@ long do_syscall_trace_enter(struct pt_regs *regs) regs->gr[24] & 0xffffffff, regs->gr[23] & 0xffffffff); - return ret ? : regs->gr[20]; +out: + return regs->gr[20]; } void do_syscall_trace_exit(struct pt_regs *regs) diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S index 3fbd725..fbafa0d 100644 --- a/arch/parisc/kernel/syscall.S +++ b/arch/parisc/kernel/syscall.S @@ -343,7 +343,7 @@ tracesys_next: #endif comiclr,>>= __NR_Linux_syscalls, %r20, %r0 - b,n .Lsyscall_nosys + b,n .Ltracesys_nosys LDREGX %r20(%r19), %r19 @@ -359,6 +359,9 @@ tracesys_next: be 0(%sr7,%r19) ldo R%tracesys_exit(%r2),%r2 +.Ltracesys_nosys: + ldo -ENOSYS(%r0),%r28 /* set errno */ + /* Do *not* call this function on the gateway page, because it makes a direct call to syscall_trace. */ -- cgit v0.10.2 From b4f09ae6db2a98ff7817c3371aaab39ca22daf80 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 1 Mar 2016 23:18:43 +0100 Subject: parisc: Wire up copy_file_range syscall Signed-off-by: Helge Deller diff --git a/arch/parisc/include/uapi/asm/unistd.h b/arch/parisc/include/uapi/asm/unistd.h index 35bdccb..b75039f 100644 --- a/arch/parisc/include/uapi/asm/unistd.h +++ b/arch/parisc/include/uapi/asm/unistd.h @@ -361,8 +361,9 @@ #define __NR_membarrier (__NR_Linux + 343) #define __NR_userfaultfd (__NR_Linux + 344) #define __NR_mlock2 (__NR_Linux + 345) +#define __NR_copy_file_range (__NR_Linux + 346) -#define __NR_Linux_syscalls (__NR_mlock2 + 1) +#define __NR_Linux_syscalls (__NR_copy_file_range + 1) #define __IGNORE_select /* newselect */ diff --git a/arch/parisc/kernel/syscall_table.S b/arch/parisc/kernel/syscall_table.S index d4ffcfb..585d50f 100644 --- a/arch/parisc/kernel/syscall_table.S +++ b/arch/parisc/kernel/syscall_table.S @@ -441,6 +441,7 @@ ENTRY_SAME(membarrier) ENTRY_SAME(userfaultfd) ENTRY_SAME(mlock2) /* 345 */ + ENTRY_SAME(copy_file_range) .ifne (. - 90b) - (__NR_Linux_syscalls * (91b - 90b)) -- cgit v0.10.2 From 40b4f0fd74e46c017814618d67ec9127ff20f157 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 28 Feb 2016 10:03:51 +0800 Subject: sctp: lack the check for ports in sctp_v6_cmp_addr As the member .cmp_addr of sctp_af_inet6, sctp_v6_cmp_addr should also check the port of addresses, just like sctp_v4_cmp_addr, cause it's invoked by sctp_cmp_addr_exact(). 
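To make the effect concrete with made-up values (editor's illustration, not taken from the patch):

    union sctp_addr a = { .v6 = { .sin6_family = AF_INET6,
                                  .sin6_port   = htons(1000) } };
    union sctp_addr b = a;

    b.v6.sin6_port = htons(2000);
    /* before the fix: sctp_v6_cmp_addr(&a, &b) != 0, i.e. treated as equal */
    /* after the fix:  sctp_v6_cmp_addr(&a, &b) == 0, i.e. distinct         */
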
Now sctp_v6_cmp_addr just check the port when two addresses have different family, and lack the port check for two ipv6 addresses. that will make sctp_hash_cmp() cannot work well. so fix it by adding ports comparison in sctp_v6_cmp_addr(). Signed-off-by: Xin Long Signed-off-by: David S. Miller diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index ec52912..ce46f1c 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -526,6 +526,8 @@ static int sctp_v6_cmp_addr(const union sctp_addr *addr1, } return 0; } + if (addr1->v6.sin6_port != addr2->v6.sin6_port) + return 0; if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr)) return 0; /* If this is a linklocal address, compare the scope_id. */ -- cgit v0.10.2 From 3d73e8fac8f84942f15307d6d9cb1dba843d3fb2 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 28 Feb 2016 10:33:11 +0800 Subject: sctp: sctp_remaddr_seq_show use the wrong variable to dump transport info Now in sctp_remaddr_seq_show(), we use variable *tsp to get the param *v. but *tsp is also used to traversal transport_addr_list, which will cover the previous value, and make sctp_transport_put work on the wrong transport. So fix it by adding a new variable to get the param *v. Fixes: fba4c330c5b9 ("sctp: hold transport before we access t->asoc in sctp proc") Signed-off-by: Xin Long Signed-off-by: David S. Miller diff --git a/net/sctp/proc.c b/net/sctp/proc.c index ded7d93..963dffc 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -482,7 +482,7 @@ static void sctp_remaddr_seq_stop(struct seq_file *seq, void *v) static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) { struct sctp_association *assoc; - struct sctp_transport *tsp; + struct sctp_transport *transport, *tsp; if (v == SEQ_START_TOKEN) { seq_printf(seq, "ADDR ASSOC_ID HB_ACT RTO MAX_PATH_RTX " @@ -490,10 +490,10 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) return 0; } - tsp = (struct sctp_transport *)v; - if (!sctp_transport_hold(tsp)) + transport = (struct sctp_transport *)v; + if (!sctp_transport_hold(transport)) return 0; - assoc = tsp->asoc; + assoc = transport->asoc; list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list, transports) { @@ -546,7 +546,7 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } - sctp_transport_put(tsp); + sctp_transport_put(transport); return 0; } -- cgit v0.10.2 From 735c1e2511a5c4e6c451c8b3d29916c4b0fa39ae Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sun, 28 Feb 2016 22:50:59 +0300 Subject: of_mdio: fix kernel-doc for of_phy_connect() The 'flags' parameter of the of_phy_connect() function wasn't described in the kernel-doc comment... Signed-off-by: Sergei Shtylyov Signed-off-by: David S. 
Miller diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 39c4be4..365dc7e 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -305,6 +305,7 @@ EXPORT_SYMBOL(of_phy_find_device); * @dev: pointer to net_device claiming the phy * @phy_np: Pointer to device tree node for the PHY * @hndlr: Link state callback for the network device + * @flags: flags to pass to the PHY * @iface: PHY data interface type * * If successful, returns a pointer to the phy_device with the embedded -- cgit v0.10.2 From 949024d6702d1da8a266f42226d8402f0df078dc Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sun, 14 Feb 2016 01:26:34 +0300 Subject: ARM: dts: porter: remove enable prop from HS-USB device node In the final versions of the Porter board (called "PORTER_C") Renesas decided to get rid of the Maxim Integrated MAX3355 OTG chip and didn't add any other provision to differ the host/gadget mode, so we'll have to remove no longer valid "renesas,enable-gpio" property from the HS-USB device node. Hopefully, the earlier revisions of the board were never seen in the wild... Fixes: c794f6a09a25 ("ARM: shmobile: porter: add HS-USB DT support") Reported-by: Yoshihiro Shimoda Signed-off-by: Sergei Shtylyov Signed-off-by: Simon Horman diff --git a/arch/arm/boot/dts/r8a7791-porter.dts b/arch/arm/boot/dts/r8a7791-porter.dts index 6713b1e..01d239c 100644 --- a/arch/arm/boot/dts/r8a7791-porter.dts +++ b/arch/arm/boot/dts/r8a7791-porter.dts @@ -283,7 +283,6 @@ pinctrl-names = "default"; status = "okay"; - renesas,enable-gpio = <&gpio5 31 GPIO_ACTIVE_HIGH>; }; &usbphy { -- cgit v0.10.2 From 2d02b8bdba322b527c5f5168ce1ca10c2d982a78 Mon Sep 17 00:00:00 2001 From: Timothy Pearson Date: Fri, 26 Feb 2016 15:29:32 -0600 Subject: drm/ast: Fix incorrect register check for DRAM width During DRAM initialization on certain ASpeed devices, an incorrect bit (bit 10) was checked in the "SDRAM Bus Width Status" register to determine DRAM width. Query bit 6 instead in accordance with the Aspeed AST2050 datasheet v1.05. Signed-off-by: Timothy Pearson Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie diff --git a/drivers/gpu/drm/ast/ast_main.c b/drivers/gpu/drm/ast/ast_main.c index 9759009..b1480ac 100644 --- a/drivers/gpu/drm/ast/ast_main.c +++ b/drivers/gpu/drm/ast/ast_main.c @@ -227,7 +227,7 @@ static int ast_get_dram_info(struct drm_device *dev) } while (ast_read32(ast, 0x10000) != 0x01); data = ast_read32(ast, 0x10004); - if (data & 0x400) + if (data & 0x40) ast->dram_bus_width = 16; else ast->dram_bus_width = 32; -- cgit v0.10.2 From 04fdbc825ffc02fb098964b92de802fff44e73fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Tue, 1 Mar 2016 14:36:32 +0100 Subject: USB: qcserial: add Sierra Wireless EM74xx device ID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MC74xx and EM74xx modules use different IDs by default, according to the Lenovo EM7455 driver for Windows. 
Cc: Signed-off-by: Bjørn Mork Signed-off-by: Johan Hovold diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c index 9c09d5f..1bc6089 100644 --- a/drivers/usb/serial/qcserial.c +++ b/drivers/usb/serial/qcserial.c @@ -157,8 +157,10 @@ static const struct usb_device_id id_table[] = { {DEVICE_SWI(0x1199, 0x9056)}, /* Sierra Wireless Modem */ {DEVICE_SWI(0x1199, 0x9060)}, /* Sierra Wireless Modem */ {DEVICE_SWI(0x1199, 0x9061)}, /* Sierra Wireless Modem */ - {DEVICE_SWI(0x1199, 0x9070)}, /* Sierra Wireless MC74xx/EM74xx */ - {DEVICE_SWI(0x1199, 0x9071)}, /* Sierra Wireless MC74xx/EM74xx */ + {DEVICE_SWI(0x1199, 0x9070)}, /* Sierra Wireless MC74xx */ + {DEVICE_SWI(0x1199, 0x9071)}, /* Sierra Wireless MC74xx */ + {DEVICE_SWI(0x1199, 0x9078)}, /* Sierra Wireless EM74xx */ + {DEVICE_SWI(0x1199, 0x9079)}, /* Sierra Wireless EM74xx */ {DEVICE_SWI(0x413c, 0x81a2)}, /* Dell Wireless 5806 Gobi(TM) 4G LTE Mobile Broadband Card */ {DEVICE_SWI(0x413c, 0x81a3)}, /* Dell Wireless 5570 HSPA+ (42Mbps) Mobile Broadband Card */ {DEVICE_SWI(0x413c, 0x81a4)}, /* Dell Wireless 5570e HSPA+ (42Mbps) Mobile Broadband Card */ -- cgit v0.10.2 From 0178fd7dcc4451fcb90bec5e91226586962478d2 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 28 Feb 2016 17:35:59 +0200 Subject: mips/kvm: fix ioctl error handling Returning directly whatever copy_to_user(...) or copy_from_user(...) returns may not do the right thing if there's a pagefault: copy_to_user/copy_from_user return the number of bytes not copied in this case, but ioctls need to return -EFAULT instead. Fix up kvm on mips to do return copy_to_user(...)) ? -EFAULT : 0; and return copy_from_user(...)) ? -EFAULT : 0; everywhere. Cc: stable@vger.kernel.org Signed-off-by: Michael S. Tsirkin Signed-off-by: Paolo Bonzini diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 8bc3977..3110447 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -702,7 +702,7 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu, } else if ((reg->id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U128) { void __user *uaddr = (void __user *)(long)reg->addr; - return copy_to_user(uaddr, vs, 16); + return copy_to_user(uaddr, vs, 16) ? -EFAULT : 0; } else { return -EINVAL; } @@ -732,7 +732,7 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu, } else if ((reg->id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U128) { void __user *uaddr = (void __user *)(long)reg->addr; - return copy_from_user(vs, uaddr, 16); + return copy_from_user(vs, uaddr, 16) ? -EFAULT : 0; } else { return -EINVAL; } -- cgit v0.10.2 From 2680d6da455b636dd006636780c0f235c6561d70 Mon Sep 17 00:00:00 2001 From: Owen Hofmann Date: Tue, 1 Mar 2016 13:36:13 -0800 Subject: kvm: x86: Update tsc multiplier on change. vmx.c writes the TSC_MULTIPLIER field in vmx_vcpu_load, but only when a vcpu has migrated physical cpus. Record the last value written and update in vmx_vcpu_load on any change, otherwise a cpu migration must occur for TSC frequency scaling to take effect. 
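The essence of the fix, condensed from the diff below (sketch only; the real code additionally gates on kvm_has_tsc_control):

    /* in vmx_vcpu_load(), run on every vcpu load rather than only after
     * a migration to another physical cpu */
    if (vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) {
        vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio;
        vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio);  /* VMWRITE only on change */
    }
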
Cc: stable@vger.kernel.org Fixes: ff2c3a1803775cc72dc6f624b59554956396b0ee Signed-off-by: Owen Hofmann Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e2951b6..0ff4537 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -596,6 +596,8 @@ struct vcpu_vmx { /* Support for PML */ #define PML_ENTITY_NUM 512 struct page *pml_pg; + + u64 current_tsc_ratio; }; enum segment_cache_field { @@ -2127,14 +2129,16 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ - /* Setup TSC multiplier */ - if (cpu_has_vmx_tsc_scaling()) - vmcs_write64(TSC_MULTIPLIER, - vcpu->arch.tsc_scaling_ratio); - vmx->loaded_vmcs->cpu = cpu; } + /* Setup TSC multiplier */ + if (kvm_has_tsc_control && + vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) { + vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio; + vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); + } + vmx_vcpu_pi_load(vcpu, cpu); } -- cgit v0.10.2 From bd90123c4973821775d7ffae9891ad36017702a9 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Mon, 29 Feb 2016 22:49:02 +0200 Subject: drm/i915/skl: Fix power domain suspend sequence During system suspend we need to first disable power wells then unitialize the display core. In case power well support is disabled we did this in the wrong order, so fix this up. Fixes: d314cd43 ("drm/i915: fix handling of the disable_power_well module option") CC: stable@vger.kernel.org CC: Patrik Jakobsson Signed-off-by: Imre Deak Reviewed-by: Patrik Jakobsson Link: http://patchwork.freedesktop.org/patch/msgid/1456778945-5411-1-git-send-email-imre.deak@intel.com (cherry picked from commit 2622d79bd9d18fd04b650234e6a218c5f95cf308) Signed-off-by: Jani Nikula diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c index 678ed34..ef8d24d 100644 --- a/drivers/gpu/drm/i915/intel_runtime_pm.c +++ b/drivers/gpu/drm/i915/intel_runtime_pm.c @@ -2303,15 +2303,15 @@ void intel_power_domains_init_hw(struct drm_i915_private *dev_priv, bool resume) */ void intel_power_domains_suspend(struct drm_i915_private *dev_priv) { - if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv)) - skl_display_core_uninit(dev_priv); - /* * Even if power well support was disabled we still want to disable * power wells while we are system suspended. */ if (!i915.disable_power_well) intel_display_power_put(dev_priv, POWER_DOMAIN_INIT); + + if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv)) + skl_display_core_uninit(dev_priv); } /** -- cgit v0.10.2 From eda908967feecf7575a8ab74f1695b0445cf324e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 25 Feb 2016 21:10:28 +0000 Subject: drm/i915: Balance assert_rpm_wakelock_held() for !IS_ENABLED(CONFIG_PM) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 09731280028ce03e6a27e1998137f1775a2839f3 Author: Imre Deak Date: Wed Feb 17 14:17:42 2016 +0200 drm/i915: Add helper to get a display power ref if it was already enabled left the rpm wakelock assertions unbalanced if CONFIG_PM was disabled as intel_runtime_pm_get_if_in_use() would return true without incrementing the local bookkeeping required for the assertions. 
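Condensed from the diff below, the invariant the fix restores (sketch only): every path that lets intel_runtime_pm_get_if_in_use() return true also bumps the local wakeref bookkeeping, with or without CONFIG_PM:

    if (IS_ENABLED(CONFIG_PM)) {
        int ret = pm_runtime_get_if_in_use(device);

        WARN_ON_ONCE(ret < 0);
        if (ret <= 0)
            return false;
    }
    atomic_inc(&dev_priv->pm.wakeref_count);    /* now paired with every "true" return */
    assert_rpm_wakelock_held(dev_priv);
    return true;
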
Fixes: 09731280028c ("drm/i915: Add helper to get a display power ref if it was already enabled") Signed-off-by: Chris Wilson CC: Mika Kuoppala CC: Joonas Lahtinen CC: Ville Syrjälä Cc: Imre Deak Link: http://patchwork.freedesktop.org/patch/msgid/1456434628-22574-1-git-send-email-chris@chris-wilson.co.uk Reviewed-by: Imre Deak (cherry picked from commit 135dc79efbc119ea5fb34475996983159e6ca31c) Signed-off-by: Jani Nikula diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c index ef8d24d..4f43d9b 100644 --- a/drivers/gpu/drm/i915/intel_runtime_pm.c +++ b/drivers/gpu/drm/i915/intel_runtime_pm.c @@ -2349,22 +2349,20 @@ bool intel_runtime_pm_get_if_in_use(struct drm_i915_private *dev_priv) { struct drm_device *dev = dev_priv->dev; struct device *device = &dev->pdev->dev; - int ret; - if (!IS_ENABLED(CONFIG_PM)) - return true; + if (IS_ENABLED(CONFIG_PM)) { + int ret = pm_runtime_get_if_in_use(device); - ret = pm_runtime_get_if_in_use(device); - - /* - * In cases runtime PM is disabled by the RPM core and we get an - * -EINVAL return value we are not supposed to call this function, - * since the power state is undefined. This applies atm to the - * late/early system suspend/resume handlers. - */ - WARN_ON_ONCE(ret < 0); - if (ret <= 0) - return false; + /* + * In cases runtime PM is disabled by the RPM core and we get + * an -EINVAL return value we are not supposed to call this + * function, since the power state is undefined. This applies + * atm to the late/early system suspend/resume handlers. + */ + WARN_ON_ONCE(ret < 0); + if (ret <= 0) + return false; + } atomic_inc(&dev_priv->pm.wakeref_count); assert_rpm_wakelock_held(dev_priv); -- cgit v0.10.2 From ce0c12b633846a47e103842149a5bac2e5d261ec Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 1 Mar 2016 12:58:06 +0100 Subject: s390/dasd: fix diag 0x250 inline assembly git commit 1ec2772e0c3c ("s390/diag: add a statistic for diagnose calls") added function calls to gather diagnose statistics. In case of the dasd diag driver the function call was added between a register asm statement which initialized register r2 and the inline assembly itself. The function call clobbers the contents of register r2 and therefore the diag 0x250 call behaves in a more or less random way. Fix this by extracting the function call into a separate function like we do everywhere else. Fixes: 1ec2772e0c3c ("s390/diag: add a statistic for diagnose calls") Cc: # 4.4+ Reported-and-tested-by: Stefan Haberland Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c index cb61f30..277b5c8 100644 --- a/drivers/s390/block/dasd_diag.c +++ b/drivers/s390/block/dasd_diag.c @@ -67,7 +67,7 @@ static const u8 DASD_DIAG_CMS1[] = { 0xc3, 0xd4, 0xe2, 0xf1 };/* EBCDIC CMS1 */ * and function code cmd. * In case of an exception return 3. Otherwise return result of bitwise OR of * resulting condition code and DIAG return code. 
*/ -static inline int dia250(void *iob, int cmd) +static inline int __dia250(void *iob, int cmd) { register unsigned long reg2 asm ("2") = (unsigned long) iob; typedef union { @@ -77,7 +77,6 @@ static inline int dia250(void *iob, int cmd) int rc; rc = 3; - diag_stat_inc(DIAG_STAT_X250); asm volatile( " diag 2,%2,0x250\n" "0: ipm %0\n" @@ -91,6 +90,12 @@ static inline int dia250(void *iob, int cmd) return rc; } +static inline int dia250(void *iob, int cmd) +{ + diag_stat_inc(DIAG_STAT_X250); + return __dia250(iob, cmd); +} + /* Initialize block I/O to DIAG device using the specified blocksize and * block offset. On success, return zero and set end_block to contain the * number of blocks on the device minus the specified offset. Return non-zero -- cgit v0.10.2 From 2af8c4dc2e9c7038aefe9a822aa1df62e2b106b7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Feb 2016 10:50:34 +0100 Subject: mac80211_hwsim: treat as part of mac80211 for MAINTAINERS Since I maintain this driver as part of mac80211, add it to the file list for mac80211; this helps submitters send it to me instead of Kalle and also makes the build robot apply the patches for it on the right tree for build attempts. Acked-by: Kalle Valo Signed-off-by: Johannes Berg diff --git a/MAINTAINERS b/MAINTAINERS index ded9640..25bdfe6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6664,6 +6664,7 @@ S: Maintained F: Documentation/networking/mac80211-injection.txt F: include/net/mac80211.h F: net/mac80211/ +F: drivers/net/wireless/mac80211_hwsim.[ch] MACVLAN DRIVER M: Patrick McHardy -- cgit v0.10.2 From 4e94ebdd06d5dc72b7a40fc12fc496d601fb7bbc Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Mon, 1 Feb 2016 19:36:31 +0100 Subject: virtio-pci: read the right virtio_pci_notify_cap field Looks like a copy-paste bug. The value is used as an optimization and a wrong value probably isn't causing any serious damage. Found when porting this code to Windows. Signed-off-by: Ladi Prosek Signed-off-by: Michael S. Tsirkin diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index c0c11fa..7760fc1 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -679,7 +679,7 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) pci_read_config_dword(pci_dev, notify + offsetof(struct virtio_pci_notify_cap, - cap.length), + cap.offset), ¬ify_offset); /* We don't know how many VQs we'll map, ahead of the time. -- cgit v0.10.2 From e1f33be9186363da7955bcb5f0b03e6685544c50 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Tue, 16 Feb 2016 15:54:28 +0100 Subject: vhost: fix error path in vhost_init_used() We don't want side effects. If something fails, we rollback vq->is_le to its previous value. Signed-off-by: Greg Kurz Signed-off-by: Michael S. 
Tsirkin diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index ad2146a..236553e 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1156,6 +1156,8 @@ int vhost_init_used(struct vhost_virtqueue *vq) { __virtio16 last_used_idx; int r; + bool is_le = vq->is_le; + if (!vq->private_data) { vq->is_le = virtio_legacy_is_little_endian(); return 0; @@ -1165,15 +1167,20 @@ int vhost_init_used(struct vhost_virtqueue *vq) r = vhost_update_used_flags(vq); if (r) - return r; + goto err; vq->signalled_used_valid = false; - if (!access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx)) - return -EFAULT; + if (!access_ok(VERIFY_READ, &vq->used->idx, sizeof vq->used->idx)) { + r = -EFAULT; + goto err; + } r = __get_user(last_used_idx, &vq->used->idx); if (r) - return r; + goto err; vq->last_used_idx = vhost16_to_cpu(vq, last_used_idx); return 0; +err: + vq->is_le = is_le; + return r; } EXPORT_SYMBOL_GPL(vhost_init_used); -- cgit v0.10.2 From d027d45d8a17a4145eab2d841140e9acbb7feb59 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 7 Jun 2015 15:54:30 +0200 Subject: nohz: New tick dependency mask The tick dependency is evaluated on every IRQ and context switch. This consists is a batch of checks which determine whether it is safe to stop the tick or not. These checks are often split in many details: posix cpu timers, scheduler, sched clock, perf events.... each of which are made of smaller details: posix cpu timer involves checking process wide timers then thread wide timers. Perf involves checking freq events then more per cpu details. Checking these informations asynchronously every time we update the full dynticks state bring avoidable overhead and a messy layout. Let's introduce instead tick dependency masks: one for system wide dependency (unstable sched clock, freq based perf events), one for CPU wide dependency (sched, throttling perf events), and task/signal level dependencies (posix cpu timers). The subsystems are responsible for setting and clearing their dependency through a set of APIs that will take care of concurrent dependency mask modifications and kick targets to restart the relevant CPU tick whenever needed. This new dependency engine stays beside the old one until all subsystems having a tick dependency are converted to it. Suggested-by: Thomas Gleixner Suggested-by: Peter Zijlstra Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker diff --git a/include/linux/sched.h b/include/linux/sched.h index a10494a..307e483 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -719,6 +719,10 @@ struct signal_struct { /* Earliest-expiration cache. 
*/ struct task_cputime cputime_expires; +#ifdef CONFIG_NO_HZ_FULL + unsigned long tick_dep_mask; +#endif + struct list_head cpu_timers[3]; struct pid *tty_old_pgrp; @@ -1542,6 +1546,10 @@ struct task_struct { VTIME_SYS, } vtime_snap_whence; #endif + +#ifdef CONFIG_NO_HZ_FULL + unsigned long tick_dep_mask; +#endif unsigned long nvcsw, nivcsw; /* context switch counts */ u64 start_time; /* monotonic time in nsec */ u64 real_start_time; /* boot based time in nsec */ diff --git a/include/linux/tick.h b/include/linux/tick.h index 97fd4e5..d60e5df 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -97,6 +97,18 @@ static inline void tick_broadcast_exit(void) tick_broadcast_oneshot_control(TICK_BROADCAST_EXIT); } +enum tick_dep_bits { + TICK_DEP_BIT_POSIX_TIMER = 0, + TICK_DEP_BIT_PERF_EVENTS = 1, + TICK_DEP_BIT_SCHED = 2, + TICK_DEP_BIT_CLOCK_UNSTABLE = 3 +}; + +#define TICK_DEP_MASK_POSIX_TIMER (1 << TICK_DEP_BIT_POSIX_TIMER) +#define TICK_DEP_MASK_PERF_EVENTS (1 << TICK_DEP_BIT_PERF_EVENTS) +#define TICK_DEP_MASK_SCHED (1 << TICK_DEP_BIT_SCHED) +#define TICK_DEP_MASK_CLOCK_UNSTABLE (1 << TICK_DEP_BIT_CLOCK_UNSTABLE) + #ifdef CONFIG_NO_HZ_COMMON extern int tick_nohz_enabled; extern int tick_nohz_tick_stopped(void); @@ -154,6 +166,72 @@ static inline int housekeeping_any_cpu(void) return cpumask_any_and(housekeeping_mask, cpu_online_mask); } +extern void tick_nohz_dep_set(enum tick_dep_bits bit); +extern void tick_nohz_dep_clear(enum tick_dep_bits bit); +extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit); +extern void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit); +extern void tick_nohz_dep_set_task(struct task_struct *tsk, + enum tick_dep_bits bit); +extern void tick_nohz_dep_clear_task(struct task_struct *tsk, + enum tick_dep_bits bit); +extern void tick_nohz_dep_set_signal(struct signal_struct *signal, + enum tick_dep_bits bit); +extern void tick_nohz_dep_clear_signal(struct signal_struct *signal, + enum tick_dep_bits bit); + +/* + * The below are tick_nohz_[set,clear]_dep() wrappers that optimize off-cases + * on top of static keys. 
+ */ +static inline void tick_dep_set(enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_set(bit); +} + +static inline void tick_dep_clear(enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_clear(bit); +} + +static inline void tick_dep_set_cpu(int cpu, enum tick_dep_bits bit) +{ + if (tick_nohz_full_cpu(cpu)) + tick_nohz_dep_set_cpu(cpu, bit); +} + +static inline void tick_dep_clear_cpu(int cpu, enum tick_dep_bits bit) +{ + if (tick_nohz_full_cpu(cpu)) + tick_nohz_dep_clear_cpu(cpu, bit); +} + +static inline void tick_dep_set_task(struct task_struct *tsk, + enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_set_task(tsk, bit); +} +static inline void tick_dep_clear_task(struct task_struct *tsk, + enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_clear_task(tsk, bit); +} +static inline void tick_dep_set_signal(struct signal_struct *signal, + enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_set_signal(signal, bit); +} +static inline void tick_dep_clear_signal(struct signal_struct *signal, + enum tick_dep_bits bit) +{ + if (tick_nohz_full_enabled()) + tick_nohz_dep_clear_signal(signal, bit); +} + extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_cpu(int cpu); extern void tick_nohz_full_kick_all(void); @@ -166,6 +244,20 @@ static inline int housekeeping_any_cpu(void) static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { } + +static inline void tick_dep_set(enum tick_dep_bits bit) { } +static inline void tick_dep_clear(enum tick_dep_bits bit) { } +static inline void tick_dep_set_cpu(int cpu, enum tick_dep_bits bit) { } +static inline void tick_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { } +static inline void tick_dep_set_task(struct task_struct *tsk, + enum tick_dep_bits bit) { } +static inline void tick_dep_clear_task(struct task_struct *tsk, + enum tick_dep_bits bit) { } +static inline void tick_dep_set_signal(struct signal_struct *signal, + enum tick_dep_bits bit) { } +static inline void tick_dep_clear_signal(struct signal_struct *signal, + enum tick_dep_bits bit) { } + static inline void tick_nohz_full_kick_cpu(int cpu) { } static inline void tick_nohz_full_kick(void) { } static inline void tick_nohz_full_kick_all(void) { } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 548a4e2..74ab7db 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -158,11 +158,53 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) cpumask_var_t tick_nohz_full_mask; cpumask_var_t housekeeping_mask; bool tick_nohz_full_running; +static unsigned long tick_dep_mask; -static bool can_stop_full_tick(void) +static void trace_tick_dependency(unsigned long dep) +{ + if (dep & TICK_DEP_MASK_POSIX_TIMER) { + trace_tick_stop(0, "posix timers running\n"); + return; + } + + if (dep & TICK_DEP_MASK_PERF_EVENTS) { + trace_tick_stop(0, "perf events running\n"); + return; + } + + if (dep & TICK_DEP_MASK_SCHED) { + trace_tick_stop(0, "more than 1 task in runqueue\n"); + return; + } + + if (dep & TICK_DEP_MASK_CLOCK_UNSTABLE) + trace_tick_stop(0, "unstable sched clock\n"); +} + +static bool can_stop_full_tick(struct tick_sched *ts) { WARN_ON_ONCE(!irqs_disabled()); + if (tick_dep_mask) { + trace_tick_dependency(tick_dep_mask); + return false; + } + + if (ts->tick_dep_mask) { + 
trace_tick_dependency(ts->tick_dep_mask); + return false; + } + + if (current->tick_dep_mask) { + trace_tick_dependency(current->tick_dep_mask); + return false; + } + + if (current->signal->tick_dep_mask) { + trace_tick_dependency(current->signal->tick_dep_mask); + return false; + } + if (!sched_can_stop_tick()) { trace_tick_stop(0, "more than 1 task in runqueue\n"); return false; @@ -178,9 +220,10 @@ static bool can_stop_full_tick(void) return false; } - /* sched_clock_tick() needs us? */ #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK /* + * sched_clock_tick() needs us? + * * TODO: kick full dynticks CPUs when * sched_clock_stable is set. */ @@ -199,13 +242,13 @@ static bool can_stop_full_tick(void) return true; } -static void nohz_full_kick_work_func(struct irq_work *work) +static void nohz_full_kick_func(struct irq_work *work) { /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { - .func = nohz_full_kick_work_func, + .func = nohz_full_kick_func, }; /* @@ -251,6 +294,95 @@ void tick_nohz_full_kick_all(void) preempt_enable(); } +static void tick_nohz_dep_set_all(unsigned long *dep, + enum tick_dep_bits bit) +{ + unsigned long prev; + + prev = fetch_or(dep, BIT_MASK(bit)); + if (!prev) + tick_nohz_full_kick_all(); +} + +/* + * Set a global tick dependency. Used by perf events that rely on freq and + * by unstable clock. + */ +void tick_nohz_dep_set(enum tick_dep_bits bit) +{ + tick_nohz_dep_set_all(&tick_dep_mask, bit); +} + +void tick_nohz_dep_clear(enum tick_dep_bits bit) +{ + clear_bit(bit, &tick_dep_mask); +} + +/* + * Set per-CPU tick dependency. Used by scheduler and perf events in order to + * manage events throttling. + */ +void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) +{ + unsigned long prev; + struct tick_sched *ts; + + ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + prev = fetch_or(&ts->tick_dep_mask, BIT_MASK(bit)); + if (!prev) { + preempt_disable(); + /* Perf needs local kick that is NMI safe */ + if (cpu == smp_processor_id()) { + tick_nohz_full_kick(); + } else { + /* Remote irq work not NMI-safe */ + if (!WARN_ON_ONCE(in_nmi())) + tick_nohz_full_kick_cpu(cpu); + } + preempt_enable(); + } +} + +void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) +{ + struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + clear_bit(bit, &ts->tick_dep_mask); +} + +/* + * Set a per-task tick dependency. Posix CPU timers need this in order to elapse + * per task timers. + */ +void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) +{ + /* + * We could optimize this with just kicking the target running the task + * if that noise matters for nohz full users. + */ + tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit); +} + +void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) +{ + clear_bit(bit, &tsk->tick_dep_mask); +} + +/* + * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse + * per process timers. + */ +void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit) +{ + tick_nohz_dep_set_all(&sig->tick_dep_mask, bit); +} + +void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) +{ + clear_bit(bit, &sig->tick_dep_mask); +} + /* * Re-evaluate the need for the tick as we switch the current task. 
* It might need the tick due to per task/process properties: @@ -259,15 +391,19 @@ void tick_nohz_full_kick_all(void) void __tick_nohz_task_switch(void) { unsigned long flags; + struct tick_sched *ts; local_irq_save(flags); if (!tick_nohz_full_cpu(smp_processor_id())) goto out; - if (tick_nohz_tick_stopped() && !can_stop_full_tick()) - tick_nohz_full_kick(); + ts = this_cpu_ptr(&tick_cpu_sched); + if (ts->tick_stopped) { + if (current->tick_dep_mask || current->signal->tick_dep_mask) + tick_nohz_full_kick(); + } out: local_irq_restore(flags); } @@ -736,7 +872,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) return; - if (can_stop_full_tick()) + if (can_stop_full_tick(ts)) tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); else if (ts->tick_stopped) tick_nohz_restart_sched_tick(ts, ktime_get(), 1); diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index a4a8d4e..eb4e325 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -60,6 +60,7 @@ struct tick_sched { u64 next_timer; ktime_t idle_expires; int do_timer_last; + unsigned long tick_dep_mask; }; extern struct tick_sched *tick_get_tick_sched(int cpu); -- cgit v0.10.2 From e6e6cc22e067a6f44449aa8fd0328404079c3ba5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 11 Dec 2015 03:27:25 +0100 Subject: nohz: Use enum code for tick stop failure tracing message It makes nohz tracing more lightweight, standard and easier to parse. Examples: user_loop-2904 [007] d..1 517.701126: tick_stop: success=1 dependency=NONE user_loop-2904 [007] dn.1 518.021181: tick_stop: success=0 dependency=SCHED posix_timers-6142 [007] d..1 1739.027400: tick_stop: success=0 dependency=POSIX_TIMER user_loop-5463 [007] dN.1 1185.931939: tick_stop: success=0 dependency=PERF_EVENTS Suggested-by: Peter Zijlstra Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker diff --git a/include/linux/tick.h b/include/linux/tick.h index d60e5df..96d276a 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -104,6 +104,7 @@ enum tick_dep_bits { TICK_DEP_BIT_CLOCK_UNSTABLE = 3 }; +#define TICK_DEP_MASK_NONE 0 #define TICK_DEP_MASK_POSIX_TIMER (1 << TICK_DEP_BIT_POSIX_TIMER) #define TICK_DEP_MASK_PERF_EVENTS (1 << TICK_DEP_BIT_PERF_EVENTS) #define TICK_DEP_MASK_SCHED (1 << TICK_DEP_BIT_SCHED) diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 073b9ac..5144013 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -328,23 +328,49 @@ TRACE_EVENT(itimer_expire, ); #ifdef CONFIG_NO_HZ_COMMON + +#define TICK_DEP_NAMES \ + tick_dep_name(NONE) \ + tick_dep_name(POSIX_TIMER) \ + tick_dep_name(PERF_EVENTS) \ + tick_dep_name(SCHED) \ + tick_dep_name_end(CLOCK_UNSTABLE) + +#undef tick_dep_name +#undef tick_dep_name_end + +#define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); +#define tick_dep_name_end(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); + +TICK_DEP_NAMES + +#undef tick_dep_name +#undef tick_dep_name_end + +#define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep }, +#define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep } + +#define show_tick_dep_name(val) \ + __print_symbolic(val, TICK_DEP_NAMES) + TRACE_EVENT(tick_stop, - TP_PROTO(int success, char *error_msg), + TP_PROTO(int success, int dependency), - TP_ARGS(success, 
error_msg), + TP_ARGS(success, dependency), TP_STRUCT__entry( __field( int , success ) - __string( msg, error_msg ) + __field( int , dependency ) ), TP_fast_assign( __entry->success = success; - __assign_str(msg, error_msg); + __entry->dependency = dependency; ), - TP_printk("success=%s msg=%s", __entry->success ? "yes" : "no", __get_str(msg)) + TP_printk("success=%d dependency=%s", __entry->success, \ + show_tick_dep_name(__entry->dependency)) ); #endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 74ab7db..47e5ac4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -163,22 +163,22 @@ static unsigned long tick_dep_mask; static void trace_tick_dependency(unsigned long dep) { if (dep & TICK_DEP_MASK_POSIX_TIMER) { - trace_tick_stop(0, "posix timers running\n"); + trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return; } if (dep & TICK_DEP_MASK_PERF_EVENTS) { - trace_tick_stop(0, "perf events running\n"); + trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); return; } if (dep & TICK_DEP_MASK_SCHED) { - trace_tick_stop(0, "more than 1 task in runqueue\n"); + trace_tick_stop(0, TICK_DEP_MASK_SCHED); return; } if (dep & TICK_DEP_MASK_CLOCK_UNSTABLE) - trace_tick_stop(0, "unstable sched clock\n"); + trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); } static bool can_stop_full_tick(struct tick_sched *ts) @@ -206,17 +206,17 @@ static bool can_stop_full_tick(struct tick_sched *ts) } if (!sched_can_stop_tick()) { - trace_tick_stop(0, "more than 1 task in runqueue\n"); + trace_tick_stop(0, TICK_DEP_MASK_SCHED); return false; } if (!posix_cpu_timers_can_stop_tick(current)) { - trace_tick_stop(0, "posix timers running\n"); + trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return false; } if (!perf_event_can_stop_tick()) { - trace_tick_stop(0, "perf events running\n"); + trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); return false; } @@ -228,7 +228,7 @@ static bool can_stop_full_tick(struct tick_sched *ts) * sched_clock_stable is set. */ if (!sched_clock_stable()) { - trace_tick_stop(0, "unstable sched clock\n"); + trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); /* * Don't allow the user to think they can get * full NO_HZ with this machine. @@ -821,7 +821,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; - trace_tick_stop(1, " "); + trace_tick_stop(1, TICK_DEP_MASK_NONE); } /* -- cgit v0.10.2 From 555e0c1ef7ff49ee5ac3a1eb12de4a2e4722f63d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2015 17:42:29 +0200 Subject: perf: Migrate perf to use new tick dependency mask model Instead of providing asynchronous checks for the nohz subsystem to verify perf event tick dependency, migrate perf to the new mask. Perf needs the tick for two situations: 1) Freq events. We could set the tick dependency when those are installed on a CPU context. But setting a global dependency on top of the global freq events accounting is much easier. If people want that to be optimized, we can still refine that on the per-CPU tick dependency level. This patch dooesn't change the current behaviour anyway. 2) Throttled events: this is a per-cpu dependency. 
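A minimal userspace sketch of the mask model described above (illustrative only: plain C11 atomics stand in for the kernel's fetch_or()/clear_bit() on the global and per-CPU masks, and printf() stands in for the IPI kick performed by tick_nohz_dep_set_all()):

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 4
#define TICK_DEP_BIT_PERF_EVENTS 1

static atomic_ulong global_dep;            /* freq events: system-wide dependency */
static atomic_ulong percpu_dep[NR_CPUS];   /* throttled events: per-CPU dependency */

static void dep_set(atomic_ulong *mask, int bit)
{
        /* Kick only on the empty -> non-empty transition of the mask. */
        if (!atomic_fetch_or(mask, 1UL << bit))
                printf("kick: target must re-evaluate whether it can stop the tick\n");
}

static void dep_clear(atomic_ulong *mask, int bit)
{
        atomic_fetch_and(mask, ~(1UL << bit));
}

int main(void)
{
        dep_set(&global_dep, TICK_DEP_BIT_PERF_EVENTS);      /* first freq event accounted */
        dep_set(&percpu_dep[0], TICK_DEP_BIT_PERF_EVENTS);   /* CPU 0 throttled an event */
        dep_clear(&percpu_dep[0], TICK_DEP_BIT_PERF_EVENTS); /* unthrottled on next tick */
        dep_clear(&global_dep, TICK_DEP_BIT_PERF_EVENTS);    /* last freq event gone */
        return 0;
}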
Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b35a61a..d3ff88c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1108,12 +1108,6 @@ static inline void perf_event_task_tick(void) { } static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } #endif -#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_NO_HZ_FULL) -extern bool perf_event_can_stop_tick(void); -#else -static inline bool perf_event_can_stop_tick(void) { return true; } -#endif - #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern void perf_restore_debug_store(void); #else diff --git a/include/linux/tick.h b/include/linux/tick.h index 96d276a..0e63db4 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -233,7 +233,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, tick_nohz_dep_clear_signal(signal, bit); } -extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_cpu(int cpu); extern void tick_nohz_full_kick_all(void); extern void __tick_nohz_task_switch(void); @@ -260,7 +259,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, enum tick_dep_bits bit) { } static inline void tick_nohz_full_kick_cpu(int cpu) { } -static inline void tick_nohz_full_kick(void) { } static inline void tick_nohz_full_kick_all(void) { } static inline void __tick_nohz_task_switch(void) { } #endif diff --git a/kernel/events/core.c b/kernel/events/core.c index 5946460..f23d480 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3060,17 +3060,6 @@ done: return rotate; } -#ifdef CONFIG_NO_HZ_FULL -bool perf_event_can_stop_tick(void) -{ - if (atomic_read(&nr_freq_events) || - __this_cpu_read(perf_throttled_count)) - return false; - else - return true; -} -#endif - void perf_event_task_tick(void) { struct list_head *head = this_cpu_ptr(&active_ctx_list); @@ -3081,6 +3070,7 @@ void perf_event_task_tick(void) __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); + tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) perf_adjust_freq_unthr_context(ctx, throttled); @@ -3511,6 +3501,28 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) atomic_dec(&per_cpu(perf_cgroup_events, cpu)); } +#ifdef CONFIG_NO_HZ_FULL +static DEFINE_SPINLOCK(nr_freq_lock); +#endif + +static void unaccount_freq_event_nohz(void) +{ +#ifdef CONFIG_NO_HZ_FULL + spin_lock(&nr_freq_lock); + if (atomic_dec_and_test(&nr_freq_events)) + tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS); + spin_unlock(&nr_freq_lock); +#endif +} + +static void unaccount_freq_event(void) +{ + if (tick_nohz_full_enabled()) + unaccount_freq_event_nohz(); + else + atomic_dec(&nr_freq_events); +} + static void unaccount_event(struct perf_event *event) { bool dec = false; @@ -3527,7 +3539,7 @@ static void unaccount_event(struct perf_event *event) if (event->attr.task) atomic_dec(&nr_task_events); if (event->attr.freq) - atomic_dec(&nr_freq_events); + unaccount_freq_event(); if (event->attr.context_switch) { dec = true; atomic_dec(&nr_switch_events); @@ -6349,9 +6361,9 @@ static int __perf_event_overflow(struct perf_event *event, if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) { 
__this_cpu_inc(perf_throttled_count); + tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(event, 0); - tick_nohz_full_kick(); ret = 1; } } @@ -7741,6 +7753,27 @@ static void account_event_cpu(struct perf_event *event, int cpu) atomic_inc(&per_cpu(perf_cgroup_events, cpu)); } +/* Freq events need the tick to stay alive (see perf_event_task_tick). */ +static void account_freq_event_nohz(void) +{ +#ifdef CONFIG_NO_HZ_FULL + /* Lock so we don't race with concurrent unaccount */ + spin_lock(&nr_freq_lock); + if (atomic_inc_return(&nr_freq_events) == 1) + tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS); + spin_unlock(&nr_freq_lock); +#endif +} + +static void account_freq_event(void) +{ + if (tick_nohz_full_enabled()) + account_freq_event_nohz(); + else + atomic_inc(&nr_freq_events); +} + + static void account_event(struct perf_event *event) { bool inc = false; @@ -7756,10 +7789,8 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_comm_events); if (event->attr.task) atomic_inc(&nr_task_events); - if (event->attr.freq) { - if (atomic_inc_return(&nr_freq_events) == 1) - tick_nohz_full_kick_all(); - } + if (event->attr.freq) + account_freq_event(); if (event->attr.context_switch) { atomic_inc(&nr_switch_events); inc = true; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 47e5ac4..3f79def 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -215,11 +214,6 @@ static bool can_stop_full_tick(struct tick_sched *ts) return false; } - if (!perf_event_can_stop_tick()) { - trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); - return false; - } - #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK /* * sched_clock_tick() needs us? @@ -257,7 +251,7 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), * is NMI safe. */ -void tick_nohz_full_kick(void) +static void tick_nohz_full_kick(void) { if (!tick_nohz_full_cpu(smp_processor_id())) return; -- cgit v0.10.2 From 01d36d0ac390895e719d0dd8ab91ebbbf506d28e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 4 Nov 2015 18:17:10 +0100 Subject: sched: Account rr tasks In order to evaluate the scheduler tick dependency without probing context switches, we need to know how much SCHED_RR and SCHED_FIFO tasks are enqueued as those policies don't have the same preemption requirements. To prepare for that, let's account SCHED_RR tasks, we'll be able to deduce SCHED_FIFO tasks as well from it and the total RT tasks in the runqueue. Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86ab..3f1fcff 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1142,12 +1142,27 @@ unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) } static inline +unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se) +{ + struct rt_rq *group_rq = group_rt_rq(rt_se); + struct task_struct *tsk; + + if (group_rq) + return group_rq->rr_nr_running; + + tsk = rt_task_of(rt_se); + + return (tsk->policy == SCHED_RR) ? 
1 : 0; +} + +static inline void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { int prio = rt_se_prio(rt_se); WARN_ON(!rt_prio(prio)); rt_rq->rt_nr_running += rt_se_nr_running(rt_se); + rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se); inc_rt_prio(rt_rq, prio); inc_rt_migration(rt_se, rt_rq); @@ -1160,6 +1175,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); rt_rq->rt_nr_running -= rt_se_nr_running(rt_se); + rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se); dec_rt_prio(rt_rq, rt_se_prio(rt_se)); dec_rt_migration(rt_se, rt_rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 10f1637..f0abfce 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -450,6 +450,7 @@ static inline int rt_bandwidth_enabled(void) struct rt_rq { struct rt_prio_array active; unsigned int rt_nr_running; + unsigned int rr_nr_running; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED struct { int curr; /* highest queued rt task prio */ -- cgit v0.10.2 From 76d92ac305f23cada3a9b3c48a7ccea5f71019cb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2015 22:25:49 +0200 Subject: sched: Migrate sched to use new tick dependency mask model Instead of providing asynchronous checks for the nohz subsystem to verify sched tick dependency, migrate sched to the new mask. Everytime a task is enqueued or dequeued, we evaluate the state of the tick dependency on top of the policy of the tasks in the runqueue, by order of priority: SCHED_DEADLINE: Need the tick in order to periodically check for runtime SCHED_FIFO : Don't need the tick (no round-robin) SCHED_RR : Need the tick if more than 1 task of the same priority for round robin (simplified with checking if more than one SCHED_RR task no matter what priority). SCHED_NORMAL : Need the tick if more than 1 task for round-robin. We could optimize that further with one flag per sched policy on the tick dependency mask and perform only the checks relevant to the policy concerned by an enqueue/dequeue operation. Since the checks aren't based on the current task anymore, we could get rid of the task switch hook but it's still needed for posix cpu timers. Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker diff --git a/include/linux/sched.h b/include/linux/sched.h index 307e483..2b10348 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2364,10 +2364,7 @@ static inline void wake_up_nohz_cpu(int cpu) { } #endif #ifdef CONFIG_NO_HZ_FULL -extern bool sched_can_stop_tick(void); extern u64 scheduler_tick_max_deferment(void); -#else -static inline bool sched_can_stop_tick(void) { return false; } #endif #ifdef CONFIG_SCHED_AUTOGROUP diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7142feb..1fad823 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -701,31 +701,36 @@ static inline bool got_nohz_idle_kick(void) #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL -bool sched_can_stop_tick(void) +bool sched_can_stop_tick(struct rq *rq) { + int fifo_nr_running; + + /* Deadline tasks, even if single, need the tick */ + if (rq->dl.dl_nr_running) + return false; + /* - * FIFO realtime policy runs the highest priority task. Other runnable - * tasks are of a lower priority. The scheduler tick does nothing. 
+ * FIFO realtime policy runs the highest priority task (after DEADLINE). + * Other runnable tasks are of a lower priority. The scheduler tick + * isn't needed. */ - if (current->policy == SCHED_FIFO) + fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running; + if (fifo_nr_running) return true; /* * Round-robin realtime tasks time slice with other tasks at the same - * realtime priority. Is this task the only one at this priority? + * realtime priority. */ - if (current->policy == SCHED_RR) { - struct sched_rt_entity *rt_se = ¤t->rt; - - return list_is_singular(&rt_se->run_list); + if (rq->rt.rr_nr_running) { + if (rq->rt.rr_nr_running == 1) + return true; + else + return false; } - /* - * More than one running task need preemption. - * nr_running update is assumed to be visible - * after IPI is sent from wakers. - */ - if (this_rq()->nr_running > 1) + /* Normal multitasking need periodic preemption checks */ + if (rq->cfs.nr_running > 1) return false; return true; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f0abfce..4f0bca7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1279,6 +1279,35 @@ unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); +#ifdef CONFIG_NO_HZ_FULL +extern bool sched_can_stop_tick(struct rq *rq); + +/* + * Tick may be needed by tasks in the runqueue depending on their policy and + * requirements. If tick is needed, lets send the target an IPI to kick it out of + * nohz mode if necessary. + */ +static inline void sched_update_tick_dependency(struct rq *rq) +{ + int cpu; + + if (!tick_nohz_full_enabled()) + return; + + cpu = cpu_of(rq); + + if (!tick_nohz_full_cpu(cpu)) + return; + + if (sched_can_stop_tick(rq)) + tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); + else + tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); +} +#else +static inline void sched_update_tick_dependency(struct rq *rq) { } +#endif + static inline void add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; @@ -1290,26 +1319,16 @@ static inline void add_nr_running(struct rq *rq, unsigned count) if (!rq->rd->overload) rq->rd->overload = true; #endif - -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_cpu(rq->cpu)) { - /* - * Tick is needed if more than one task runs on a CPU. - * Send the target an IPI to kick it out of nohz mode. - * - * We assume that IPI implies full memory barrier and the - * new value of rq->nr_running is visible on reception - * from the target. 
- */ - tick_nohz_full_kick_cpu(rq->cpu); - } -#endif } + + sched_update_tick_dependency(rq); } static inline void sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; + /* Check if we still need preemption */ + sched_update_tick_dependency(rq); } static inline void rq_last_tick_reset(struct rq *rq) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3f79def..f312c60 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -204,11 +204,6 @@ static bool can_stop_full_tick(struct tick_sched *ts) return false; } - if (!sched_can_stop_tick()) { - trace_tick_stop(0, TICK_DEP_MASK_SCHED); - return false; - } - if (!posix_cpu_timers_can_stop_tick(current)) { trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return false; -- cgit v0.10.2 From b78783000d5cb7c5994e6742e1d1ce594bfea15b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 17 Jul 2015 22:25:49 +0200 Subject: posix-cpu-timers: Migrate to use new tick dependency mask model Instead of providing asynchronous checks for the nohz subsystem to verify posix cpu timers tick dependency, migrate the latter to the new mask. In order to keep track of the running timers and expose the tick dependency accordingly, we must probe the timers queuing and dequeuing on threads and process lists. Unfortunately it implies both task and signal level dependencies. We should be able to further optimize this and merge all that on the task level dependency, at the cost of a bit of complexity and may be overhead. Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 907f3fd..62d44c1 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -128,9 +128,6 @@ void posix_cpu_timer_schedule(struct k_itimer *timer); void run_posix_cpu_timers(struct task_struct *task); void posix_cpu_timers_exit(struct task_struct *task); void posix_cpu_timers_exit_group(struct task_struct *task); - -bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk); - void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval); diff --git a/include/linux/tick.h b/include/linux/tick.h index 0e63db4..21f7364 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -234,7 +234,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, } extern void tick_nohz_full_kick_cpu(int cpu); -extern void tick_nohz_full_kick_all(void); extern void __tick_nohz_task_switch(void); #else static inline int housekeeping_any_cpu(void) @@ -259,7 +258,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal, enum tick_dep_bits bit) { } static inline void tick_nohz_full_kick_cpu(int cpu) { } -static inline void tick_nohz_full_kick_all(void) { } static inline void __tick_nohz_task_switch(void) { } #endif diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index f5e86d2..1cafba8 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -333,7 +333,6 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) return err; } - /* * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. 
* This is called from sys_timer_create() and do_cpu_nanosleep() with the @@ -517,6 +516,10 @@ static void arm_timer(struct k_itimer *timer) cputime_expires->sched_exp = exp; break; } + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); + else + tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER); } } @@ -582,39 +585,6 @@ static int cpu_timer_sample_group(const clockid_t which_clock, return 0; } -#ifdef CONFIG_NO_HZ_FULL -static void nohz_kick_work_fn(struct work_struct *work) -{ - tick_nohz_full_kick_all(); -} - -static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); - -/* - * We need the IPIs to be sent from sane process context. - * The posix cpu timers are always set with irqs disabled. - */ -static void posix_cpu_timer_kick_nohz(void) -{ - if (context_tracking_is_enabled()) - schedule_work(&nohz_kick_work); -} - -bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) -{ - if (!task_cputime_zero(&tsk->cputime_expires)) - return false; - - /* Check if cputimer is running. This is accessed without locking. */ - if (READ_ONCE(tsk->signal->cputimer.running)) - return false; - - return true; -} -#else -static inline void posix_cpu_timer_kick_nohz(void) { } -#endif - /* * Guts of sys_timer_settime for CPU timers. * This is called with the timer locked and interrupts disabled. @@ -761,8 +731,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, sample_to_timespec(timer->it_clock, old_incr, &old->it_interval); } - if (!ret) - posix_cpu_timer_kick_nohz(); + return ret; } @@ -911,6 +880,8 @@ static void check_thread_timers(struct task_struct *tsk, __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); } } + if (task_cputime_zero(tsk_expires)) + tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); } static inline void stop_process_timers(struct signal_struct *sig) @@ -919,6 +890,7 @@ static inline void stop_process_timers(struct signal_struct *sig) /* Turn off cputimer->running. This is done without locking. */ WRITE_ONCE(cputimer->running, false); + tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER); } static u32 onecputick; @@ -1095,8 +1067,6 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) arm_timer(timer); unlock_task_sighand(p, &flags); - /* Kick full dynticks CPUs in case they need to tick on the new timer */ - posix_cpu_timer_kick_nohz(); out: timer->it_overrun_last = timer->it_overrun; timer->it_overrun = -1; @@ -1270,7 +1240,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } if (!*newval) - goto out; + return; *newval += now; } @@ -1288,8 +1258,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, tsk->signal->cputime_expires.virt_exp = *newval; break; } -out: - posix_cpu_timer_kick_nohz(); + + tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER); } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f312c60..e4f2916 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -204,11 +204,6 @@ static bool can_stop_full_tick(struct tick_sched *ts) return false; } - if (!posix_cpu_timers_can_stop_tick(current)) { - trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); - return false; - } - #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK /* * sched_clock_tick() needs us? @@ -270,7 +265,7 @@ void tick_nohz_full_kick_cpu(int cpu) * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. 
*/ -void tick_nohz_full_kick_all(void) +static void tick_nohz_full_kick_all(void) { int cpu; -- cgit v0.10.2 From 4f49b90abb4aca6fe677c95fc352fd0674d489bd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 22 Jul 2015 17:03:52 +0200 Subject: sched-clock: Migrate to use new tick dependency mask model Instead of checking sched_clock_stable from the nohz subsystem to verify its tick dependency, migrate it to the new mask in order to include it to the all-in-one check. Reviewed-by: Chris Metcalf Cc: Christoph Lameter Cc: Chris Metcalf Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index bc54e84..fedb967 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -61,6 +61,7 @@ #include #include #include +#include /* * Scheduler clock - returns current time in nanosec units. @@ -89,6 +90,8 @@ static void __set_sched_clock_stable(void) { if (!sched_clock_stable()) static_key_slow_inc(&__sched_clock_stable); + + tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); } void set_sched_clock_stable(void) @@ -108,6 +111,8 @@ static void __clear_sched_clock_stable(struct work_struct *work) /* XXX worry about clock continuity */ if (sched_clock_stable()) static_key_slow_dec(&__sched_clock_stable); + + tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); } static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e4f2916..969e670 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -204,25 +204,6 @@ static bool can_stop_full_tick(struct tick_sched *ts) return false; } -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - /* - * sched_clock_tick() needs us? - * - * TODO: kick full dynticks CPUs when - * sched_clock_stable is set. - */ - if (!sched_clock_stable()) { - trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); - /* - * Don't allow the user to think they can get - * full NO_HZ with this machine. - */ - WARN_ONCE(tick_nohz_full_running, - "NO_HZ FULL will not work with unstable sched clock"); - return false; - } -#endif - return true; } -- cgit v0.10.2 From eda1d1cf8d18383f19cd2b752f786120efa4768f Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 24 Feb 2016 17:18:25 -0500 Subject: drm/amdgpu/pm: update current crtc info after setting the powerstate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On CI, we need to see if the number of crtcs changes to determine whether or not we need to upload the mclk table again. In practice we don't currently upload the mclk table again after the initial load. The only reason you would would be to add new states, e.g., for arbitrary mclk setting which is not currently supported. 
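A minimal sketch of the ordering issue being fixed (hypothetical names, not the driver's real structures): if the "current" bookkeeping is committed before the code that compares current vs. new runs, the crtc-count change can never be observed.

#include <stdio.h>

struct dpm {
        int current_active_crtc_count;
        int new_active_crtc_count;
};

static int mclk_table_needs_reupload(const struct dpm *d)
{
        return d->current_active_crtc_count != d->new_active_crtc_count;
}

static void set_power_state(struct dpm *d)
{
        /* Buggy order: committing current = new here would make the check below a no-op. */
        if (mclk_table_needs_reupload(d))
                printf("re-upload mclk table\n");

        /* Commit the bookkeeping only after every consumer of the old value has run. */
        d->current_active_crtc_count = d->new_active_crtc_count;
}

int main(void)
{
        struct dpm d = { .current_active_crtc_count = 1, .new_active_crtc_count = 2 };
        set_power_state(&d);
        return 0;
}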
Acked-by: Jordan Lazare Acked-by: Christian König Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c index 66855b6..95a4a25 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c @@ -649,9 +649,6 @@ force: /* update display watermarks based on new power state */ amdgpu_display_bandwidth_update(adev); - adev->pm.dpm.current_active_crtcs = adev->pm.dpm.new_active_crtcs; - adev->pm.dpm.current_active_crtc_count = adev->pm.dpm.new_active_crtc_count; - /* wait for the rings to drain */ for (i = 0; i < AMDGPU_MAX_RINGS; i++) { struct amdgpu_ring *ring = adev->rings[i]; @@ -670,6 +667,9 @@ force: /* update displays */ amdgpu_dpm_display_configuration_changed(adev); + adev->pm.dpm.current_active_crtcs = adev->pm.dpm.new_active_crtcs; + adev->pm.dpm.current_active_crtc_count = adev->pm.dpm.new_active_crtc_count; + if (adev->pm.funcs->force_performance_level) { if (adev->pm.dpm.thermal_active) { enum amdgpu_dpm_forced_level level = adev->pm.dpm.forced_level; -- cgit v0.10.2 From 5e031d9fe8b0741f11d49667dfc3ebf5454121fd Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 24 Feb 2016 17:38:38 -0500 Subject: drm/radeon/pm: update current crtc info after setting the powerstate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On CI, we need to see if the number of crtcs changes to determine whether or not we need to upload the mclk table again. In practice we don't currently upload the mclk table again after the initial load. The only reason you would would be to add new states, e.g., for arbitrary mclk setting which is not currently supported. Acked-by: Christian König Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org diff --git a/drivers/gpu/drm/radeon/radeon_pm.c b/drivers/gpu/drm/radeon/radeon_pm.c index ca3be90..0f14d89 100644 --- a/drivers/gpu/drm/radeon/radeon_pm.c +++ b/drivers/gpu/drm/radeon/radeon_pm.c @@ -1080,10 +1080,6 @@ force: /* update display watermarks based on new power state */ radeon_bandwidth_update(rdev); - rdev->pm.dpm.current_active_crtcs = rdev->pm.dpm.new_active_crtcs; - rdev->pm.dpm.current_active_crtc_count = rdev->pm.dpm.new_active_crtc_count; - rdev->pm.dpm.single_display = single_display; - /* wait for the rings to drain */ for (i = 0; i < RADEON_NUM_RINGS; i++) { struct radeon_ring *ring = &rdev->ring[i]; @@ -1102,6 +1098,10 @@ force: /* update displays */ radeon_dpm_display_configuration_changed(rdev); + rdev->pm.dpm.current_active_crtcs = rdev->pm.dpm.new_active_crtcs; + rdev->pm.dpm.current_active_crtc_count = rdev->pm.dpm.new_active_crtc_count; + rdev->pm.dpm.single_display = single_display; + if (rdev->asic->dpm.force_performance_level) { if (rdev->pm.dpm.thermal_active) { enum radeon_dpm_forced_level level = rdev->pm.dpm.forced_level; -- cgit v0.10.2 From dc26a2a2b3f037c2727f514b64e8a865721ec74b Mon Sep 17 00:00:00 2001 From: Rex Zhu Date: Thu, 25 Feb 2016 17:16:52 +0800 Subject: drm/amd/powerplay: export AMD_PP_EVENT_COMPLETE_INIT task to amdgpu. This is needed to init the dynamic states without a display. To be used in the next commit. 
Signed-off-by: Rex Zhu Reviewed-by: Alex Deucher diff --git a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c index aa67244..589599f 100644 --- a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c +++ b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c @@ -402,8 +402,11 @@ int pp_dpm_dispatch_tasks(void *handle, enum amd_pp_event event_id, void *input, data.requested_ui_label = power_state_convert(ps); ret = pem_handle_event(pp_handle->eventmgr, event_id, &data); + break; } - break; + case AMD_PP_EVENT_COMPLETE_INIT: + ret = pem_handle_event(pp_handle->eventmgr, event_id, &data); + break; default: break; } diff --git a/drivers/gpu/drm/amd/powerplay/eventmgr/eventactionchains.c b/drivers/gpu/drm/amd/powerplay/eventmgr/eventactionchains.c index 83be3cf..6b52c78 100644 --- a/drivers/gpu/drm/amd/powerplay/eventmgr/eventactionchains.c +++ b/drivers/gpu/drm/amd/powerplay/eventmgr/eventactionchains.c @@ -165,6 +165,7 @@ const struct action_chain resume_action_chain = { }; static const pem_event_action *complete_init_event[] = { + unblock_adjust_power_state_tasks, adjust_power_state_tasks, enable_gfx_clock_gating_tasks, enable_gfx_voltage_island_power_gating_tasks, -- cgit v0.10.2 From 4ea2efae0d117c5b5f44081bab941542d892e758 Mon Sep 17 00:00:00 2001 From: Rex Zhu Date: Thu, 25 Feb 2016 17:32:45 +0800 Subject: drm/amd/powerplay: send event to notify powerplay all modules are initialized. with this event, powerplay can adjust current power state if needed. Signed-off-by: Rex Zhu Reviewed-by: Alex Deucher diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c index b9d0d55..3cb6d6c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_powerplay.c @@ -143,8 +143,10 @@ static int amdgpu_pp_late_init(void *handle) adev->powerplay.pp_handle); #ifdef CONFIG_DRM_AMD_POWERPLAY - if (adev->pp_enabled) + if (adev->pp_enabled) { amdgpu_pm_sysfs_init(adev); + amdgpu_dpm_dispatch_task(adev, AMD_PP_EVENT_COMPLETE_INIT, NULL, NULL); + } #endif return ret; } -- cgit v0.10.2 From feebe91aa9a9d99d9ec157612a614fadb79beb99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Fri, 26 Feb 2016 16:18:15 +0100 Subject: drm/amdgpu: apply gfx_v8 fixes to gfx_v7 as well MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We never ported that back to CIK, so we could run into VM faults here. 
Signed-off-by: Christian König Reviewed-by: Alex Deucher Cc: stable@vger.kernel.org diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c index 7732059..06602df 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c @@ -3628,6 +3628,19 @@ static void gfx_v7_0_ring_emit_vm_flush(struct amdgpu_ring *ring, unsigned vm_id, uint64_t pd_addr) { int usepfp = (ring->type == AMDGPU_RING_TYPE_GFX); + uint32_t seq = ring->fence_drv.sync_seq; + uint64_t addr = ring->fence_drv.gpu_addr; + + amdgpu_ring_write(ring, PACKET3(PACKET3_WAIT_REG_MEM, 5)); + amdgpu_ring_write(ring, (WAIT_REG_MEM_MEM_SPACE(1) | /* memory */ + WAIT_REG_MEM_FUNCTION(3) | /* equal */ + WAIT_REG_MEM_ENGINE(usepfp))); /* pfp or me */ + amdgpu_ring_write(ring, addr & 0xfffffffc); + amdgpu_ring_write(ring, upper_32_bits(addr) & 0xffffffff); + amdgpu_ring_write(ring, seq); + amdgpu_ring_write(ring, 0xffffffff); + amdgpu_ring_write(ring, 4); /* poll interval */ + if (usepfp) { /* synce CE with ME to prevent CE fetch CEIB before context switch done */ amdgpu_ring_write(ring, PACKET3(PACKET3_SWITCH_BUFFER, 0)); -- cgit v0.10.2 From 9cac537332f5502c103415b25609548c276a09f8 Mon Sep 17 00:00:00 2001 From: Chunming Zhou Date: Mon, 29 Feb 2016 14:12:38 +0800 Subject: drm/amdgpu/gfx8: specify which engine to wait before vm flush MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Select between me and pfp properly. Signed-off-by: Chunming Zhou Reviewed-by: Christian König Cc: stable@vger.kernel.org Signed-off-by: Alex Deucher diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 1c40bd9..7086ac1 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -4809,7 +4809,8 @@ static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring, amdgpu_ring_write(ring, PACKET3(PACKET3_WAIT_REG_MEM, 5)); amdgpu_ring_write(ring, (WAIT_REG_MEM_MEM_SPACE(1) | /* memory */ - WAIT_REG_MEM_FUNCTION(3))); /* equal */ + WAIT_REG_MEM_FUNCTION(3) | /* equal */ + WAIT_REG_MEM_ENGINE(usepfp))); /* pfp or me */ amdgpu_ring_write(ring, addr & 0xfffffffc); amdgpu_ring_write(ring, upper_32_bits(addr) & 0xffffffff); amdgpu_ring_write(ring, seq); -- cgit v0.10.2 From b3dae7828399ef316e3fabf7e82c6415cb03a02e Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 25 Feb 2016 11:24:52 -0500 Subject: drm/amdgpu/cz: enable/disable vce dpm even if vce pg is disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I missed this when cleaning up the vce pg handling. 
Reviewed-by: Christian König Reviewed-by: Rex Zhu Signed-off-by: Alex Deucher diff --git a/drivers/gpu/drm/amd/amdgpu/cz_dpm.c b/drivers/gpu/drm/amd/amdgpu/cz_dpm.c index 9056355..208990a 100644 --- a/drivers/gpu/drm/amd/amdgpu/cz_dpm.c +++ b/drivers/gpu/drm/amd/amdgpu/cz_dpm.c @@ -2226,10 +2226,8 @@ static void cz_dpm_powergate_vce(struct amdgpu_device *adev, bool gate) } } else { /*pi->caps_vce_pg*/ cz_update_vce_dpm(adev); - cz_enable_vce_dpm(adev, true); + cz_enable_vce_dpm(adev, !gate); } - - return; } const struct amd_ip_funcs cz_dpm_ip_funcs = { -- cgit v0.10.2 From 370afa7ac5dc6c65cebeaee841ae700eb2387a55 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 29 Feb 2016 15:29:48 -0500 Subject: drm/amdgpu/powerplay/cz: enable/disable vce dpm independent of vce pg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we don't disable it when vce is not in use, we use extra power if vce pg is disabled. Reviewed-by: Christian König Reviewed-by: Rex Zhu Signed-off-by: Alex Deucher diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/cz_clockpowergating.c b/drivers/gpu/drm/amd/powerplay/hwmgr/cz_clockpowergating.c index ad77008..ff08ce4 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/cz_clockpowergating.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/cz_clockpowergating.c @@ -226,7 +226,7 @@ int cz_dpm_powergate_vce(struct pp_hwmgr *hwmgr, bool bgate) } } else { cz_dpm_update_vce_dpm(hwmgr); - cz_enable_disable_vce_dpm(hwmgr, true); + cz_enable_disable_vce_dpm(hwmgr, !bgate); return 0; } -- cgit v0.10.2 From 89913ea615754163ae9a066c9c1b95aa2a2f51b4 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 29 Feb 2016 16:11:07 -0500 Subject: drm/amdgpu/cz: remove commented out call to enable vce pg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This code path is not currently enabled now that we properly respect the vce pg flags, so uncomment the actual pg calls so the code is as it should be we are eventually able to enable vce pg. Reviewed-by: Christian König Reviewed-by: Rex Zhu Signed-off-by: Alex Deucher diff --git a/drivers/gpu/drm/amd/amdgpu/cz_dpm.c b/drivers/gpu/drm/amd/amdgpu/cz_dpm.c index 208990a..e7ef226 100644 --- a/drivers/gpu/drm/amd/amdgpu/cz_dpm.c +++ b/drivers/gpu/drm/amd/amdgpu/cz_dpm.c @@ -2202,8 +2202,7 @@ static void cz_dpm_powergate_vce(struct amdgpu_device *adev, bool gate) AMD_PG_STATE_GATE); cz_enable_vce_dpm(adev, false); - /* TODO: to figure out why vce can't be poweroff. */ - /* cz_send_msg_to_smc(adev, PPSMC_MSG_VCEPowerOFF); */ + cz_send_msg_to_smc(adev, PPSMC_MSG_VCEPowerOFF); pi->vce_power_gated = true; } else { cz_send_msg_to_smc(adev, PPSMC_MSG_VCEPowerON); -- cgit v0.10.2 From 0b39c531cfa12dad54eac238c2e303b994df1ef7 Mon Sep 17 00:00:00 2001 From: Arindam Nath Date: Wed, 2 Mar 2016 17:19:01 +0530 Subject: drm/amdgpu: return from atombios_dp_get_dpcd only when error In amdgpu_connector_hotplug(), we need to start DP link training only after we have received DPCD. The function amdgpu_atombios_dp_get_dpcd() returns non-zero value only when an error condition is met, otherwise returns zero. So in case the function encounters an error, we need to skip rest of the code and return from amdgpu_connector_hotplug() immediately. Only when we are successfull in reading DPCD pin, we should carry on with turning-on the monitor. 
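A minimal sketch of the return-value convention this fix depends on (hypothetical helper names, not the driver's API): the DPCD read returns 0 on success and non-zero on error, so the hotplug handler must return early only when the result is non-zero.

#include <stdio.h>

/* Stand-in for the DPCD read: 0 on success, non-zero error code on failure. */
static int get_dpcd(int simulate_error)
{
        return simulate_error ? -1 : 0;
}

static void hotplug(int simulate_error)
{
        if (get_dpcd(simulate_error))
                return;                 /* error: don't try to start link training */
        printf("DPCD read OK, start DP link training\n");
}

int main(void)
{
        hotplug(0);     /* trains the link */
        hotplug(1);     /* bails out */
        return 0;
}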
Signed-off-by: Arindam Nath Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c index 89c3dd6..119cdc2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c @@ -77,7 +77,7 @@ void amdgpu_connector_hotplug(struct drm_connector *connector) } else if (amdgpu_atombios_dp_needs_link_train(amdgpu_connector)) { /* Don't try to start link training before we * have the dpcd */ - if (!amdgpu_atombios_dp_get_dpcd(amdgpu_connector)) + if (amdgpu_atombios_dp_get_dpcd(amdgpu_connector)) return; /* set it to OFF so that drm_helper_connector_dpms() -- cgit v0.10.2 From 39680f50ae54cbbb6e72ac38b8329dd3eb9105f4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 1 Mar 2016 11:56:22 -0800 Subject: userfaultfd: don't block on the last VM updates at exit time The exit path will do some final updates to the VM of an exiting process to inform others of the fact that the process is going away. That happens, for example, for robust futex state cleanup, but also if the parent has asked for a TID update when the process exits (we clear the child tid field in user space). However, at the time we do those final VM accesses, we've already stopped accepting signals, so the usual "stop waiting for userfaults on signal" code in fs/userfaultfd.c no longer works, and the process can become an unkillable zombie waiting for something that will never happen. To solve this, just make handle_userfault() abort any user fault handling if we're already in the exit path past the signal handling state being dead (marked by PF_EXITING). This VM special case is pretty ugly, and it is possible that we should look at finalizing signals later (or move the VM final accesses earlier). But in the meantime this is a fairly minimally intrusive fix. Reported-and-tested-by: Dmitry Vyukov Acked-by: Andrea Arcangeli Signed-off-by: Linus Torvalds diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5031170..66cdb44 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -287,6 +287,12 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, goto out; /* + * We don't do userfault handling for the final child pid update. + */ + if (current->flags & PF_EXITING) + goto out; + + /* * Check that we can return VM_FAULT_RETRY. * * NOTE: it should become possible to return VM_FAULT_RETRY -- cgit v0.10.2 From 9cd753a1f449b9af49a4c994ffab8c4f99d91f14 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Wed, 2 Mar 2016 13:47:05 +0100 Subject: bnx2x: fix crash on big-endian when adding VLAN bnx2x crashes during the initialization of the 8021q module on ppc64. The bug is a missing conversion from le32 in bnx2x_handle_classification_eqe() when obtaining the cid value from struct eth_event_data. The fields in struct eth_event_data should all be declared as little-endian and conversions added where missing. Signed-off-by: Michal Schmidt Signed-off-by: David S. 
Miller diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h index 27aa080..90a4dba 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h @@ -5207,9 +5207,9 @@ struct e2_integ_data { * set mac event data */ struct eth_event_data { - u32 echo; - u32 reserved0; - u32 reserved1; + __le32 echo; + __le32 reserved0; + __le32 reserved1; }; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index 6c4e3a6..b707ba8 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -5280,14 +5280,14 @@ static void bnx2x_handle_classification_eqe(struct bnx2x *bp, { unsigned long ramrod_flags = 0; int rc = 0; - u32 cid = elem->message.data.eth_event.echo & BNX2X_SWCID_MASK; + u32 echo = le32_to_cpu(elem->message.data.eth_event.echo); + u32 cid = echo & BNX2X_SWCID_MASK; struct bnx2x_vlan_mac_obj *vlan_mac_obj; /* Always push next commands out, don't wait here */ __set_bit(RAMROD_CONT, &ramrod_flags); - switch (le32_to_cpu((__force __le32)elem->message.data.eth_event.echo) - >> BNX2X_SWCID_SHIFT) { + switch (echo >> BNX2X_SWCID_SHIFT) { case BNX2X_FILTER_MAC_PENDING: DP(BNX2X_MSG_SP, "Got SETUP_MAC completions\n"); if (CNIC_LOADED(bp) && (cid == BNX2X_ISCSI_ETH_CID(bp))) @@ -5308,8 +5308,7 @@ static void bnx2x_handle_classification_eqe(struct bnx2x *bp, bnx2x_handle_mcast_eqe(bp); return; default: - BNX2X_ERR("Unsupported classification command: %d\n", - elem->message.data.eth_event.echo); + BNX2X_ERR("Unsupported classification command: 0x%x\n", echo); return; } @@ -5596,10 +5595,8 @@ static void bnx2x_eq_int(struct bnx2x *bp) BNX2X_STATE_OPENING_WAIT4_PORT): case (EVENT_RING_OPCODE_RSS_UPDATE_RULES | BNX2X_STATE_CLOSING_WAIT4_HALT): - cid = elem->message.data.eth_event.echo & - BNX2X_SWCID_MASK; DP(BNX2X_MSG_SP, "got RSS_UPDATE ramrod. 
CID %d\n", - cid); + SW_CID(elem->message.data.eth_event.echo)); rss_raw->clear_pending(rss_raw); break; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c index 9d02734..55fcc21 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c @@ -1672,11 +1672,12 @@ void bnx2x_vf_handle_classification_eqe(struct bnx2x *bp, { unsigned long ramrod_flags = 0; int rc = 0; + u32 echo = le32_to_cpu(elem->message.data.eth_event.echo); /* Always push next commands out, don't wait here */ set_bit(RAMROD_CONT, &ramrod_flags); - switch (elem->message.data.eth_event.echo >> BNX2X_SWCID_SHIFT) { + switch (echo >> BNX2X_SWCID_SHIFT) { case BNX2X_FILTER_MAC_PENDING: rc = vfq->mac_obj.complete(bp, &vfq->mac_obj, elem, &ramrod_flags); @@ -1686,8 +1687,7 @@ void bnx2x_vf_handle_classification_eqe(struct bnx2x *bp, &ramrod_flags); break; default: - BNX2X_ERR("Unsupported classification command: %d\n", - elem->message.data.eth_event.echo); + BNX2X_ERR("Unsupported classification command: 0x%x\n", echo); return; } if (rc < 0) @@ -1755,8 +1755,7 @@ int bnx2x_iov_eq_sp_event(struct bnx2x *bp, union event_ring_elem *elem) case EVENT_RING_OPCODE_MULTICAST_RULES: case EVENT_RING_OPCODE_FILTERS_RULES: case EVENT_RING_OPCODE_RSS_UPDATE_RULES: - cid = (elem->message.data.eth_event.echo & - BNX2X_SWCID_MASK); + cid = SW_CID(elem->message.data.eth_event.echo); DP(BNX2X_MSG_IOV, "checking filtering comp cid=%d\n", cid); break; case EVENT_RING_OPCODE_VF_FLR: -- cgit v0.10.2 From 5cc5c2d2e5a46c7395a0453f9ea1a446ac4c1b97 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Wed, 2 Mar 2016 13:47:06 +0100 Subject: bnx2x: fix sending VF->PF messages on big-endian When a VF is sending a message to the PF, it needs to trigger the PF to tell it the message is ready. The trigger did not work on ppc64. No interrupt appeared in the PF. The bug is due to confusion about the layout of struct trigger_vf_zone. In bnx2x_send_msg2pf() the trigger is written using writeb(), not writel(), so the attempt to define the struct with a reversed layout on big-endian is counter-productive. Signed-off-by: Michal Schmidt Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h index 90a4dba..47b13ed 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h @@ -5114,15 +5114,9 @@ struct vf_pf_channel_zone_trigger { * zone that triggers the in-bound interrupt */ struct trigger_vf_zone { -#if defined(__BIG_ENDIAN) - u16 reserved1; - u8 reserved0; - struct vf_pf_channel_zone_trigger vf_pf_channel; -#elif defined(__LITTLE_ENDIAN) struct vf_pf_channel_zone_trigger vf_pf_channel; u8 reserved0; u16 reserved1; -#endif u32 reserved2; }; -- cgit v0.10.2 From a524ef77d54d7e12045841f4fcc430c1850f71f2 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Wed, 2 Mar 2016 13:47:07 +0100 Subject: bnx2x: fix receive of VF->PF mailbox messages by the PF on big-endian On ppc64 the PF did not receive messages from VFs correctly. Fields of struct vf_pf_event_data are little-endian. Signed-off-by: Michal Schmidt Signed-off-by: David S. 
Miller diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h index 47b13ed..1058591 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h @@ -5213,9 +5213,9 @@ struct eth_event_data { struct vf_pf_event_data { u8 vf_id; u8 reserved0; - u16 reserved1; - u32 msg_addr_lo; - u32 msg_addr_hi; + __le16 reserved1; + __le32 msg_addr_lo; + __le32 msg_addr_hi; }; /* diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c index 1374e53..bfae300 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c @@ -2187,8 +2187,10 @@ void bnx2x_vf_mbx_schedule(struct bnx2x *bp, /* Update VFDB with current message and schedule its handling */ mutex_lock(&BP_VFDB(bp)->event_mutex); - BP_VF_MBX(bp, vf_idx)->vf_addr_hi = vfpf_event->msg_addr_hi; - BP_VF_MBX(bp, vf_idx)->vf_addr_lo = vfpf_event->msg_addr_lo; + BP_VF_MBX(bp, vf_idx)->vf_addr_hi = + le32_to_cpu(vfpf_event->msg_addr_hi); + BP_VF_MBX(bp, vf_idx)->vf_addr_lo = + le32_to_cpu(vfpf_event->msg_addr_lo); BP_VFDB(bp)->event_occur |= (1ULL << vf_idx); mutex_unlock(&BP_VFDB(bp)->event_mutex); -- cgit v0.10.2 From ca4f2d5036be809b0f82ce38a7663d237efa3986 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Wed, 2 Mar 2016 13:47:08 +0100 Subject: bnx2x: access cfc_del_event only if the opcode is CFC_DEL It's not really a bug, but it was odd that bnx2x_eq_int() read the message data as if it were a cfc_del_event regardless of the event type. It's cleaner to access only the appropriate member of union event_data after checking the event opcode. Signed-off-by: Michal Schmidt Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index b707ba8..0e0bcbd 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -5477,9 +5477,6 @@ static void bnx2x_eq_int(struct bnx2x *bp) goto next_spqe; } - /* elem CID originates from FW; actually LE */ - cid = SW_CID((__force __le32) - elem->message.data.cfc_del_event.cid); opcode = elem->message.opcode; /* handle eq element */ @@ -5502,6 +5499,11 @@ static void bnx2x_eq_int(struct bnx2x *bp) * we may want to verify here that the bp state is * HALTING */ + + /* elem CID originates from FW; actually LE */ + cid = SW_CID((__force __le32) + elem->message.data.cfc_del_event.cid); + DP(BNX2X_MSG_SP, "got delete ramrod for MULTI[%d]\n", cid); -- cgit v0.10.2 From da472731d8d0a50c451f5116477ffd9b3165f309 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Wed, 2 Mar 2016 13:47:09 +0100 Subject: bnx2x: define fields of struct cfc_del_event_data as little-endian There were no missing endianness conversions in this case, but the fields of struct cfc_del_event_data should be defined as little-endian to get rid of the ugly (__force __le32) casts. Signed-off-by: Michal Schmidt Signed-off-by: David S. 
Miller diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h index 1058591..a24909a 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h @@ -4896,9 +4896,9 @@ struct c2s_pri_trans_table_entry { * cfc delete event data */ struct cfc_del_event_data { - u32 cid; - u32 reserved0; - u32 reserved1; + __le32 cid; + __le32 reserved0; + __le32 reserved1; }; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index 0e0bcbd..9f5716a 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -5501,8 +5501,7 @@ static void bnx2x_eq_int(struct bnx2x *bp) */ /* elem CID originates from FW; actually LE */ - cid = SW_CID((__force __le32) - elem->message.data.cfc_del_event.cid); + cid = SW_CID(elem->message.data.cfc_del_event.cid); DP(BNX2X_MSG_SP, "got delete ramrod for MULTI[%d]\n", cid); diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c index 55fcc21..632daff 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c @@ -1747,8 +1747,7 @@ int bnx2x_iov_eq_sp_event(struct bnx2x *bp, union event_ring_elem *elem) switch (opcode) { case EVENT_RING_OPCODE_CFC_DEL: - cid = SW_CID((__force __le32) - elem->message.data.cfc_del_event.cid); + cid = SW_CID(elem->message.data.cfc_del_event.cid); DP(BNX2X_MSG_IOV, "checking cfc-del comp cid=%d\n", cid); break; case EVENT_RING_OPCODE_CLASSIFICATION_RULES: -- cgit v0.10.2 From 8d1c6bcb0f7f6e9f1481bcd3a1c52e75cc1f7fa4 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Wed, 2 Mar 2016 13:47:10 +0100 Subject: bnx2x: define event data reserved fields as little-endian For consistency with other event data structs and to lessen the chance of a mistake should one of the reserved fields become used in the future, define the reserved fields as little-endian. Signed-off-by: Michal Schmidt Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h index a24909a..91874d2 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h @@ -5224,9 +5224,9 @@ struct vf_pf_event_data { struct vf_flr_event_data { u8 vf_id; u8 reserved0; - u16 reserved1; - u32 reserved2; - u32 reserved3; + __le16 reserved1; + __le32 reserved2; + __le32 reserved3; }; /* @@ -5235,9 +5235,9 @@ struct vf_flr_event_data { struct malicious_vf_event_data { u8 vf_id; u8 err_id; - u16 reserved1; - u32 reserved2; - u32 reserved3; + __le16 reserved1; + __le32 reserved2; + __le32 reserved3; }; /* -- cgit v0.10.2 From 7e88009b9b2a22a20064abdb2a6536eeb3bcf432 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Wed, 2 Mar 2016 13:47:11 +0100 Subject: bnx2x: fix indentation in bnx2x_sp_task() Fix a case of misleading code indentation. Signed-off-by: Michal Schmidt Signed-off-by: David S. 
Miller diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index 9f5716a..2bf9c87 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -5682,7 +5682,7 @@ static void bnx2x_sp_task(struct work_struct *work) if (status & BNX2X_DEF_SB_IDX) { struct bnx2x_fastpath *fp = bnx2x_fcoe_fp(bp); - if (FCOE_INIT(bp) && + if (FCOE_INIT(bp) && (bnx2x_has_rx_work(fp) || bnx2x_has_tx_work(fp))) { /* Prevent local bottom-halves from running as * we are going to change the local NAPI list. -- cgit v0.10.2 From 607e681111800f9bbfcd823a385a45dd0295dfb8 Mon Sep 17 00:00:00 2001 From: Chunhao Lin Date: Thu, 3 Mar 2016 00:59:15 +0800 Subject: r8169: Enable RX_MULTI_EN for RTL_GIGA_MAC_VER_41~48 For RTL8168G/RTL8168H/RTL8411B/RTL8107E, enable this flag to eliminate message "AMD-Vi: Event logged [IO_PAGE_FAULT device=01:00.0 domain=0x0002 address=0x0000000000003000 flags=0x0050] in dmesg. Signed-off-by: Chunhao Lin Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 4caeacb..dd2cf37 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -4933,8 +4933,6 @@ static void rtl_init_rxcfg(struct rtl8169_private *tp) RTL_W32(RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST); break; case RTL_GIGA_MAC_VER_40: - RTL_W32(RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST | RX_EARLY_OFF); - break; case RTL_GIGA_MAC_VER_41: case RTL_GIGA_MAC_VER_42: case RTL_GIGA_MAC_VER_43: @@ -4943,8 +4941,6 @@ static void rtl_init_rxcfg(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_46: case RTL_GIGA_MAC_VER_47: case RTL_GIGA_MAC_VER_48: - RTL_W32(RxConfig, RX128_INT_EN | RX_DMA_BURST | RX_EARLY_OFF); - break; case RTL_GIGA_MAC_VER_49: case RTL_GIGA_MAC_VER_50: case RTL_GIGA_MAC_VER_51: -- cgit v0.10.2 From 59a7c2fd336eaafb030aac9c91ac21d136a99f33 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 29 Feb 2016 21:17:09 +0200 Subject: net/mlx5e: Remove wrong poll CQ optimization With the MLX5E_CQ_HAS_CQES optimization flag, the following buggy flow might occur: - Suppose RX is always busy, TX has a single packet every second. - We poll a single TX cqe and clear its flag. - We never arm it again as RX is always busy. - TX CQ flag is never changed, and new TX cqes are not polled. We revert this optimization. Fixes: e586b3b0baee ('net/mlx5: Ethernet Datapath files') Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: David S. 
Miller diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index aac071a..614a602 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -304,14 +304,9 @@ enum { MLX5E_RQ_STATE_POST_WQES_ENABLE, }; -enum cq_flags { - MLX5E_CQ_HAS_CQES = 1, -}; - struct mlx5e_cq { /* data path - accessed per cqe */ struct mlx5_cqwq wq; - unsigned long flags; /* data path - accessed per napi poll */ struct napi_struct *napi; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index dd959d9..3fd6a58 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -230,10 +230,6 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget) struct mlx5e_rq *rq = container_of(cq, struct mlx5e_rq, cq); int work_done; - /* avoid accessing cq (dma coherent memory) if not needed */ - if (!test_and_clear_bit(MLX5E_CQ_HAS_CQES, &cq->flags)) - return 0; - for (work_done = 0; work_done < budget; work_done++) { struct mlx5e_rx_wqe *wqe; struct mlx5_cqe64 *cqe; @@ -279,8 +275,5 @@ wq_ll_pop: /* ensure cq space is freed before enabling more cqes */ wmb(); - if (work_done == budget) - set_bit(MLX5E_CQ_HAS_CQES, &cq->flags); - return work_done; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 2c3fba0..2beea8c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -335,10 +335,6 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq) u16 sqcc; int i; - /* avoid accessing cq (dma coherent memory) if not needed */ - if (!test_and_clear_bit(MLX5E_CQ_HAS_CQES, &cq->flags)) - return false; - sq = container_of(cq, struct mlx5e_sq, cq); npkts = 0; @@ -422,10 +418,6 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq) netif_tx_wake_queue(sq->txq); sq->stats.wake++; } - if (i == MLX5E_TX_CQ_POLL_BUDGET) { - set_bit(MLX5E_CQ_HAS_CQES, &cq->flags); - return true; - } - return false; + return (i == MLX5E_TX_CQ_POLL_BUDGET); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c index 4ac8d71..66d51a7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c @@ -88,7 +88,6 @@ void mlx5e_completion_event(struct mlx5_core_cq *mcq) { struct mlx5e_cq *cq = container_of(mcq, struct mlx5e_cq, mcq); - set_bit(MLX5E_CQ_HAS_CQES, &cq->flags); set_bit(MLX5E_CHANNEL_NAPI_SCHED, &cq->channel->flags); barrier(); napi_schedule(cq->napi); -- cgit v0.10.2 From ab0394fe2c258fdb5086c51a251b28f8ee7ab35c Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 29 Feb 2016 21:17:10 +0200 Subject: net/mlx5e: Fix LRO modify Ethtool LRO enable/disable is broken, as of today we only modify TCP TIRs in order to apply the requested configuration. Hardware requires that all TIRs pointing to the same RQ should share the same LRO configuration. For that all other TIRs' LRO fields must be modified as well. Fixes: 5c50368f3831 ('net/mlx5e: Light-weight netdev open/stop') Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: David S. 
Miller diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index d4e1c30..137b05e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1317,7 +1317,7 @@ static void mlx5e_build_tir_ctx_lro(void *tirc, struct mlx5e_priv *priv) lro_timer_supported_periods[2])); } -static int mlx5e_modify_tir_lro(struct mlx5e_priv *priv, int tt) +static int mlx5e_modify_tirs_lro(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; @@ -1325,6 +1325,7 @@ static int mlx5e_modify_tir_lro(struct mlx5e_priv *priv, int tt) void *tirc; int inlen; int err; + int tt; inlen = MLX5_ST_SZ_BYTES(modify_tir_in); in = mlx5_vzalloc(inlen); @@ -1336,7 +1337,11 @@ static int mlx5e_modify_tir_lro(struct mlx5e_priv *priv, int tt) mlx5e_build_tir_ctx_lro(tirc, priv); - err = mlx5_core_modify_tir(mdev, priv->tirn[tt], in, inlen); + for (tt = 0; tt < MLX5E_NUM_TT; tt++) { + err = mlx5_core_modify_tir(mdev, priv->tirn[tt], in, inlen); + if (err) + break; + } kvfree(in); @@ -1885,8 +1890,10 @@ static int mlx5e_set_features(struct net_device *netdev, mlx5e_close_locked(priv->netdev); priv->params.lro_en = !!(features & NETIF_F_LRO); - mlx5e_modify_tir_lro(priv, MLX5E_TT_IPV4_TCP); - mlx5e_modify_tir_lro(priv, MLX5E_TT_IPV6_TCP); + err = mlx5e_modify_tirs_lro(priv); + if (err) + mlx5_core_warn(priv->mdev, "lro modify failed, %d\n", + err); if (was_opened) err = mlx5e_open_locked(priv->netdev); -- cgit v0.10.2 From 0ad9b20415a461332611666301e7812900a15ad4 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 29 Feb 2016 21:17:11 +0200 Subject: net/mlx5e: Fix soft lockup when HW Timestamping is enabled Readers/Writers lock for SW timecounter was acquired without disabling interrupts on local CPU. The problematic scenario: * HW timestamping is enabled * Timestamp overflow periodic service task is running on local CPU and holding write_lock for SW timecounter * Completion arrives, triggers interrupt for local CPU. Interrupt routine calls napi_schedule(), which triggers rx/tx skb process. An attempt to read SW timecounter using read_lock is done, which is already locked by a writer on the same CPU and cause soft lockup. Add irqsave/irqrestore for when using the readers/writers lock for writing. Fixes: ef9814deafd0 ('net/mlx5e: Add HW timestamping (TS) support') Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed Signed-off-by: David S. 
Miller diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c index be65435..2018eeb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c @@ -62,10 +62,11 @@ static void mlx5e_timestamp_overflow(struct work_struct *work) struct delayed_work *dwork = to_delayed_work(work); struct mlx5e_tstamp *tstamp = container_of(dwork, struct mlx5e_tstamp, overflow_work); + unsigned long flags; - write_lock(&tstamp->lock); + write_lock_irqsave(&tstamp->lock, flags); timecounter_read(&tstamp->clock); - write_unlock(&tstamp->lock); + write_unlock_irqrestore(&tstamp->lock, flags); schedule_delayed_work(&tstamp->overflow_work, tstamp->overflow_period); } @@ -136,10 +137,11 @@ static int mlx5e_ptp_settime(struct ptp_clock_info *ptp, struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp, ptp_info); u64 ns = timespec64_to_ns(ts); + unsigned long flags; - write_lock(&tstamp->lock); + write_lock_irqsave(&tstamp->lock, flags); timecounter_init(&tstamp->clock, &tstamp->cycles, ns); - write_unlock(&tstamp->lock); + write_unlock_irqrestore(&tstamp->lock, flags); return 0; } @@ -150,10 +152,11 @@ static int mlx5e_ptp_gettime(struct ptp_clock_info *ptp, struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp, ptp_info); u64 ns; + unsigned long flags; - write_lock(&tstamp->lock); + write_lock_irqsave(&tstamp->lock, flags); ns = timecounter_read(&tstamp->clock); - write_unlock(&tstamp->lock); + write_unlock_irqrestore(&tstamp->lock, flags); *ts = ns_to_timespec64(ns); @@ -164,10 +167,11 @@ static int mlx5e_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) { struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp, ptp_info); + unsigned long flags; - write_lock(&tstamp->lock); + write_lock_irqsave(&tstamp->lock, flags); timecounter_adjtime(&tstamp->clock, delta); - write_unlock(&tstamp->lock); + write_unlock_irqrestore(&tstamp->lock, flags); return 0; } @@ -176,6 +180,7 @@ static int mlx5e_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta) { u64 adj; u32 diff; + unsigned long flags; int neg_adj = 0; struct mlx5e_tstamp *tstamp = container_of(ptp, struct mlx5e_tstamp, ptp_info); @@ -189,11 +194,11 @@ static int mlx5e_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta) adj *= delta; diff = div_u64(adj, 1000000000ULL); - write_lock(&tstamp->lock); + write_lock_irqsave(&tstamp->lock, flags); timecounter_read(&tstamp->clock); tstamp->cycles.mult = neg_adj ? tstamp->nominal_c_mult - diff : tstamp->nominal_c_mult + diff; - write_unlock(&tstamp->lock); + write_unlock_irqrestore(&tstamp->lock, flags); return 0; } -- cgit v0.10.2 From bdfc028de1b3cd59490d5413a5c87b0fa50040c2 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 29 Feb 2016 21:17:12 +0200 Subject: net/mlx5e: Fix ethtool RX hash func configuration change We should modify TIRs explicitly to apply the new RSS configuration. The light ndo close/open calls do not "refresh" them. Fixes: 2d75b2bc8a8c ('net/mlx5e: Add ethtool RSS configuration options') Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: David S. 
Miller diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 614a602..976bddb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -447,6 +447,8 @@ enum mlx5e_traffic_types { MLX5E_NUM_TT, }; +#define IS_HASHING_TT(tt) (tt != MLX5E_TT_ANY) + enum mlx5e_rqt_ix { MLX5E_INDIRECTION_RQT, MLX5E_SINGLE_RQ_RQT, @@ -613,6 +615,7 @@ void mlx5e_enable_vlan_filter(struct mlx5e_priv *priv); void mlx5e_disable_vlan_filter(struct mlx5e_priv *priv); int mlx5e_redirect_rqt(struct mlx5e_priv *priv, enum mlx5e_rqt_ix rqt_ix); +void mlx5e_build_tir_ctx_hash(void *tirc, struct mlx5e_priv *priv); int mlx5e_open_locked(struct net_device *netdev); int mlx5e_close_locked(struct net_device *netdev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 65624ac..64af1b0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -703,18 +703,36 @@ static int mlx5e_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, return 0; } +static void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in, int inlen) +{ + struct mlx5_core_dev *mdev = priv->mdev; + void *tirc = MLX5_ADDR_OF(modify_tir_in, in, ctx); + int i; + + MLX5_SET(modify_tir_in, in, bitmask.hash, 1); + mlx5e_build_tir_ctx_hash(tirc, priv); + + for (i = 0; i < MLX5E_NUM_TT; i++) + if (IS_HASHING_TT(i)) + mlx5_core_modify_tir(mdev, priv->tirn[i], in, inlen); +} + static int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir, const u8 *key, const u8 hfunc) { struct mlx5e_priv *priv = netdev_priv(dev); - bool close_open; - int err = 0; + int inlen = MLX5_ST_SZ_BYTES(modify_tir_in); + void *in; if ((hfunc != ETH_RSS_HASH_NO_CHANGE) && (hfunc != ETH_RSS_HASH_XOR) && (hfunc != ETH_RSS_HASH_TOP)) return -EINVAL; + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + mutex_lock(&priv->state_lock); if (indir) { @@ -723,11 +741,6 @@ static int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir, mlx5e_redirect_rqt(priv, MLX5E_INDIRECTION_RQT); } - close_open = (key || (hfunc != ETH_RSS_HASH_NO_CHANGE)) && - test_bit(MLX5E_STATE_OPENED, &priv->state); - if (close_open) - mlx5e_close_locked(dev); - if (key) memcpy(priv->params.toeplitz_hash_key, key, sizeof(priv->params.toeplitz_hash_key)); @@ -735,12 +748,13 @@ static int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir, if (hfunc != ETH_RSS_HASH_NO_CHANGE) priv->params.rss_hfunc = hfunc; - if (close_open) - err = mlx5e_open_locked(priv->netdev); + mlx5e_modify_tirs_hash(priv, in, inlen); mutex_unlock(&priv->state_lock); - return err; + kvfree(in); + + return 0; } static int mlx5e_get_rxnfc(struct net_device *netdev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 137b05e..34b1049 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1317,6 +1317,21 @@ static void mlx5e_build_tir_ctx_lro(void *tirc, struct mlx5e_priv *priv) lro_timer_supported_periods[2])); } +void mlx5e_build_tir_ctx_hash(void *tirc, struct mlx5e_priv *priv) +{ + MLX5_SET(tirc, tirc, rx_hash_fn, + mlx5e_rx_hash_fn(priv->params.rss_hfunc)); + if (priv->params.rss_hfunc == ETH_RSS_HASH_TOP) { + void *rss_key = MLX5_ADDR_OF(tirc, tirc, + rx_hash_toeplitz_key); + size_t len = MLX5_FLD_SZ_BYTES(tirc, + rx_hash_toeplitz_key); + + 
MLX5_SET(tirc, tirc, rx_hash_symmetric, 1); + memcpy(rss_key, priv->params.toeplitz_hash_key, len); + } +} + static int mlx5e_modify_tirs_lro(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; @@ -1677,17 +1692,7 @@ static void mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 *tirc, int tt) default: MLX5_SET(tirc, tirc, indirect_table, priv->rqtn[MLX5E_INDIRECTION_RQT]); - MLX5_SET(tirc, tirc, rx_hash_fn, - mlx5e_rx_hash_fn(priv->params.rss_hfunc)); - if (priv->params.rss_hfunc == ETH_RSS_HASH_TOP) { - void *rss_key = MLX5_ADDR_OF(tirc, tirc, - rx_hash_toeplitz_key); - size_t len = MLX5_FLD_SZ_BYTES(tirc, - rx_hash_toeplitz_key); - - MLX5_SET(tirc, tirc, rx_hash_symmetric, 1); - memcpy(rss_key, priv->params.toeplitz_hash_key, len); - } + mlx5e_build_tir_ctx_hash(tirc, priv); break; } diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 51f1e54..58eef02 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4245,7 +4245,9 @@ struct mlx5_ifc_modify_tir_bitmask_bits { u8 reserved_at_20[0x1b]; u8 self_lb_en[0x1]; - u8 reserved_at_3c[0x3]; + u8 reserved_at_3c[0x1]; + u8 hash[0x1]; + u8 reserved_at_3e[0x1]; u8 lro[0x1]; }; -- cgit v0.10.2 From 85082dba0a5059c538cfa786d07f5ec5370d22fe Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 29 Feb 2016 21:17:13 +0200 Subject: net/mlx5e: Correctly handle RSS indirection table when changing number of channels Upon changing num_channels, reset the RSS indirection table to match the new value. Fixes: 2d75b2bc8a8c ('net/mlx5e: Add ethtool RSS configuration options') Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 976bddb..d0a57d5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -619,6 +619,8 @@ void mlx5e_build_tir_ctx_hash(void *tirc, struct mlx5e_priv *priv); int mlx5e_open_locked(struct net_device *netdev); int mlx5e_close_locked(struct net_device *netdev); +void mlx5e_build_default_indir_rqt(u32 *indirection_rqt, int len, + int num_channels); static inline void mlx5e_tx_notify_hw(struct mlx5e_sq *sq, struct mlx5e_tx_wqe *wqe, int bf_sz) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index 64af1b0..5abeb00 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -385,6 +385,8 @@ static int mlx5e_set_channels(struct net_device *dev, mlx5e_close_locked(dev); priv->params.num_channels = count; + mlx5e_build_default_indir_rqt(priv->params.indirection_rqt, + MLX5E_INDIR_RQT_SIZE, count); if (was_opened) err = mlx5e_open_locked(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 34b1049..02689ca 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1199,7 +1199,6 @@ static void mlx5e_fill_indir_rqt_rqns(struct mlx5e_priv *priv, void *rqtc) ix = mlx5e_bits_invert(i, MLX5E_LOG_INDIR_RQT_SIZE); ix = priv->params.indirection_rqt[ix]; - ix = ix % priv->params.num_channels; MLX5_SET(rqtc, rqtc, rq_num[i], test_bit(MLX5E_STATE_OPENED, &priv->state) ? 
priv->channel[ix]->rq.rqn : @@ -2101,12 +2100,20 @@ u16 mlx5e_get_max_inline_cap(struct mlx5_core_dev *mdev) 2 /*sizeof(mlx5e_tx_wqe.inline_hdr_start)*/; } +void mlx5e_build_default_indir_rqt(u32 *indirection_rqt, int len, + int num_channels) +{ + int i; + + for (i = 0; i < len; i++) + indirection_rqt[i] = i % num_channels; +} + static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev, struct net_device *netdev, int num_channels) { struct mlx5e_priv *priv = netdev_priv(netdev); - int i; priv->params.log_sq_size = MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE; @@ -2130,8 +2137,8 @@ static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev, netdev_rss_key_fill(priv->params.toeplitz_hash_key, sizeof(priv->params.toeplitz_hash_key)); - for (i = 0; i < MLX5E_INDIR_RQT_SIZE; i++) - priv->params.indirection_rqt[i] = i % num_channels; + mlx5e_build_default_indir_rqt(priv->params.indirection_rqt, + MLX5E_INDIR_RQT_SIZE, num_channels); priv->params.lro_wqe_sz = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ; -- cgit v0.10.2 From b081da5ee1860d1874381a413a1b71d8f9f67e83 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 29 Feb 2016 21:17:14 +0200 Subject: net/mlx5e: Add rx/tx bytes software counters Sum up rx/tx bytes in software as we do for rx/tx packets, to be reported in upcoming statistics fix. Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index d0a57d5..5b17532 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -223,6 +223,7 @@ struct mlx5e_pport_stats { static const char rq_stats_strings[][ETH_GSTRING_LEN] = { "packets", + "bytes", "csum_none", "csum_sw", "lro_packets", @@ -232,16 +233,18 @@ static const char rq_stats_strings[][ETH_GSTRING_LEN] = { struct mlx5e_rq_stats { u64 packets; + u64 bytes; u64 csum_none; u64 csum_sw; u64 lro_packets; u64 lro_bytes; u64 wqe_err; -#define NUM_RQ_STATS 6 +#define NUM_RQ_STATS 7 }; static const char sq_stats_strings[][ETH_GSTRING_LEN] = { "packets", + "bytes", "tso_packets", "tso_bytes", "csum_offload_none", @@ -253,6 +256,7 @@ static const char sq_stats_strings[][ETH_GSTRING_LEN] = { struct mlx5e_sq_stats { u64 packets; + u64 bytes; u64 tso_packets; u64 tso_bytes; u64 csum_offload_none; @@ -260,7 +264,7 @@ struct mlx5e_sq_stats { u64 wake; u64 dropped; u64 nop; -#define NUM_SQ_STATS 8 +#define NUM_SQ_STATS 9 }; struct mlx5e_stats { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 3fd6a58..59658b9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -263,6 +263,7 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget) mlx5e_build_rx_skb(cqe, rq, skb); rq->stats.packets++; + rq->stats.bytes += be32_to_cpu(cqe->byte_cnt); napi_gro_receive(cq->napi, skb); wq_ll_pop: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 2beea8c..bb4eeeb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -179,6 +179,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb) unsigned int skb_len = skb->len; u8 opcode = MLX5_OPCODE_SEND; dma_addr_t dma_addr = 0; + unsigned int num_bytes; bool bf = false; u16 headlen; u16 ds_cnt; @@ -204,8 +205,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct 
sk_buff *skb) opcode = MLX5_OPCODE_LSO; ihs = skb_transport_offset(skb) + tcp_hdrlen(skb); payload_len = skb->len - ihs; - wi->num_bytes = skb->len + - (skb_shinfo(skb)->gso_segs - 1) * ihs; + num_bytes = skb->len + (skb_shinfo(skb)->gso_segs - 1) * ihs; sq->stats.tso_packets++; sq->stats.tso_bytes += payload_len; } else { @@ -213,9 +213,11 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb) !skb->xmit_more && !skb_shinfo(skb)->nr_frags; ihs = mlx5e_get_inline_hdr_size(sq, skb, bf); - wi->num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN); + num_bytes = max_t(unsigned int, skb->len, ETH_ZLEN); } + wi->num_bytes = num_bytes; + if (skb_vlan_tag_present(skb)) { mlx5e_insert_vlan(eseg->inline_hdr_start, skb, ihs, &skb_data, &skb_len); @@ -307,6 +309,7 @@ static netdev_tx_t mlx5e_sq_xmit(struct mlx5e_sq *sq, struct sk_buff *skb) sq->bf_budget = bf ? sq->bf_budget - 1 : 0; sq->stats.packets++; + sq->stats.bytes += num_bytes; return NETDEV_TX_OK; dma_unmap_wqe_err: -- cgit v0.10.2 From faf4478bf8b7b5e6c94cb07a52a00e4d1cd647c6 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 29 Feb 2016 21:17:15 +0200 Subject: net/mlx5e: Provide correct packet/bytes statistics Using the HW VPort counters for traffic (rx/tx packets/bytes) statistics is wrong. This is because frames dropped due to steering or out of buffer will be counted as received. To fix that, we move to use the packet/bytes accounting done by the driver for what the netdev reports out. Fixes: f62b8bb8f2d3 ('net/mlx5: Extend mlx5_core to support [...]') Signed-off-by: Gal Pressman Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 02689ca..402994b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -141,6 +141,10 @@ void mlx5e_update_stats(struct mlx5e_priv *priv) return; /* Collect firts the SW counters and then HW for consistency */ + s->rx_packets = 0; + s->rx_bytes = 0; + s->tx_packets = 0; + s->tx_bytes = 0; s->tso_packets = 0; s->tso_bytes = 0; s->tx_queue_stopped = 0; @@ -155,6 +159,8 @@ void mlx5e_update_stats(struct mlx5e_priv *priv) for (i = 0; i < priv->params.num_channels; i++) { rq_stats = &priv->channel[i]->rq.stats; + s->rx_packets += rq_stats->packets; + s->rx_bytes += rq_stats->bytes; s->lro_packets += rq_stats->lro_packets; s->lro_bytes += rq_stats->lro_bytes; s->rx_csum_none += rq_stats->csum_none; @@ -164,6 +170,8 @@ void mlx5e_update_stats(struct mlx5e_priv *priv) for (j = 0; j < priv->params.num_tc; j++) { sq_stats = &priv->channel[i]->sq[j].stats; + s->tx_packets += sq_stats->packets; + s->tx_bytes += sq_stats->bytes; s->tso_packets += sq_stats->tso_packets; s->tso_bytes += sq_stats->tso_bytes; s->tx_queue_stopped += sq_stats->stopped; @@ -225,23 +233,6 @@ void mlx5e_update_stats(struct mlx5e_priv *priv) s->tx_broadcast_bytes = MLX5_GET_CTR(out, transmitted_eth_broadcast.octets); - s->rx_packets = - s->rx_unicast_packets + - s->rx_multicast_packets + - s->rx_broadcast_packets; - s->rx_bytes = - s->rx_unicast_bytes + - s->rx_multicast_bytes + - s->rx_broadcast_bytes; - s->tx_packets = - s->tx_unicast_packets + - s->tx_multicast_packets + - s->tx_broadcast_packets; - s->tx_bytes = - s->tx_unicast_bytes + - s->tx_multicast_bytes + - s->tx_broadcast_bytes; - /* Update calculated offload counters */ s->tx_csum_offload = s->tx_packets - tx_offload_none; s->rx_csum_good = s->rx_packets - 
s->rx_csum_none - -- cgit v0.10.2 From 03a79f31ef5fb7a2298258432f3dc0f558f24d48 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 2 Mar 2016 17:47:44 +0200 Subject: net/mlx4_core: Fix lockdep warning in handling of mac/vlan tables In the mac and vlan register/unregister/replace functions, the driver locks the mac table mutex (or vlan table mutex) on both ports. We move to use mutex_lock_nested() to prevent warnings, such as the one below. [ 101.828445] ============================================= [ 101.834820] [ INFO: possible recursive locking detected ] [ 101.841199] 4.5.0-rc2+ #49 Not tainted [ 101.850251] --------------------------------------------- [ 101.856621] modprobe/3054 is trying to acquire lock: [ 101.862514] (&table->mutex#2){+.+.+.}, at: [] __mlx4_register_mac+0x87e/0xa90 [mlx4_core] [ 101.874598] [ 101.874598] but task is already holding lock: [ 101.881703] (&table->mutex#2){+.+.+.}, at: [] __mlx4_register_mac+0x860/0xa90 [mlx4_core] [ 101.893776] [ 101.893776] other info that might help us debug this: [ 101.901658] Possible unsafe locking scenario: [ 101.901658] [ 101.908859] CPU0 [ 101.911923] ---- [ 101.914985] lock(&table->mutex#2); [ 101.919595] lock(&table->mutex#2); [ 101.924199] [ 101.924199] * DEADLOCK * [ 101.924199] [ 101.931643] May be due to missing lock nesting notation Fixes: 5f61385d2ebc ('net/mlx4_core: Keep VLAN/MAC tables mirrored in multifunc HA mode') Signed-off-by: Jack Morgenstein Suggested-by: Doron Tsur Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c index 787b7bb..211c650 100644 --- a/drivers/net/ethernet/mellanox/mlx4/port.c +++ b/drivers/net/ethernet/mellanox/mlx4/port.c @@ -193,10 +193,10 @@ int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac) if (need_mf_bond) { if (port == 1) { mutex_lock(&table->mutex); - mutex_lock(&dup_table->mutex); + mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING); } else { mutex_lock(&dup_table->mutex); - mutex_lock(&table->mutex); + mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING); } } else { mutex_lock(&table->mutex); @@ -389,10 +389,10 @@ void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, u64 mac) if (dup) { if (port == 1) { mutex_lock(&table->mutex); - mutex_lock(&dup_table->mutex); + mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING); } else { mutex_lock(&dup_table->mutex); - mutex_lock(&table->mutex); + mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING); } } else { mutex_lock(&table->mutex); @@ -479,10 +479,10 @@ int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac) if (dup) { if (port == 1) { mutex_lock(&table->mutex); - mutex_lock(&dup_table->mutex); + mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING); } else { mutex_lock(&dup_table->mutex); - mutex_lock(&table->mutex); + mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING); } } else { mutex_lock(&table->mutex); @@ -588,10 +588,10 @@ int __mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, if (need_mf_bond) { if (port == 1) { mutex_lock(&table->mutex); - mutex_lock(&dup_table->mutex); + mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING); } else { mutex_lock(&dup_table->mutex); - mutex_lock(&table->mutex); + mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING); } } else { mutex_lock(&table->mutex); @@ -764,10 +764,10 @@ void __mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan) if (dup) { if (port == 1) { mutex_lock(&table->mutex); - mutex_lock(&dup_table->mutex); + 
mutex_lock_nested(&dup_table->mutex, SINGLE_DEPTH_NESTING); } else { mutex_lock(&dup_table->mutex); - mutex_lock(&table->mutex); + mutex_lock_nested(&table->mutex, SINGLE_DEPTH_NESTING); } } else { mutex_lock(&table->mutex); -- cgit v0.10.2 From 00ada91039fc03100cb4fab41d04f9b30f27340f Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Wed, 2 Mar 2016 17:47:45 +0200 Subject: net/mlx4_core: Check the correct limitation on VFs for HA mode The limit of 63 is only for virtual functions while the actual enforcement was for VFs plus physical functions, fix that. Fixes: e57968a10bc1 ('net/mlx4_core: Support the HA mode for SRIOV VFs too') Signed-off-by: Moni Shoua Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 2cc3c62..f8674ae 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -1256,6 +1256,7 @@ err_set_port: static int mlx4_mf_bond(struct mlx4_dev *dev) { int err = 0; + int nvfs; struct mlx4_slaves_pport slaves_port1; struct mlx4_slaves_pport slaves_port2; DECLARE_BITMAP(slaves_port_1_2, MLX4_MFUNC_MAX); @@ -1272,11 +1273,18 @@ static int mlx4_mf_bond(struct mlx4_dev *dev) return -EINVAL; } + /* number of virtual functions is number of total functions minus one + * physical function for each port. + */ + nvfs = bitmap_weight(slaves_port1.slaves, dev->persist->num_vfs + 1) + + bitmap_weight(slaves_port2.slaves, dev->persist->num_vfs + 1) - 2; + /* limit on maximum allowed VFs */ - if ((bitmap_weight(slaves_port1.slaves, dev->persist->num_vfs + 1) + - bitmap_weight(slaves_port2.slaves, dev->persist->num_vfs + 1)) > - MAX_MF_BOND_ALLOWED_SLAVES) + if (nvfs > MAX_MF_BOND_ALLOWED_SLAVES) { + mlx4_warn(dev, "HA mode is not supported for %d VFs (max %d are allowed)\n", + nvfs, MAX_MF_BOND_ALLOWED_SLAVES); return -EINVAL; + } if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) { mlx4_warn(dev, "HA mode unsupported for NON DMFS steering\n"); -- cgit v0.10.2 From 6e5224224faa50ec4c8949dcefadf895e565f0d1 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 2 Mar 2016 17:47:46 +0200 Subject: net/mlx4_core: Allow resetting VF admin mac to zero The VF administrative mac addresses (stored in the PF driver) are initialized to zero when the PF driver starts up. These addresses may be modified in the PF driver through ndo calls initiated by iproute2 or libvirt. While we allow the PF/host to change the VF admin mac address from zero to a valid unicast mac, we do not allow restoring the VF admin mac to zero. We currently only allow changing this mac to a different unicast mac. This leads to problems when libvirt scripts are used to deal with VF mac addresses, and libvirt attempts to revoke the mac so this host will not use it anymore. Fix this by allowing resetting a VF administrative MAC back to zero. Fixes: 8f7ba3ca12f6 ('net/mlx4: Add set VF mac address support') Signed-off-by: Jack Morgenstein Reported-by: Moshe Levi Signed-off-by: David S. 
Miller diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index f191a16..21e2c09 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2245,7 +2245,7 @@ static int mlx4_en_set_vf_mac(struct net_device *dev, int queue, u8 *mac) struct mlx4_en_dev *mdev = en_priv->mdev; u64 mac_u64 = mlx4_mac_to_u64(mac); - if (!is_valid_ether_addr(mac)) + if (is_multicast_ether_addr(mac)) return -EINVAL; return mlx4_set_vf_mac(mdev->dev, en_priv->port, queue, mac_u64); -- cgit v0.10.2 From 8afb6c474b0dd8f9cf7e122a19c71924566fdb97 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Mon, 29 Feb 2016 16:22:30 +0100 Subject: dwc_eth_qos: fix race condition in dwceqos_start_xmit The xmit handler and the tx_reclaim tasklet had a race on the tx_free variable which could lead to a tx timeout if tx_free was updated after the tx complete interrupt. Signed-off-by: Rabin Vincent Signed-off-by: Lars Persson Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/synopsys/dwc_eth_qos.c b/drivers/net/ethernet/synopsys/dwc_eth_qos.c index fc8bbff..926db2d 100644 --- a/drivers/net/ethernet/synopsys/dwc_eth_qos.c +++ b/drivers/net/ethernet/synopsys/dwc_eth_qos.c @@ -2178,12 +2178,10 @@ static int dwceqos_start_xmit(struct sk_buff *skb, struct net_device *ndev) ((trans.initial_descriptor + trans.nr_descriptors) % DWCEQOS_TX_DCNT)); - dwceqos_tx_finalize(skb, lp, &trans); - - netdev_sent_queue(ndev, skb->len); - spin_lock_bh(&lp->tx_lock); lp->tx_free -= trans.nr_descriptors; + dwceqos_tx_finalize(skb, lp, &trans); + netdev_sent_queue(ndev, skb->len); spin_unlock_bh(&lp->tx_lock); ndev->trans_start = jiffies; -- cgit v0.10.2 From d4dc35f26e1017e2a632339ecae43706cbc06986 Mon Sep 17 00:00:00 2001 From: Lars Persson Date: Mon, 29 Feb 2016 16:22:31 +0100 Subject: dwc_eth_qos: release descriptors outside netif_tx_lock To prepare for using the CMA, we can not be in atomic context when de-allocating DMA buffers. The tx lock was needed only to protect the hw reset against the xmit handler. Now we briefly grab the tx lock while stopping the queue to make sure no thread is inside or will enter the xmit handler. Signed-off-by: Lars Persson Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/synopsys/dwc_eth_qos.c b/drivers/net/ethernet/synopsys/dwc_eth_qos.c index 926db2d..53d48c0 100644 --- a/drivers/net/ethernet/synopsys/dwc_eth_qos.c +++ b/drivers/net/ethernet/synopsys/dwc_eth_qos.c @@ -1918,15 +1918,17 @@ static int dwceqos_stop(struct net_device *ndev) phy_stop(lp->phy_dev); tasklet_disable(&lp->tx_bdreclaim_tasklet); - netif_stop_queue(ndev); napi_disable(&lp->napi); - dwceqos_drain_dma(lp); + /* Stop all tx before we drain the tx dma. */ + netif_tx_lock_bh(lp->ndev); + netif_stop_queue(ndev); + netif_tx_unlock_bh(lp->ndev); - netif_tx_lock(lp->ndev); + dwceqos_drain_dma(lp); dwceqos_reset_hw(lp); + dwceqos_descriptor_free(lp); - netif_tx_unlock(lp->ndev); return 0; } -- cgit v0.10.2 From e8b0c32eae319e287d0329b4148c9a1a51525053 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Mon, 29 Feb 2016 16:22:32 +0100 Subject: dwc_eth_qos: use GFP_KERNEL in dma_alloc_coherent() Since we are in non-atomic context here we can pass GFP_KERNEL to dma_alloc_coherent(). This enables use of the CMA. Signed-off-by: Rabin Vincent Signed-off-by: Lars Persson Signed-off-by: David S. 
Miller diff --git a/drivers/net/ethernet/synopsys/dwc_eth_qos.c b/drivers/net/ethernet/synopsys/dwc_eth_qos.c index 53d48c0..3ca2d5c 100644 --- a/drivers/net/ethernet/synopsys/dwc_eth_qos.c +++ b/drivers/net/ethernet/synopsys/dwc_eth_qos.c @@ -1113,7 +1113,7 @@ static int dwceqos_descriptor_init(struct net_local *lp) /* Allocate DMA descriptors */ size = DWCEQOS_RX_DCNT * sizeof(struct dwceqos_dma_desc); lp->rx_descs = dma_alloc_coherent(lp->ndev->dev.parent, size, - &lp->rx_descs_addr, 0); + &lp->rx_descs_addr, GFP_KERNEL); if (!lp->rx_descs) goto err_out; lp->rx_descs_tail_addr = lp->rx_descs_addr + @@ -1121,7 +1121,7 @@ static int dwceqos_descriptor_init(struct net_local *lp) size = DWCEQOS_TX_DCNT * sizeof(struct dwceqos_dma_desc); lp->tx_descs = dma_alloc_coherent(lp->ndev->dev.parent, size, - &lp->tx_descs_addr, 0); + &lp->tx_descs_addr, GFP_KERNEL); if (!lp->tx_descs) goto err_out; lp->tx_descs_tail_addr = lp->tx_descs_addr + -- cgit v0.10.2 From 016a91c64de6621e5917b2ddfe2afc0abb0a884c Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Mon, 29 Feb 2016 16:22:33 +0100 Subject: dwc_eth_qos: use DWCEQOS_MSG_DEFAULT Since debug is hardcoded to 3, the defaults in the DWCEQOS_MSG_DEFAULT macro are never used, which does not seem to be the intended behaviour here. Set debug to -1 like other drivers so that DWCEQOS_MSG_DEFAULT is actually used by default. Signed-off-by: Rabin Vincent Signed-off-by: Lars Persson Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/synopsys/dwc_eth_qos.c b/drivers/net/ethernet/synopsys/dwc_eth_qos.c index 3ca2d5c..6897c1d 100644 --- a/drivers/net/ethernet/synopsys/dwc_eth_qos.c +++ b/drivers/net/ethernet/synopsys/dwc_eth_qos.c @@ -426,7 +426,7 @@ #define DWC_MMC_RXOCTETCOUNT_GB 0x0784 #define DWC_MMC_RXPACKETCOUNT_GB 0x0780 -static int debug = 3; +static int debug = -1; module_param(debug, int, 0); MODULE_PARM_DESC(debug, "DWC_eth_qos debug level (0=none,...,16=all)"); -- cgit v0.10.2 From cd5e41234729966153595fd3c2a7912cc4f95bc2 Mon Sep 17 00:00:00 2001 From: Lars Persson Date: Mon, 29 Feb 2016 16:22:34 +0100 Subject: dwc_eth_qos: do phy_start before resetting hardware This reverts the changed init order from commit 3647bc35bd42 ("dwc_eth_qos: Reset hardware before PHY start") and makes another fix for the race. It turned out that the reset state machine of the dwceqos hardware requires PHY clocks to be present in order to complete the reset cycle. To plug the race with the phy state machine we defer link speed setting until the hardware init has finished. Signed-off-by: Lars Persson Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/synopsys/dwc_eth_qos.c b/drivers/net/ethernet/synopsys/dwc_eth_qos.c index 6897c1d..af11ed1 100644 --- a/drivers/net/ethernet/synopsys/dwc_eth_qos.c +++ b/drivers/net/ethernet/synopsys/dwc_eth_qos.c @@ -650,6 +650,11 @@ struct net_local { u32 mmc_tx_counters_mask; struct dwceqos_flowcontrol flowcontrol; + + /* Tracks the intermediate state of phy started but hardware + * init not finished yet. 
+ */ + bool phy_defer; }; static void dwceqos_read_mmc_counters(struct net_local *lp, u32 rx_mask, @@ -901,6 +906,9 @@ static void dwceqos_adjust_link(struct net_device *ndev) struct phy_device *phydev = lp->phy_dev; int status_change = 0; + if (lp->phy_defer) + return; + if (phydev->link) { if ((lp->speed != phydev->speed) || (lp->duplex != phydev->duplex)) { @@ -1635,6 +1643,12 @@ static void dwceqos_init_hw(struct net_local *lp) regval = dwceqos_read(lp, REG_DWCEQOS_MAC_CFG); dwceqos_write(lp, REG_DWCEQOS_MAC_CFG, regval | DWCEQOS_MAC_CFG_TE | DWCEQOS_MAC_CFG_RE); + + lp->phy_defer = false; + mutex_lock(&lp->phy_dev->lock); + phy_read_status(lp->phy_dev); + dwceqos_adjust_link(lp->ndev); + mutex_unlock(&lp->phy_dev->lock); } static void dwceqos_tx_reclaim(unsigned long data) @@ -1880,9 +1894,13 @@ static int dwceqos_open(struct net_device *ndev) } netdev_reset_queue(ndev); + /* The dwceqos reset state machine requires all phy clocks to complete, + * hence the unusual init order with phy_start first. + */ + lp->phy_defer = true; + phy_start(lp->phy_dev); dwceqos_init_hw(lp); napi_enable(&lp->napi); - phy_start(lp->phy_dev); netif_start_queue(ndev); tasklet_enable(&lp->tx_bdreclaim_tasklet); @@ -1915,8 +1933,6 @@ static int dwceqos_stop(struct net_device *ndev) { struct net_local *lp = netdev_priv(ndev); - phy_stop(lp->phy_dev); - tasklet_disable(&lp->tx_bdreclaim_tasklet); napi_disable(&lp->napi); @@ -1927,6 +1943,7 @@ static int dwceqos_stop(struct net_device *ndev) dwceqos_drain_dma(lp); dwceqos_reset_hw(lp); + phy_stop(lp->phy_dev); dwceqos_descriptor_free(lp); -- cgit v0.10.2 From 6bd58f09e1d8cc6c50a824c00bf0d617919986a1 Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:19 -0800 Subject: time: Add cycles to nanoseconds translation The timekeeping code does not currently provide a way to translate externally provided clocksource cycles to system time. The cycle count is always provided by the result clocksource read() method internal to the timekeeping code. The added function timekeeping_cycles_to_ns() calculated a nanosecond value from a cycle count that can be added to tk_read_base.base value yielding the current system time. This allows clocksource cycle values external to the timekeeping code to provide a cycle count that can be transformed to system time. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. 
Hall Signed-off-by: John Stultz diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 34b4ced..4243d28 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -298,17 +298,34 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; static inline u32 arch_gettimeoffset(void) { return 0; } #endif +static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr, + cycle_t delta) +{ + s64 nsec; + + nsec = delta * tkr->mult + tkr->xtime_nsec; + nsec >>= tkr->shift; + + /* If arch requires, add in get_arch_timeoffset() */ + return nsec + arch_gettimeoffset(); +} + static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) { cycle_t delta; - s64 nsec; delta = timekeeping_get_delta(tkr); + return timekeeping_delta_to_ns(tkr, delta); +} - nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift; +static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, + cycle_t cycles) +{ + cycle_t delta; - /* If arch requires, add in get_arch_timeoffset() */ - return nsec + arch_gettimeoffset(); + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); + return timekeeping_delta_to_ns(tkr, delta); } /** -- cgit v0.10.2 From 9da0f49c8767cc0ef6101cb21156cf4380ed50dd Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:20 -0800 Subject: time: Add timekeeping snapshot code capturing system time and counter In the current timekeeping code there isn't any interface to atomically capture the current relationship between the system counter and system time. ktime_get_snapshot() returns this triple (counter, monotonic raw, realtime) in the system_time_snapshot struct. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. Hall [jstultz: Moved structure definitions around to clean things up, fixed cycles_t/cycle_t confusion.] 
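(Not part of the patch series: a minimal usage sketch of the ktime_get_snapshot() interface described above, assuming a hypothetical my_dev_log_snapshot() consumer. Because the counter, realtime and raw values come from a single seqcount-consistent read, correlating them against a device clock avoids the read-to-read skew that separate ktime_get_*() calls would introduce.)

#include <linux/ktime.h>
#include <linux/printk.h>
#include <linux/timekeeping.h>

static void my_dev_log_snapshot(void)
{
	struct system_time_snapshot snap;
	struct timespec64 real, raw;

	/* Single seqcount-consistent capture of the counter and both clocks */
	ktime_get_snapshot(&snap);

	real = ktime_to_timespec64(snap.real);
	raw = ktime_to_timespec64(snap.raw);

	pr_debug("cycles=%llu real=%lld.%09ld raw=%lld.%09ld\n",
		 (unsigned long long)snap.cycles,
		 (long long)real.tv_sec, real.tv_nsec,
		 (long long)raw.tv_sec, raw.tv_nsec);
}
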
Signed-off-by: John Stultz diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index ec89d84..7817591 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -267,6 +267,24 @@ extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real); /* + * struct system_time_snapshot - simultaneous raw/real time capture with + * counter value + * @cycles: Clocksource counter value to produce the system times + * @real: Realtime system time + * @raw: Monotonic raw system time + */ +struct system_time_snapshot { + cycle_t cycles; + ktime_t real; + ktime_t raw; +}; + +/* + * Simultaneously snapshot realtime and monotonic raw clocks + */ +extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); + +/* * Persistent clock related interfaces */ extern int persistent_clock_is_local; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4243d28..89b4695 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -874,6 +874,36 @@ time64_t __ktime_get_real_seconds(void) return tk->xtime_sec; } +/** + * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter + * @systime_snapshot: pointer to struct receiving the system time snapshot + */ +void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned long seq; + ktime_t base_raw; + ktime_t base_real; + s64 nsec_raw; + s64 nsec_real; + cycle_t now; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + now = tk->tkr_mono.read(tk->tkr_mono.clock); + base_real = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + systime_snapshot->cycles = now; + systime_snapshot->real = ktime_add_ns(base_real, nsec_real); + systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); +} +EXPORT_SYMBOL_GPL(ktime_get_snapshot); #ifdef CONFIG_NTP_PPS -- cgit v0.10.2 From ba26621e63ce6dc481d90ab9f6902e058d4ea39a Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:21 -0800 Subject: time: Remove duplicated code in ktime_get_raw_and_real() The code in ktime_get_snapshot() is a superset of the code in ktime_get_raw_and_real() code. Further, ktime_get_raw_and_real() is called only by the PPS code, pps_get_ts(). Consolidate the pps_get_ts() code into a single function calling ktime_get_snapshot() and eliminate ktime_get_raw_and_real(). A side effect of this is that the raw and real results of pps_get_ts() correspond to exactly the same clock cycle. Previously these values represented separate reads of the system clock. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. 
Hall Signed-off-by: John Stultz diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h index 54bf148..35ac903 100644 --- a/include/linux/pps_kernel.h +++ b/include/linux/pps_kernel.h @@ -111,22 +111,17 @@ static inline void timespec_to_pps_ktime(struct pps_ktime *kt, kt->nsec = ts.tv_nsec; } -#ifdef CONFIG_NTP_PPS - static inline void pps_get_ts(struct pps_event_time *ts) { - ktime_get_raw_and_real_ts64(&ts->ts_raw, &ts->ts_real); -} + struct system_time_snapshot snap; -#else /* CONFIG_NTP_PPS */ - -static inline void pps_get_ts(struct pps_event_time *ts) -{ - ktime_get_real_ts64(&ts->ts_real); + ktime_get_snapshot(&snap); + ts->ts_real = ktime_to_timespec64(snap.real); +#ifdef CONFIG_NTP_PPS + ts->ts_raw = ktime_to_timespec64(snap.raw); +#endif } -#endif /* CONFIG_NTP_PPS */ - /* Subtract known time delay from PPS event time(s) */ static inline void pps_sub_ts(struct pps_event_time *ts, struct timespec64 delta) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 89b4695..af19a49 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -888,6 +888,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) s64 nsec_real; cycle_t now; + WARN_ON_ONCE(timekeeping_suspended); + do { seq = read_seqcount_begin(&tk_core.seq); @@ -905,44 +907,6 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) } EXPORT_SYMBOL_GPL(ktime_get_snapshot); -#ifdef CONFIG_NTP_PPS - -/** - * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format - * @ts_raw: pointer to the timespec to be set to raw monotonic time - * @ts_real: pointer to the timespec to be set to the time of day - * - * This function reads both the time of day and raw monotonic time at the - * same time atomically and stores the resulting timestamps in timespec - * format. - */ -void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real) -{ - struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; - s64 nsecs_raw, nsecs_real; - - WARN_ON_ONCE(timekeeping_suspended); - - do { - seq = read_seqcount_begin(&tk_core.seq); - - *ts_raw = tk->raw_time; - ts_real->tv_sec = tk->xtime_sec; - ts_real->tv_nsec = 0; - - nsecs_raw = timekeeping_get_ns(&tk->tkr_raw); - nsecs_real = timekeeping_get_ns(&tk->tkr_mono); - - } while (read_seqcount_retry(&tk_core.seq, seq)); - - timespec64_add_ns(ts_raw, nsecs_raw); - timespec64_add_ns(ts_real, nsecs_real); -} -EXPORT_SYMBOL(ktime_get_raw_and_real_ts64); - -#endif /* CONFIG_NTP_PPS */ - /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set -- cgit v0.10.2 From 8006c24595cab106bcb9da12d35e32e14ff492df Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:22 -0800 Subject: time: Add driver cross timestamp interface for higher precision time synchronization ACKNOWLEDGMENT: cross timestamp code was developed by Thomas Gleixner . It has changed considerably and any mistakes are mine. The precision with which events on multiple networked systems can be synchronized using, as an example, PTP (IEEE 1588, 802.1AS) is limited by the precision of the cross timestamps between the system clock and the device (timestamp) clock. Precision here is the degree of simultaneity when capturing the cross timestamp. Currently the PTP cross timestamp is captured in software using the PTP device driver ioctl PTP_SYS_OFFSET. Reads of the device clock are interleaved with reads of the realtime clock. 
At best, the precision of this cross timestamp is on the order of several microseconds due to software latencies. Sub-microsecond precision is required for industrial control and some media applications. To achieve this level of precision hardware supported cross timestamping is needed. The function get_device_system_crosstimestamp() allows device drivers to return a cross timestamp with system time properly scaled to nanoseconds. The realtime value is needed to discipline that clock using PTP and the monotonic raw value is used for applications that don't require a "real" time, but need an unadjusted clock time. The get_device_system_crosstimestamp() code calls back into the driver to ensure that the system counter is within the current timekeeping update interval. Modern Intel hardware provides an Always Running Timer (ART) which is exactly related to TSC through a known frequency ratio. The ART is routed to devices on the system and is used to precisely and simultaneously capture the device clock with the ART. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. Hall [jstultz: Reworked to remove extra structures and simplify calling] Signed-off-by: John Stultz diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 7817591..4a2ca65 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -280,6 +280,41 @@ struct system_time_snapshot { }; /* + * struct system_device_crosststamp - system/device cross-timestamp + * (syncronized capture) + * @device: Device time + * @sys_realtime: Realtime simultaneous with device time + * @sys_monoraw: Monotonic raw simultaneous with device time + */ +struct system_device_crosststamp { + ktime_t device; + ktime_t sys_realtime; + ktime_t sys_monoraw; +}; + +/* + * struct system_counterval_t - system counter value with the pointer to the + * corresponding clocksource + * @cycles: System counter value + * @cs: Clocksource corresponding to system counter value. 
Used by + * timekeeping code to verify comparibility of two cycle values + */ +struct system_counterval_t { + cycle_t cycles; + struct clocksource *cs; +}; + +/* + * Get cross timestamp between system clock and device clock + */ +extern int get_device_system_crosststamp( + int (*get_time_fn)(ktime_t *device_time, + struct system_counterval_t *system_counterval, + void *ctx), + void *ctx, + struct system_device_crosststamp *xtstamp); + +/* * Simultaneously snapshot realtime and monotonic raw clocks */ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index af19a49..dba595c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -908,6 +908,62 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) EXPORT_SYMBOL_GPL(ktime_get_snapshot); /** + * get_device_system_crosststamp - Synchronously capture system/device timestamp + * @sync_devicetime: Callback to get simultaneous device time and + * system counter from the device driver + * @xtstamp: Receives simultaneously captured system and device time + * + * Reads a timestamp from a device and correlates it to system time + */ +int get_device_system_crosststamp(int (*get_time_fn) + (ktime_t *device_time, + struct system_counterval_t *sys_counterval, + void *ctx), + void *ctx, + struct system_device_crosststamp *xtstamp) +{ + struct system_counterval_t system_counterval; + struct timekeeper *tk = &tk_core.timekeeper; + ktime_t base_real, base_raw; + s64 nsec_real, nsec_raw; + unsigned long seq; + int ret; + + do { + seq = read_seqcount_begin(&tk_core.seq); + /* + * Try to synchronously capture device time and a system + * counter value calling back into the device driver + */ + ret = get_time_fn(&xtstamp->device, &system_counterval, ctx); + if (ret) + return ret; + + /* + * Verify that the clocksource associated with the captured + * system counter value is the same as the currently installed + * timekeeper clocksource + */ + if (tk->tkr_mono.clock != system_counterval.cs) + return -ENODEV; + + base_real = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, + system_counterval.cycles); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, + system_counterval.cycles); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); + xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); + return 0; +} +EXPORT_SYMBOL_GPL(get_device_system_crosststamp); + +/** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set * -- cgit v0.10.2 From 2c756feb18d9ec258dbb3a3d11c47e28820690d7 Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:23 -0800 Subject: time: Add history to cross timestamp interface supporting slower devices Another representative use case of time sync and the correlated clocksource (in addition to PTP noted above) is PTP synchronized audio. In a streaming application, as an example, samples will be sent and/or received by multiple devices with a presentation time that is in terms of the PTP master clock. Synchronizing the audio output on these devices requires correlating the audio clock with the PTP master clock. The more precise this correlation is, the better the audio quality (i.e. out of sync audio sounds bad). 
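As a concrete illustration of the interface added above, a sketch (not from these patches) of how a driver might use it; the device structure and the my_dev_read_clock_and_counter() helper are hypothetical, and only the callback signature plus the system_counterval_t and system_device_crosststamp types come from the patch. Note that the next patch below extends get_device_system_crosststamp() with an additional history argument for slow devices.

#include <linux/clocksource.h>
#include <linux/timekeeping.h>

/* Hypothetical driver context and hardware accessor. */
struct my_dev;
extern int my_dev_read_clock_and_counter(struct my_dev *dev,
					 ktime_t *device_time,
					 cycle_t *sys_cycles,
					 struct clocksource **cs);

/*
 * Callback invoked (possibly more than once, under the timekeeping seqcount
 * retry loop) by get_device_system_crosststamp(); it fills in the device time
 * and the simultaneously captured system counter value plus the clocksource
 * that counter value belongs to.
 */
static int my_get_time_fn(ktime_t *device_time,
			  struct system_counterval_t *system_counterval,
			  void *ctx)
{
	struct my_dev *dev = ctx;

	return my_dev_read_clock_and_counter(dev, device_time,
					     &system_counterval->cycles,
					     &system_counterval->cs);
}

static int my_dev_crosststamp(struct my_dev *dev,
			      struct system_device_crosststamp *xtstamp)
{
	/* Correlate the device clock with realtime and monotonic raw time. */
	return get_device_system_crosststamp(my_get_time_fn, dev, xtstamp);
}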
From an application standpoint, to correlate the PTP master clock with the audio device clock, the system clock is used as an intermediate timebase. The transforms such an application would perform are: System Clock <-> Audio clock System Clock <-> Network Device Clock [<-> PTP Master Clock] Modern Intel platforms can perform a more accurate cross timestamp in hardware (ART, audio device clock). The audio driver requires ART->system time transforms -- the same as required for the network driver. These platforms offload audio processing (including cross-timestamps) to a DSP which, to ensure uninterrupted audio processing, communicates with and responds to the host only once every millisecond. As a result it takes up to a millisecond for the DSP to receive a request; the request is then processed by the DSP, the audio output hardware is polled for completion, the result is copied into shared memory, and the host is notified. All of these operations occur on a millisecond cadence. This transaction requires about 2 ms, but under heavier workloads it may take up to 4 ms. Adding a history allows these slow devices the option of providing an ART value outside of the current interval. In this case, the callback provided is an accessor function for the previously obtained counter value. If get_device_system_crosststamp() receives a counter value prior to cycle_last, it consults the history provided as an argument in history_ref and interpolates the realtime and monotonic raw system time using the provided counter value. If there are any clock discontinuities, e.g. from calling settimeofday(), the monotonic raw time is interpolated in the usual way, but the realtime clock time is adjusted by scaling the monotonic raw adjustment. When an accessor function is used, a history argument *must* be provided. The history is initialized using ktime_get_snapshot(), which must be called before the counter values are read. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S.
Hall [jstultz: Fixed up cycles_t/cycle_t type confusion] Signed-off-by: John Stultz diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 2524722..e880054 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -50,6 +50,7 @@ struct tk_read_base { * @offs_tai: Offset clock monotonic -> clock tai * @tai_offset: The current UTC to TAI offset in seconds * @clock_was_set_seq: The sequence number of clock was set events + * @cs_was_changed_seq: The sequence number of clocksource change events * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second * @raw_time: Monotonic raw base time in timespec64 format * @cycle_interval: Number of clock cycles in one NTP interval @@ -91,6 +92,7 @@ struct timekeeper { ktime_t offs_tai; s32 tai_offset; unsigned int clock_was_set_seq; + u8 cs_was_changed_seq; ktime_t next_leap_ktime; struct timespec64 raw_time; diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 4a2ca65..96f37be 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -272,11 +272,15 @@ extern void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, * @cycles: Clocksource counter value to produce the system times * @real: Realtime system time * @raw: Monotonic raw system time + * @clock_was_set_seq: The sequence number of clock was set events + * @cs_was_changed_seq: The sequence number of clocksource change events */ struct system_time_snapshot { cycle_t cycles; ktime_t real; ktime_t raw; + unsigned int clock_was_set_seq; + u8 cs_was_changed_seq; }; /* @@ -312,6 +316,7 @@ extern int get_device_system_crosststamp( struct system_counterval_t *system_counterval, void *ctx), void *ctx, + struct system_time_snapshot *history, struct system_device_crosststamp *xtstamp); /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index dba595c..931b0b1 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -233,6 +233,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) u64 tmp, ntpinterval; struct clocksource *old_clock; + ++tk->cs_was_changed_seq; old_clock = tk->tkr_mono.clock; tk->tkr_mono.clock = clock; tk->tkr_mono.read = clock->read; @@ -894,6 +895,8 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) seq = read_seqcount_begin(&tk_core.seq); now = tk->tkr_mono.read(tk->tkr_mono.clock); + systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; + systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); base_raw = tk->tkr_raw.base; @@ -907,10 +910,123 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) } EXPORT_SYMBOL_GPL(ktime_get_snapshot); +/* Scale base by mult/div checking for overflow */ +static int scale64_check_overflow(u64 mult, u64 div, u64 *base) +{ + u64 tmp, rem; + + tmp = div64_u64_rem(*base, div, &rem); + + if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) || + ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) + return -EOVERFLOW; + tmp *= mult; + rem *= mult; + + do_div(rem, div); + *base = tmp + rem; + return 0; +} + +/** + * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval + * @history: Snapshot representing start of history + * @partial_history_cycles: Cycle offset into history (fractional part) + * @total_history_cycles: Total history length in cycles + * @discontinuity: True indicates clock was set on history period + 
* @ts: Cross timestamp that should be adjusted using + * partial/total ratio + * + * Helper function used by get_device_system_crosststamp() to correct the + * crosstimestamp corresponding to the start of the current interval to the + * system counter value (timestamp point) provided by the driver. The + * total_history_* quantities are the total history starting at the provided + * reference point and ending at the start of the current interval. The cycle + * count between the driver timestamp point and the start of the current + * interval is partial_history_cycles. + */ +static int adjust_historical_crosststamp(struct system_time_snapshot *history, + cycle_t partial_history_cycles, + cycle_t total_history_cycles, + bool discontinuity, + struct system_device_crosststamp *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + u64 corr_raw, corr_real; + bool interp_forward; + int ret; + + if (total_history_cycles == 0 || partial_history_cycles == 0) + return 0; + + /* Interpolate shortest distance from beginning or end of history */ + interp_forward = partial_history_cycles > total_history_cycles/2 ? + true : false; + partial_history_cycles = interp_forward ? + total_history_cycles - partial_history_cycles : + partial_history_cycles; + + /* + * Scale the monotonic raw time delta by: + * partial_history_cycles / total_history_cycles + */ + corr_raw = (u64)ktime_to_ns( + ktime_sub(ts->sys_monoraw, history->raw)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_raw); + if (ret) + return ret; + + /* + * If there is a discontinuity in the history, scale monotonic raw + * correction by: + * mult(real)/mult(raw) yielding the realtime correction + * Otherwise, calculate the realtime correction similar to monotonic + * raw calculation + */ + if (discontinuity) { + corr_real = mul_u64_u32_div + (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult); + } else { + corr_real = (u64)ktime_to_ns( + ktime_sub(ts->sys_realtime, history->real)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_real); + if (ret) + return ret; + } + + /* Fixup monotonic raw and real time time values */ + if (interp_forward) { + ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw); + ts->sys_realtime = ktime_add_ns(history->real, corr_real); + } else { + ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw); + ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real); + } + + return 0; +} + +/* + * cycle_between - true if test occurs chronologically between before and after + */ +static bool cycle_between(cycle_t before, cycle_t test, cycle_t after) +{ + if (test > before && test < after) + return true; + if (test < before && before > after) + return true; + return false; +} + /** * get_device_system_crosststamp - Synchronously capture system/device timestamp - * @sync_devicetime: Callback to get simultaneous device time and + * @get_time_fn: Callback to get simultaneous device time and * system counter from the device driver + * @ctx: Context passed to get_time_fn() + * @history_begin: Historical reference point used to interpolate system + * time when counter provided by the driver is before the current interval * @xtstamp: Receives simultaneously captured system and device time * * Reads a timestamp from a device and correlates it to system time @@ -920,13 +1036,18 @@ int get_device_system_crosststamp(int (*get_time_fn) struct system_counterval_t *sys_counterval, void *ctx), void *ctx, + struct system_time_snapshot *history_begin, struct 
system_device_crosststamp *xtstamp) { struct system_counterval_t system_counterval; struct timekeeper *tk = &tk_core.timekeeper; + cycle_t cycles, now, interval_start; + unsigned int clock_was_set_seq; ktime_t base_real, base_raw; s64 nsec_real, nsec_raw; + u8 cs_was_changed_seq; unsigned long seq; + bool do_interp; int ret; do { @@ -946,6 +1067,22 @@ int get_device_system_crosststamp(int (*get_time_fn) */ if (tk->tkr_mono.clock != system_counterval.cs) return -ENODEV; + cycles = system_counterval.cycles; + + /* + * Check whether the system counter value provided by the + * device driver is on the current timekeeping interval. + */ + now = tk->tkr_mono.read(tk->tkr_mono.clock); + interval_start = tk->tkr_mono.cycle_last; + if (!cycle_between(interval_start, cycles, now)) { + clock_was_set_seq = tk->clock_was_set_seq; + cs_was_changed_seq = tk->cs_was_changed_seq; + cycles = interval_start; + do_interp = true; + } else { + do_interp = false; + } base_real = ktime_add(tk->tkr_mono.base, tk_core.timekeeper.offs_real); @@ -959,6 +1096,38 @@ int get_device_system_crosststamp(int (*get_time_fn) xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); + + /* + * Interpolate if necessary, adjusting back from the start of the + * current interval + */ + if (do_interp) { + cycle_t partial_history_cycles, total_history_cycles; + bool discontinuity; + + /* + * Check that the counter value occurs after the provided + * history reference and that the history doesn't cross a + * clocksource change + */ + if (!history_begin || + !cycle_between(history_begin->cycles, + system_counterval.cycles, cycles) || + history_begin->cs_was_changed_seq != cs_was_changed_seq) + return -EINVAL; + partial_history_cycles = cycles - system_counterval.cycles; + total_history_cycles = cycles - history_begin->cycles; + discontinuity = + history_begin->clock_was_set_seq != clock_was_set_seq; + + ret = adjust_historical_crosststamp(history_begin, + partial_history_cycles, + total_history_cycles, + discontinuity, xtstamp); + if (ret) + return ret; + } + return 0; } EXPORT_SYMBOL_GPL(get_device_system_crosststamp); -- cgit v0.10.2 From 92f9e179a702a6adbc11e2fedc76ecd6ffc9e3f7 Mon Sep 17 00:00:00 2001 From: Todd E Brandt Date: Wed, 2 Mar 2016 16:05:29 -0800 Subject: PM / sleep / x86: Fix crash on graph trace through x86 suspend Pause/unpause graph tracing around do_suspend_lowlevel as it has inconsistent call/return info after it jumps to the wakeup vector. The graph trace buffer will otherwise become misaligned and may eventually crash and hang on suspend. To reproduce the issue and test the fix: Run a function_graph trace over suspend/resume and set the graph function to suspend_devices_and_enter. This consistently hangs the system without this fix. Signed-off-by: Todd Brandt Cc: All applicable Signed-off-by: Rafael J. Wysocki diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index d1daead..adb3eaf 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -16,6 +16,7 @@ #include #include +#include #include "../../realmode/rm/wakeup.h" #include "sleep.h" @@ -107,7 +108,13 @@ int x86_acpi_suspend_lowlevel(void) saved_magic = 0x123456789abcdef0L; #endif /* CONFIG_64BIT */ + /* + * Pause/unpause graph tracing around do_suspend_lowlevel as it has + * inconsistent call/return info after it jumps to the wakeup vector. 
+ */ + pause_graph_tracing(); do_suspend_lowlevel(); + unpause_graph_tracing(); return 0; } -- cgit v0.10.2 From 6f6e151692625e29810f2fe036b4f410540567c8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 3 Mar 2016 09:12:27 +0100 Subject: perf/x86/uncore: Fix build on UP-IOAPIC configs Commit: cf6d445f6897 ("perf/x86/uncore: Track packages, not per CPU data") reorganized the uncore code to track packages, and introduced a dependency on MAX_APIC_ID. This constant is not available on UP-IOAPIC builds: arch/x86/events/intel/uncore.c:1350:44: error: 'MAX_LOCAL_APIC' undeclared here (not in a function) Include asm/apicdef.h explicitly to pick it up. Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra (Intel) Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 01c6793..79766b9 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -1,6 +1,8 @@ #include #include #include +#include + #include #include "../perf_event.h" -- cgit v0.10.2 From 58402b6e98f9059420e64b296db6c4cb6f6c1117 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 29 Feb 2016 09:02:47 +0100 Subject: [media] media.h: use hex values for range offsets, move connectors base up. Make the base offset hexadecimal to simplify debugging since the base addresses are hex too. The offsets for connectors is also changed to start after the 'reserved' range 0x10000-0x2ffff. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h index f032852..13e19a1 100644 --- a/include/uapi/linux/media.h +++ b/include/uapi/linux/media.h @@ -66,25 +66,25 @@ struct media_device_info { /* * DVB entities */ -#define MEDIA_ENT_F_DTV_DEMOD (MEDIA_ENT_F_BASE + 1) -#define MEDIA_ENT_F_TS_DEMUX (MEDIA_ENT_F_BASE + 2) -#define MEDIA_ENT_F_DTV_CA (MEDIA_ENT_F_BASE + 3) -#define MEDIA_ENT_F_DTV_NET_DECAP (MEDIA_ENT_F_BASE + 4) +#define MEDIA_ENT_F_DTV_DEMOD (MEDIA_ENT_F_BASE + 0x00001) +#define MEDIA_ENT_F_TS_DEMUX (MEDIA_ENT_F_BASE + 0x00002) +#define MEDIA_ENT_F_DTV_CA (MEDIA_ENT_F_BASE + 0x00003) +#define MEDIA_ENT_F_DTV_NET_DECAP (MEDIA_ENT_F_BASE + 0x00004) /* * I/O entities */ -#define MEDIA_ENT_F_IO_DTV (MEDIA_ENT_F_BASE + 1001) -#define MEDIA_ENT_F_IO_VBI (MEDIA_ENT_F_BASE + 1002) -#define MEDIA_ENT_F_IO_SWRADIO (MEDIA_ENT_F_BASE + 1003) +#define MEDIA_ENT_F_IO_DTV (MEDIA_ENT_F_BASE + 0x01001) +#define MEDIA_ENT_F_IO_VBI (MEDIA_ENT_F_BASE + 0x01002) +#define MEDIA_ENT_F_IO_SWRADIO (MEDIA_ENT_F_BASE + 0x01003) /* * Connectors */ /* It is a responsibility of the entity drivers to add connectors and links */ -#define MEDIA_ENT_F_CONN_RF (MEDIA_ENT_F_BASE + 10001) -#define MEDIA_ENT_F_CONN_SVIDEO (MEDIA_ENT_F_BASE + 10002) -#define MEDIA_ENT_F_CONN_COMPOSITE (MEDIA_ENT_F_BASE + 10003) +#define MEDIA_ENT_F_CONN_RF (MEDIA_ENT_F_BASE + 0x30001) +#define MEDIA_ENT_F_CONN_SVIDEO (MEDIA_ENT_F_BASE + 0x30002) +#define MEDIA_ENT_F_CONN_COMPOSITE (MEDIA_ENT_F_BASE + 0x30003) /* * Don't touch on those. 
The ranges MEDIA_ENT_F_OLD_BASE and -- cgit v0.10.2 From 02322ac9dee9aff8d8862e8d6660ebe102f492ea Mon Sep 17 00:00:00 2001 From: Simon South Date: Wed, 2 Mar 2016 23:10:44 -0500 Subject: ALSA: hda - Fix mic issues on Acer Aspire E1-472 This patch applies the microphone-related fix created for the Acer Aspire E1-572 to the E1-472 as well, as it uses the same Realtek ALC282 CODEC and demonstrates the same issues. This patch allows an external headset microphone to be used and limits the gain on the (quite noisy) internal microphone. Signed-off-by: Simon South Cc: Signed-off-by: Takashi Iwai diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 1f357cd..93d2156 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5412,6 +5412,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1025, 0x080d, "Acer Aspire V5-122P", ALC269_FIXUP_ASPIRE_HEADSET_MIC), SND_PCI_QUIRK(0x1025, 0x0740, "Acer AO725", ALC271_FIXUP_HP_GATE_MIC_JACK), SND_PCI_QUIRK(0x1025, 0x0742, "Acer AO756", ALC271_FIXUP_HP_GATE_MIC_JACK), + SND_PCI_QUIRK(0x1025, 0x0762, "Acer Aspire E1-472", ALC271_FIXUP_HP_GATE_MIC_JACK_E1_572), SND_PCI_QUIRK(0x1025, 0x0775, "Acer Aspire E1-572", ALC271_FIXUP_HP_GATE_MIC_JACK_E1_572), SND_PCI_QUIRK(0x1025, 0x079b, "Acer Aspire V5-573G", ALC282_FIXUP_ASPIRE_V5_PINS), SND_PCI_QUIRK(0x1025, 0x106d, "Acer Cloudbook 14", ALC283_FIXUP_CHROME_BOOK), -- cgit v0.10.2 From 71f87b2fc64c2e9b6d53cb817f28711b959d3dfe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Mar 2016 10:52:10 +0100 Subject: cpu/hotplug: Plug death reporting race Paul noticed that the conversion of the death reporting introduced a race where the outgoing cpu might be delayed after waking the control processor, so it might not be able to call rcu_report_dead() before being physically removed, leading to RCU stalls. We can't call complete after rcu_report_dead(), so instead of going back to busy polling, simply issue a function call to do the completion. Fixes: 27d50c7eeb0f "rcu: Make CPU_DYING_IDLE an explicit call" Reported-by: Paul E. McKenney Link: http://lkml.kernel.org/r/20160302201127.GA23440@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra diff --git a/kernel/cpu.c b/kernel/cpu.c index ff8059b..93e9d89 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -755,14 +755,26 @@ static int notify_dead(unsigned int cpu) return 0; } +static void cpuhp_complete_idle_dead(void *arg) +{ + struct cpuhp_cpu_state *st = arg; + + complete(&st->done); +} + void cpuhp_report_idle_dead(void) { struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); BUG_ON(st->state != CPUHP_AP_OFFLINE); - st->state = CPUHP_AP_IDLE_DEAD; - complete(&st->done); rcu_report_dead(smp_processor_id()); + st->state = CPUHP_AP_IDLE_DEAD; + /* + * We cannot call complete after rcu_report_dead() so we delegate it + * to an online cpu.
+ */ + smp_call_function_single(cpumask_first(cpu_online_mask), + cpuhp_complete_idle_dead, st, 0); } #else -- cgit v0.10.2 From 82e88ff1ea948d83125a8aaa7c9809f03ccc500f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Mar 2016 11:11:12 +0100 Subject: hrtimer: Revert CLOCK_MONOTONIC_RAW support Revert commits: a6e707ddbdf1: KVM: arm/arm64: timer: Switch to CLOCK_MONOTONIC_RAW 9006a01829a5: hrtimer: Catch illegal clockids 9c808765e88e: hrtimer: Add support for CLOCK_MONOTONIC_RAW Marc found out, that there are fundamental issues with that patch series because __hrtimer_get_next_event() and hrtimer_forward() need support for CLOCK_MONOTONIC_RAW. Nothing which is easily fixed, so revert the whole lot. Reported-by: Marc Zyngier Link: http://lkml.kernel.org/r/56D6CEF0.8060607@arm.com Signed-off-by: Thomas Gleixner diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index a6d64af..76dd4f0 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -151,7 +151,6 @@ enum hrtimer_base_type { HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, HRTIMER_BASE_TAI, - HRTIMER_BASE_MONOTONIC_RAW, HRTIMER_MAX_CLOCK_BASES, }; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index cb0fe70..435b885 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -90,30 +90,19 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, - { - .index = HRTIMER_BASE_MONOTONIC_RAW, - .clockid = CLOCK_MONOTONIC_RAW, - .get_time = &ktime_get_raw, - }, } }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { - /* Make sure we catch unsupported clockids */ - [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, - [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_MONOTONIC_RAW] = HRTIMER_BASE_MONOTONIC_RAW, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - int base = hrtimer_clock_to_base_table[clock_id]; - BUG_ON(base == HRTIMER_MAX_CLOCK_BASES); - return base; + return hrtimer_clock_to_base_table[clock_id]; } /* @@ -1279,10 +1268,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) if (!(active & 0x01)) continue; - if (unlikely(base->index == HRTIMER_BASE_MONOTONIC_RAW)) - basenow = ktime_get_raw(); - else - basenow = ktime_add(now, base->offset); + basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 97c5815..69bca18 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -48,7 +48,7 @@ static bool timer_is_armed(struct arch_timer_cpu *timer) static void timer_arm(struct arch_timer_cpu *timer, u64 ns) { timer->armed = true; - hrtimer_start(&timer->timer, ktime_add_ns(ktime_get_raw(), ns), + hrtimer_start(&timer->timer, ktime_add_ns(ktime_get(), ns), HRTIMER_MODE_ABS); } @@ -308,7 +308,7 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); - hrtimer_init(&timer->timer, CLOCK_MONOTONIC_RAW, HRTIMER_MODE_ABS); + hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); timer->timer.function = kvm_timer_expire; } -- cgit v0.10.2 From fb822e6076d972691c5dd33431c10f82dda94ae9 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 2 Mar 2016 15:25:17 +0530 Subject: powerpc/hw_breakpoint: Fix 
oops when destroying hw_breakpoint event When destroying a hw_breakpoint event, the kernel oopses as follows: Unable to handle kernel paging request for data at address 0x00000c07 NIP [c0000000000291d0] arch_unregister_hw_breakpoint+0x40/0x60 LR [c00000000020b6b4] release_bp_slot+0x44/0x80 Call chain: hw_breakpoint_event_init() bp->destroy = bp_perf_event_destroy; do_exit() perf_event_exit_task() perf_event_exit_task_context() WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); perf_event_exit_event() free_event() _free_event() bp_perf_event_destroy() // event->destroy(event); release_bp_slot() arch_unregister_hw_breakpoint() perf_event_exit_task_context() sets child_ctx->task to TASK_TOMBSTONE, which is (void *)-1. arch_unregister_hw_breakpoint() tries to fetch the 'thread' attribute of 'task', resulting in the oops. Peterz points out that the code shouldn't be using bp->ctx anyway, but fixing that will require a decent amount of rework. So for now, to fix the oops, check if bp->ctx->task has been set to (void *)-1 before dereferencing it. We don't use TASK_TOMBSTONE, because that would require exporting it and it's supposed to be an internal detail. Fixes: 63b6da39bb38 ("perf: Fix perf_event_exit_task() race") Signed-off-by: Ravi Bangoria Acked-by: Peter Zijlstra (Intel) Signed-off-by: Michael Ellerman diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 05e804c..aec9a1b 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -109,8 +109,9 @@ void arch_unregister_hw_breakpoint(struct perf_event *bp) * If the breakpoint is unregistered between a hw_breakpoint_handler() * and the single_step_dabr_instruction(), then cleanup the breakpoint * restoration variables to prevent dangling pointers. + * FIXME, this should not be using bp->ctx at all! Sayeth peterz. */ - if (bp->ctx && bp->ctx->task) + if (bp->ctx && bp->ctx->task && bp->ctx->task != ((void *)-1L)) bp->ctx->task->thread.last_hit_ubp = NULL; } -- cgit v0.10.2 From f9a5978ac4ede901fa73d7c28ae1c5d89bc2a46a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 3 Mar 2016 10:53:48 +0100 Subject: perf tools: Fix locale handling in pmu parsing Ingo reported a regression in the display format of big numbers, which are missing their separators (in the default perf stat output). triton:~/tip> perf stat -a sleep 1 ... 127008602 cycles # 0.011 GHz 279538533 stalled-cycles-frontend # 220.09% frontend cycles idle 119213269 instructions # 0.94 insn per cycle This is caused by the recent change: perf stat: Check existence of frontend/backed stalled cycles which added a call to pmu_have_event, which subsequently calls perf_pmu__parse_scale, which has a bug in locale handling. The lc string returned from setlocale, which we use to store the old locale value, may be allocated in static storage. Get a dynamic copy to make it survive another setlocale call. $ perf stat ls ... 2,360,602 cycles # 3.080 GHz 2,703,090 instructions # 1.15 insn per cycle 546,031 branches # 712.511 M/sec Committer note: Since the patch introducing the regression didn't make it to perf/core, move it to just before where the regression was introduced, so that we don't break bisection for this feature.
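For reference, a minimal userspace sketch (not the perf code itself) of the save/copy/restore pattern the fix relies on: setlocale() may return a pointer into static storage that the next setlocale() call overwrites, so the saved locale string has to be duplicated before switching to the "C" locale.

#include <locale.h>
#include <stdlib.h>
#include <string.h>

/* Parse a C-locale number regardless of the caller's LC_NUMERIC setting. */
static int parse_c_locale_double(const char *str, double *val)
{
	char *saved = setlocale(LC_NUMERIC, NULL);	/* query current locale */
	char *copy;

	if (!saved)
		return -1;
	copy = strdup(saved);		/* make it survive the next setlocale() */
	if (!copy)
		return -1;

	setlocale(LC_NUMERIC, "C");	/* kernel sysfs emits C-locale numbers */
	*val = strtod(str, NULL);
	setlocale(LC_NUMERIC, copy);	/* restore the caller's locale */

	free(copy);
	return 0;
}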
Reported-by: Ingo Molnar Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20160303095348.GA24511@krava.redhat.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index ce61f79..d8cd038 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -124,6 +124,17 @@ static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char * lc = setlocale(LC_NUMERIC, NULL); /* + * The lc string may be allocated in static storage, + * so get a dynamic copy to make it survive setlocale + * call below. + */ + lc = strdup(lc); + if (!lc) { + ret = -ENOMEM; + goto error; + } + + /* * force to C locale to ensure kernel * scale string is converted correctly. * kernel uses default C locale. @@ -135,6 +146,8 @@ static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char * /* restore locale */ setlocale(LC_NUMERIC, lc); + free((char *) lc); + ret = 0; error: close(fd); -- cgit v0.10.2 From 9dec4473abe7967c204fe700baf5344ade34e9c8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 26 Feb 2016 16:27:56 -0800 Subject: perf stat: Check existence of frontend/backed stalled cycles Only put the frontend/backend stalled cycles into the default perf stat events when the CPU actually supports them. This avoids empty columns with --metric-only on newer Intel CPUs. Committer note: Before: $ perf stat ls Performance counter stats for 'ls': 1.080893 task-clock (msec) # 0.619 CPUs utilized 0 context-switches # 0.000 K/sec 0 cpu-migrations # 0.000 K/sec 97 page-faults # 0.090 M/sec 3,327,741 cycles # 3.079 GHz stalled-cycles-frontend stalled-cycles-backend 1,609,544 instructions # 0.48 insn per cycle 319,117 branches # 295.235 M/sec 12,246 branch-misses # 3.84% of all branches 0.001746508 seconds time elapsed $ After: $ perf stat ls Performance counter stats for 'ls': 0.693948 task-clock (msec) # 0.662 CPUs utilized 0 context-switches # 0.000 K/sec 0 cpu-migrations # 0.000 K/sec 95 page-faults # 0.137 M/sec 1,792,509 cycles # 2.583 GHz 1,599,047 instructions # 0.89 insn per cycle 316,328 branches # 455.838 M/sec 12,453 branch-misses # 3.94% of all branches 0.001048987 seconds time elapsed $ Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1456532881-26621-2-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 8c0bc0f..24f222d 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1441,7 +1441,7 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st) */ static int add_default_attributes(void) { - struct perf_event_attr default_attrs[] = { + struct perf_event_attr default_attrs0[] = { { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES }, @@ -1449,8 +1449,14 @@ static int add_default_attributes(void) { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, +}; + struct perf_event_attr frontend_attrs[] = { { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, +}; + struct perf_event_attr backend_attrs[] = { { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, +}; + struct perf_event_attr default_attrs1[] = { { .type = 
PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, @@ -1567,7 +1573,19 @@ static int add_default_attributes(void) } if (!evsel_list->nr_entries) { - if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0) + if (perf_evlist__add_default_attrs(evsel_list, default_attrs0) < 0) + return -1; + if (pmu_have_event("cpu", "stalled-cycles-frontend")) { + if (perf_evlist__add_default_attrs(evsel_list, + frontend_attrs) < 0) + return -1; + } + if (pmu_have_event("cpu", "stalled-cycles-backend")) { + if (perf_evlist__add_default_attrs(evsel_list, + backend_attrs) < 0) + return -1; + } + if (perf_evlist__add_default_attrs(evsel_list, default_attrs1) < 0) return -1; } -- cgit v0.10.2 From 6122d57e9f7c6cb0f0aa276fbd3a12e3af826ef2 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 26 Feb 2016 09:31:56 +0000 Subject: perf data: Support converting data from bpf_perf_event_output() bpf_perf_event_output() outputs data through sample->raw_data. This patch adds support to convert those data into CTF. A python script then can be used to process output data from BPF programs. Test result: # cat ./test_bpf_output_2.c /************************ BEGIN **************************/ #include struct bpf_map_def { unsigned int type; unsigned int key_size; unsigned int value_size; unsigned int max_entries; }; #define SEC(NAME) __attribute__((section(NAME), used)) static u64 (*ktime_get_ns)(void) = (void *)BPF_FUNC_ktime_get_ns; static int (*trace_printk)(const char *fmt, int fmt_size, ...) = (void *)BPF_FUNC_trace_printk; static int (*get_smp_processor_id)(void) = (void *)BPF_FUNC_get_smp_processor_id; static int (*perf_event_output)(void *, struct bpf_map_def *, int, void *, unsigned long) = (void *)BPF_FUNC_perf_event_output; struct bpf_map_def SEC("maps") channel = { .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, .key_size = sizeof(int), .value_size = sizeof(u32), .max_entries = __NR_CPUS__, }; static inline int __attribute__((always_inline)) func(void *ctx, int type) { struct { u64 ktime; int type; } __attribute__((packed)) output_data; char error_data[] = "Error: failed to output\n"; int err; output_data.type = type; output_data.ktime = ktime_get_ns(); err = perf_event_output(ctx, &channel, get_smp_processor_id(), &output_data, sizeof(output_data)); if (err) trace_printk(error_data, sizeof(error_data)); return 0; } SEC("func_begin=sys_nanosleep") int func_begin(void *ctx) {return func(ctx, 1);} SEC("func_end=sys_nanosleep%return") int func_end(void *ctx) { return func(ctx, 2);} char _license[] SEC("license") = "GPL"; int _version SEC("version") = LINUX_VERSION_CODE; /************************* END ***************************/ # ./perf record -e bpf-output/no-inherit,name=evt/ \ -e ./test_bpf_output_2.c/map:channel.event=evt/ \ usleep 100000 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.012 MB perf.data (2 samples) ] # ./perf script usleep 14942 92503.198504: evt: ffffffff810e0ba1 sys_nanosleep (/lib/modules/4.3.0.... usleep 14942 92503.298562: evt: ffffffff810585e9 kretprobe_trampoline_holder (/lib.... # ./perf data convert --to-ctf ./out.ctf [ perf data convert: Converted 'perf.data' into CTF data './out.ctf' ] [ perf data convert: Converted and wrote 0.000 MB (2 samples) ] # babeltrace ./out.ctf [01:41:43.198504134] (+?.?????????) 
evt: { cpu_id = 0 }, { perf_ip = 0xFFFFFFFF810E0BA1, perf_tid = 14942, perf_pid = 14942, perf_id = 1044, raw_len = 3, raw_data = [ [0] = 0x32C0C07B, [1] = 0x5421, [2] = 0x1 ] } [01:41:43.298562257] (+0.100058123) evt: { cpu_id = 0 }, { perf_ip = 0xFFFFFFFF810585E9, perf_tid = 14942, perf_pid = 14942, perf_id = 1044, raw_len = 3, raw_data = [ [0] = 0x38B77FAA, [1] = 0x5421, [2] = 0x2 ] } # cat ./test_bpf_output_2.py from babeltrace import TraceCollection tc = TraceCollection() tc.add_trace('./out.ctf', 'ctf') d = {1:[], 2:[]} for event in tc.events: if not event.name.startswith('evt'): continue raw_data = event['raw_data'] (time, type) = ((raw_data[0] + (raw_data[1] << 32)), raw_data[2]) d[type].append(time) print(list(map(lambda i: d[2][i] - d[1][i], range(len(d[1]))))); # python3 ./test_bpf_output_2.py [100056879] Committer note: Make sure you have python3-devel installed, not python-devel, which may be for python2, which will lead to some "PyInstance_Type" errors. Also make sure that you use the right libbabeltrace, because it is shipped in Fedora, for instance, but an older version. To build libbabeltrace's python binding one also needs to use: ./configure --enable-python-bindings And then set PYTHONPATH=/usr/local/lib64/python3.4/site-packages/. Signed-off-by: Wang Nan Tested-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456479154-136027-9-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index 6729f4d..1f608a6 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -352,6 +352,84 @@ static int add_tracepoint_values(struct ctf_writer *cw, return ret; } +static int +add_bpf_output_values(struct bt_ctf_event_class *event_class, + struct bt_ctf_event *event, + struct perf_sample *sample) +{ + struct bt_ctf_field_type *len_type, *seq_type; + struct bt_ctf_field *len_field, *seq_field; + unsigned int raw_size = sample->raw_size; + unsigned int nr_elements = raw_size / sizeof(u32); + unsigned int i; + int ret; + + if (nr_elements * sizeof(u32) != raw_size) + pr_warning("Incorrect raw_size (%u) in bpf output event, skip %lu bytes\n", + raw_size, nr_elements * sizeof(u32) - raw_size); + + len_type = bt_ctf_event_class_get_field_by_name(event_class, "raw_len"); + len_field = bt_ctf_field_create(len_type); + if (!len_field) { + pr_err("failed to create 'raw_len' for bpf output event\n"); + ret = -1; + goto put_len_type; + } + + ret = bt_ctf_field_unsigned_integer_set_value(len_field, nr_elements); + if (ret) { + pr_err("failed to set field value for raw_len\n"); + goto put_len_field; + } + ret = bt_ctf_event_set_payload(event, "raw_len", len_field); + if (ret) { + pr_err("failed to set payload to raw_len\n"); + goto put_len_field; + } + + seq_type = bt_ctf_event_class_get_field_by_name(event_class, "raw_data"); + seq_field = bt_ctf_field_create(seq_type); + if (!seq_field) { + pr_err("failed to create 'raw_data' for bpf output event\n"); + ret = -1; + goto put_seq_type; + } + + ret = bt_ctf_field_sequence_set_length(seq_field, len_field); + if (ret) { + pr_err("failed to set length of 'raw_data'\n"); + goto put_seq_field; + } + + for (i = 0; i < nr_elements; i++) { + struct bt_ctf_field *elem_field = + bt_ctf_field_sequence_get_field(seq_field, i); + + ret = 
bt_ctf_field_unsigned_integer_set_value(elem_field, + ((u32 *)(sample->raw_data))[i]); + + bt_ctf_field_put(elem_field); + if (ret) { + pr_err("failed to set raw_data[%d]\n", i); + goto put_seq_field; + } + } + + ret = bt_ctf_event_set_payload(event, "raw_data", seq_field); + if (ret) + pr_err("failed to set payload for raw_data\n"); + +put_seq_field: + bt_ctf_field_put(seq_field); +put_seq_type: + bt_ctf_field_type_put(seq_type); +put_len_field: + bt_ctf_field_put(len_field); +put_len_type: + bt_ctf_field_type_put(len_type); + return ret; +} + static int add_generic_values(struct ctf_writer *cw, struct bt_ctf_event *event, struct perf_evsel *evsel, @@ -597,6 +675,12 @@ static int process_sample_event(struct perf_tool *tool, return -1; } + if (perf_evsel__is_bpf_output(evsel)) { + ret = add_bpf_output_values(event_class, event, sample); + if (ret) + return -1; + } + cs = ctf_stream(cw, get_sample_cpu(cw, sample, evsel)); if (cs) { if (is_flush_needed(cs)) @@ -744,6 +828,25 @@ static int add_tracepoint_types(struct ctf_writer *cw, return ret; } +static int add_bpf_output_types(struct ctf_writer *cw, + struct bt_ctf_event_class *class) +{ + struct bt_ctf_field_type *len_type = cw->data.u32; + struct bt_ctf_field_type *seq_base_type = cw->data.u32_hex; + struct bt_ctf_field_type *seq_type; + int ret; + + ret = bt_ctf_event_class_add_field(class, len_type, "raw_len"); + if (ret) + return ret; + + seq_type = bt_ctf_field_type_sequence_create(seq_base_type, "raw_len"); + if (!seq_type) + return -1; + + return bt_ctf_event_class_add_field(class, seq_type, "raw_data"); +} + static int add_generic_types(struct ctf_writer *cw, struct perf_evsel *evsel, struct bt_ctf_event_class *event_class) { @@ -755,7 +858,8 @@ static int add_generic_types(struct ctf_writer *cw, struct perf_evsel *evsel, * ctf event header * PERF_SAMPLE_READ - TODO * PERF_SAMPLE_CALLCHAIN - TODO - * PERF_SAMPLE_RAW - tracepoint fields are handled separately + * PERF_SAMPLE_RAW - tracepoint fields and BPF output + * are handled separately * PERF_SAMPLE_BRANCH_STACK - TODO * PERF_SAMPLE_REGS_USER - TODO * PERF_SAMPLE_STACK_USER - TODO @@ -824,6 +928,12 @@ static int add_event(struct ctf_writer *cw, struct perf_evsel *evsel) goto err; } + if (perf_evsel__is_bpf_output(evsel)) { + ret = add_bpf_output_types(cw, event_class); + if (ret) + goto err; + } + ret = bt_ctf_stream_class_add_event_class(cw->stream_class, event_class); if (ret) { pr("Failed to add event class into stream.\n"); -- cgit v0.10.2 From f8dd2d5ff953bc498d682ae8022439c940a7d5c4 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 26 Feb 2016 09:31:57 +0000 Subject: perf data: Explicitly set byte order for integer types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After babeltrace commit 5cec03e402aa ("ir: copy variants and sequences when setting a field path"), 'perf data convert' gets incorrect result if there's bpf output data. For example: # perf data convert --to-ctf ./out.ctf # babeltrace ./out.ctf [10:44:31.186045346] (+?.?????????) 
evt: { cpu_id = 0 }, { perf_ip = 0xFFFFFFFF810E7DD1, perf_tid = 23819, perf_pid = 23819, perf_id = 518, raw_len = 3, raw_data = [ [0] = 0xC028E32F, [1] = 0x815D0100, [2] = 0x1000000 ] } [10:44:31.286101003] (+0.100055657) evt: { cpu_id = 0 }, { perf_ip = 0xFFFFFFFF8105B609, perf_tid = 23819, perf_pid = 23819, perf_id = 518, raw_len = 3, raw_data = [ [0] = 0x35D9F1EB, [1] = 0x15D81, [2] = 0x2 ] } The expected result of the first sample should be: raw_data = [ [0] = 0x2FE328C0, [1] = 0x15D81, [2] = 0x1 ] } However, 'perf data convert' outputs big endian values to the resulting CTF file. The reason is an internal change (or a bug?) in babeltrace. Before this patch, at the first add_bpf_output_values(), the byte order of all integer types is uncertain (it is 0, neither 1234 (le) nor 4321 (be)). It would normally be fixed by: perf_evlist__deliver_sample -> process_sample_event -> ctf_stream ... ->bt_ctf_trace_add_stream_class ->bt_ctf_field_type_structure_set_byte_order ->bt_ctf_field_type_integer_set_byte_order while creating the stream. However, the babeltrace commit mentioned above duplicates types in a sequence to prevent potential conflicts in the following call stack and links the newly allocated type into the 'raw_data' sequence: perf_evlist__deliver_sample -> process_sample_event -> ctf_stream ... -> bt_ctf_trace_add_stream_class -> bt_ctf_stream_class_resolve_types ... -> bt_ctf_field_type_sequence_copy ->bt_ctf_field_type_integer_copy This happens before the byte order is set, so only the newly allocated type is initialized; the byte order of the original type perf chose to create the first raw_data is still uncertain. Byte order in CTF output is not related to byte order in perf.data. Setting it to anything other than BT_CTF_BYTE_ORDER_NATIVE solves this problem (only BT_CTF_BYTE_ORDER_NATIVE needs to be fixed). To reduce behavior changes, set the byte order according to compile-time options. Signed-off-by: Wang Nan Cc: Jeremie Galarneau Cc: Alexei Starovoitov Cc: Brendan Gregg Cc: Jiri Olsa Cc: Jérémie Galarneau Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456479154-136027-10-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/data-convert-bt.c b/tools/perf/util/data-convert-bt.c index 1f608a6..811af89 100644 --- a/tools/perf/util/data-convert-bt.c +++ b/tools/perf/util/data-convert-bt.c @@ -1080,6 +1080,12 @@ static struct bt_ctf_field_type *create_int_type(int size, bool sign, bool hex) bt_ctf_field_type_integer_set_base(type, BT_CTF_INTEGER_BASE_HEXADECIMAL)) goto err; +#if __BYTE_ORDER == __BIG_ENDIAN + bt_ctf_field_type_set_byte_order(type, BT_CTF_BYTE_ORDER_BIG_ENDIAN); +#else + bt_ctf_field_type_set_byte_order(type, BT_CTF_BYTE_ORDER_LITTLE_ENDIAN); +#endif + pr2("Created type: INTEGER %d-bit %ssigned %s\n", size, sign ? "un" : "", hex ? "hex" : ""); return type; -- cgit v0.10.2 From d8871ea71281ed689dc3303d1b50eb00c5d06141 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 26 Feb 2016 09:32:06 +0000 Subject: perf record: Use WARN_ONCE to replace 'if' condition Commits in a BPF patchkit will extract kernel and module synthesizing code into a separate function and call it multiple times. This patch replaces 'if (err < 0)' with WARN_ONCE, making sure the error message is shown only once.
Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: He Kuang Cc: Jiri Olsa Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456479154-136027-19-git-send-email-wangnan0@huawei.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 7d11162..9dec7e5 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -33,6 +33,7 @@ #include "util/parse-regs-options.h" #include "util/llvm-utils.h" #include "util/bpf-loader.h" +#include "asm/bug.h" #include #include @@ -615,17 +616,15 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, machine); - if (err < 0) - pr_err("Couldn't record kernel reference relocation symbol\n" - "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" - "Check /proc/kallsyms permission or run as root.\n"); + WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n" + "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" + "Check /proc/kallsyms permission or run as root.\n"); err = perf_event__synthesize_modules(tool, process_synthesized_event, machine); - if (err < 0) - pr_err("Couldn't record kernel module information.\n" - "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" - "Check /proc/modules permission or run as root.\n"); + WARN_ONCE(err < 0, "Couldn't record kernel module information.\n" + "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" + "Check /proc/modules permission or run as root.\n"); if (perf_guest) { machines__process_guests(&session->machines, -- cgit v0.10.2 From c45c86eb70964615bd13b4c1e647bf9ee60c3db9 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 26 Feb 2016 09:32:07 +0000 Subject: perf record: Extract synthesize code to record__synthesize() Create record__synthesize(). It can be used to create tracking events for each perf.data after perf supporting splitting into multiple outputs. Signed-off-by: He Kuang Cc: Alexei Starovoitov Cc: He Kuang Cc: Jiri Olsa Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456479154-136027-20-git-send-email-wangnan0@huawei.com Signed-off-by: Wang Nan Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 9dec7e5..cb583b4 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -486,6 +486,74 @@ static void workload_exec_failed_signal(int signo __maybe_unused, static void snapshot_sig_handler(int sig); +static int record__synthesize(struct record *rec) +{ + struct perf_session *session = rec->session; + struct machine *machine = &session->machines.host; + struct perf_data_file *file = &rec->file; + struct record_opts *opts = &rec->opts; + struct perf_tool *tool = &rec->tool; + int fd = perf_data_file__fd(file); + int err = 0; + + if (file->is_pipe) { + err = perf_event__synthesize_attrs(tool, session, + process_synthesized_event); + if (err < 0) { + pr_err("Couldn't synthesize attrs.\n"); + goto out; + } + + if (have_tracepoints(&rec->evlist->entries)) { + /* + * FIXME err <= 0 here actually means that + * there were no tracepoints so its not really + * an error, just that we don't need to + * synthesize anything. 
We really have to + * return this more properly and also + * propagate errors that now are calling die() + */ + err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist, + process_synthesized_event); + if (err <= 0) { + pr_err("Couldn't record tracing data.\n"); + goto out; + } + rec->bytes_written += err; + } + } + + if (rec->opts.full_auxtrace) { + err = perf_event__synthesize_auxtrace_info(rec->itr, tool, + session, process_synthesized_event); + if (err) + goto out; + } + + err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, + machine); + WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n" + "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" + "Check /proc/kallsyms permission or run as root.\n"); + + err = perf_event__synthesize_modules(tool, process_synthesized_event, + machine); + WARN_ONCE(err < 0, "Couldn't record kernel module information.\n" + "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" + "Check /proc/modules permission or run as root.\n"); + + if (perf_guest) { + machines__process_guests(&session->machines, + perf_event__synthesize_guest_os, tool); + } + + err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads, + process_synthesized_event, opts->sample_address, + opts->proc_map_timeout); +out: + return err; +} + static int __cmd_record(struct record *rec, int argc, const char **argv) { int err; @@ -580,61 +648,8 @@ static int __cmd_record(struct record *rec, int argc, const char **argv) machine = &session->machines.host; - if (file->is_pipe) { - err = perf_event__synthesize_attrs(tool, session, - process_synthesized_event); - if (err < 0) { - pr_err("Couldn't synthesize attrs.\n"); - goto out_child; - } - - if (have_tracepoints(&rec->evlist->entries)) { - /* - * FIXME err <= 0 here actually means that - * there were no tracepoints so its not really - * an error, just that we don't need to - * synthesize anything. We really have to - * return this more properly and also - * propagate errors that now are calling die() - */ - err = perf_event__synthesize_tracing_data(tool, fd, rec->evlist, - process_synthesized_event); - if (err <= 0) { - pr_err("Couldn't record tracing data.\n"); - goto out_child; - } - rec->bytes_written += err; - } - } - - if (rec->opts.full_auxtrace) { - err = perf_event__synthesize_auxtrace_info(rec->itr, tool, - session, process_synthesized_event); - if (err) - goto out_delete_session; - } - - err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, - machine); - WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n" - "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" - "Check /proc/kallsyms permission or run as root.\n"); - - err = perf_event__synthesize_modules(tool, process_synthesized_event, - machine); - WARN_ONCE(err < 0, "Couldn't record kernel module information.\n" - "Symbol resolution may be skewed if relocation was used (e.g. 
kexec).\n" - "Check /proc/modules permission or run as root.\n"); - - if (perf_guest) { - machines__process_guests(&session->machines, - perf_event__synthesize_guest_os, tool); - } - - err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads, - process_synthesized_event, opts->sample_address, - opts->proc_map_timeout); - if (err != 0) + err = record__synthesize(rec); + if (err < 0) goto out_child; if (rec->realtime_prio) { -- cgit v0.10.2 From e1ab48ba63ee6b2494a67bb60bf99692ecdaca7c Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 26 Feb 2016 09:32:10 +0000 Subject: perf record: Introduce record__finish_output() to finish a perf.data Move code for finalizing 'perf.data' to record__finish_output(). It will be used by the following commits to split the output into multiple files. Signed-off-by: He Kuang Cc: Alexei Starovoitov Cc: He Kuang Cc: Jiri Olsa Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456479154-136027-23-git-send-email-wangnan0@huawei.com Signed-off-by: Wang Nan Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index cb583b4..46e2772 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -468,6 +468,29 @@ static void record__init_features(struct record *rec) perf_header__clear_feat(&session->header, HEADER_STAT); } +static void +record__finish_output(struct record *rec) +{ + struct perf_data_file *file = &rec->file; + int fd = perf_data_file__fd(file); + + if (file->is_pipe) + return; + + rec->session->header.data_size += rec->bytes_written; + file->size = lseek(perf_data_file__fd(file), 0, SEEK_CUR); + + if (!rec->no_buildid) { + process_buildids(rec); + + if (rec->buildid_all) + dsos__hit_all(rec->session); + } + perf_session__write_header(rec->session, rec->evlist, fd, true); + + return; +} + static volatile int workload_exec_errno; /* @@ -785,18 +808,8 @@ out_child: /* this will be recalculated during process_buildids() */ rec->samples = 0; - if (!err && !file->is_pipe) { - rec->session->header.data_size += rec->bytes_written; - file->size = lseek(perf_data_file__fd(file), 0, SEEK_CUR); - - if (!rec->no_buildid) { - process_buildids(rec); - - if (rec->buildid_all) - dsos__hit_all(rec->session); - } - perf_session__write_header(rec->session, rec->evlist, fd, true); - } + if (!err) + record__finish_output(rec); if (!err && !quiet) { char samples[128]; -- cgit v0.10.2 From 95c365617aa37878592f2f1c6c64e1abb19f0d4a Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 26 Feb 2016 09:32:17 +0000 Subject: perf record: Ensure return non-zero rc when mmap fail perf_evlist__mmap_ex() can fail without setting errno (for example, it can fail in a condition check, in which case every syscall it made succeeded). If this happens, record__open() incorrectly returns 0. Forcing rc to be set is a quick way to avoid this problem; otherwise we would have to follow every possible code path in perf_evlist__mmap_ex() to make sure there's at least one system call before returning an error.
Signed-off-by: Wang Nan Cc: Alexei Starovoitov Cc: He Kuang Cc: Jiri Olsa Cc: Li Zefan Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Zefan Li Cc: pi3orama@163.com Link: http://lkml.kernel.org/r/1456479154-136027-30-git-send-email-wangnan0@huawei.com Signed-off-by: He Kuang Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 46e2772..515510e 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -324,7 +324,10 @@ try_again: } else { pr_err("failed to mmap with %d (%s)\n", errno, strerror_r(errno, msg, sizeof(msg))); - rc = -errno; + if (errno) + rc = -errno; + else + rc = -EINVAL; } goto out; } -- cgit v0.10.2 From 92a61f6412d3a09d6462252a522fa79c9290f405 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 29 Feb 2016 14:36:21 -0800 Subject: perf stat: Implement CSV metrics output Now support CSV output for metrics. With the new output callbacks this is relatively straight forward by creating new callbacks. This allows to easily plot metrics from CSV files. The new line callback needs to know the number of fields to skip them correctly Example output before: % perf stat -x, true 0.200687,,task-clock,200687,100.00 0,,context-switches,200687,100.00 0,,cpu-migrations,200687,100.00 40,,page-faults,200687,100.00 730871,,cycles,203601,100.00 551056,,stalled-cycles-frontend,203601,100.00 ,,stalled-cycles-backend,0,100.00 385523,,instructions,203601,100.00 78028,,branches,203601,100.00 3946,,branch-misses,203601,100.00 After: % perf stat -x, true .502457,,task-clock,502457,100.00,0.485,CPUs utilized 0,,context-switches,502457,100.00,0.000,K/sec 0,,cpu-migrations,502457,100.00,0.000,K/sec 45,,page-faults,502457,100.00,0.090,M/sec 644692,,cycles,509102,100.00,1.283,GHz 423470,,stalled-cycles-frontend,509102,100.00,65.69,frontend cycles idle ,,stalled-cycles-backend,0,100.00,,,, 492701,,instructions,509102,100.00,0.76,insn per cycle ,,,,,0.86,stalled cycles per insn 97767,,branches,509102,100.00,194.578,M/sec 4788,,branch-misses,509102,100.00,4.90,of all branches or easier readable $ perf stat -x, -o x.csv true $ column -s, -t x.csv 0.490635 task-clock 490635 100.00 0.489 CPUs utilized 0 context-switches 490635 100.00 0.000 K/sec 0 cpu-migrations 490635 100.00 0.000 K/sec 45 page-faults 490635 100.00 0.092 M/sec 629080 cycles 497698 100.00 1.282 GHz 409498 stalled-cycles-frontend 497698 100.00 65.09 frontend cycles idle stalled-cycles-backend 0 100.00 491424 instructions 497698 100.00 0.78 insn per cycle 0.83 stalled cycles per insn 97278 branches 497698 100.00 198.270 M/sec 4569 branch-misses 497698 100.00 4.70 of all branches Two new fields are added: metric value and metric name. v2: Split out function argument changes v3: Reenable metrics for real. v4: Fix wrong hunk from refactoring. v5: Remove extra "noise" printing (Jiri), but add it to the not counted case. Print empty metrics for not counted. v6: Avoid outputting metric on empty format. 
v7: Print metric at the end v8: Remove extra run, ena fields v9: Avoid extra new line for unsupported counters Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1456785386-19481-3-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 24f222d..2ffb822 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -739,6 +739,7 @@ struct outstate { FILE *fh; bool newline; const char *prefix; + int nfields; }; #define METRIC_LEN 35 @@ -789,6 +790,43 @@ static void print_metric_std(void *ctx, const char *color, const char *fmt, fprintf(out, " %-*s", METRIC_LEN - n - 1, unit); } +static void new_line_csv(void *ctx) +{ + struct outstate *os = ctx; + int i; + + fputc('\n', os->fh); + if (os->prefix) + fprintf(os->fh, "%s%s", os->prefix, csv_sep); + for (i = 0; i < os->nfields; i++) + fputs(csv_sep, os->fh); +} + +static void print_metric_csv(void *ctx, + const char *color __maybe_unused, + const char *fmt, const char *unit, double val) +{ + struct outstate *os = ctx; + FILE *out = os->fh; + char buf[64], *vals, *ends; + + if (unit == NULL || fmt == NULL) { + fprintf(out, "%s%s%s%s", csv_sep, csv_sep, csv_sep, csv_sep); + return; + } + snprintf(buf, sizeof(buf), fmt, val); + vals = buf; + while (isspace(*vals)) + vals++; + ends = vals; + while (isdigit(*ends) || *ends == '.') + ends++; + *ends = 0; + while (isspace(*unit)) + unit++; + fprintf(out, "%s%s%s%s", csv_sep, vals, csv_sep, unit); +} + static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg) { FILE *output = stat_config.output; @@ -860,6 +898,22 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval, nl = new_line_std; + if (csv_output) { + static int aggr_fields[] = { + [AGGR_GLOBAL] = 0, + [AGGR_THREAD] = 1, + [AGGR_NONE] = 1, + [AGGR_SOCKET] = 2, + [AGGR_CORE] = 2, + }; + + pm = print_metric_csv; + nl = new_line_csv; + os.nfields = 3; + os.nfields += aggr_fields[stat_config.aggr_mode]; + if (counter->cgrp) + os.nfields++; + } if (run == 0 || ena == 0 || counter->counts->scaled == -1) { aggr_printout(counter, id, nr); @@ -880,7 +934,12 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval, fprintf(stat_config.output, "%s%s", csv_sep, counter->cgrp->name); + if (!csv_output) + pm(&os, NULL, NULL, "", 0); + print_noise(counter, noise); print_running(run, ena); + if (csv_output) + pm(&os, NULL, NULL, "", 0); return; } @@ -893,14 +952,20 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval, out.new_line = nl; out.ctx = &os; - if (!csv_output) - perf_stat__print_shadow_stats(counter, uval, + if (csv_output) { + print_noise(counter, noise); + print_running(run, ena); + } + + perf_stat__print_shadow_stats(counter, uval, stat_config.aggr_mode == AGGR_GLOBAL ? 
0 : cpu_map__id_to_cpu(id), &out); - print_noise(counter, noise); - print_running(run, ena); + if (!csv_output) { + print_noise(counter, noise); + print_running(run, ena); + } } static void print_aggr(char *prefix) diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 4d8f185..367e220 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -310,8 +310,8 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, total = avg_stats(&runtime_stalled_cycles_front_stats[ctx][cpu]); total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[ctx][cpu])); - out->new_line(ctxp); if (total && avg) { + out->new_line(ctxp); ratio = total / avg; print_metric(ctxp, NULL, "%7.2f ", "stalled cycles per insn", -- cgit v0.10.2 From 44d49a6002595ccb95712e86ad2857cd55207602 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 29 Feb 2016 14:36:22 -0800 Subject: perf stat: Support metrics in --per-core/socket mode Enable metrics printing in --per-core / --per-socket mode. We need to save the shadow metrics in a unique place. Always use the first CPU in the aggregation. Then use the same CPU to retrieve the shadow value later. Example output: % perf stat --per-core -a ./BC1s Performance counter stats for 'system wide': S0-C0 2 2966.020381 task-clock (msec) # 2.004 CPUs utilized (100.00%) S0-C0 2 49 context-switches # 0.017 K/sec (100.00%) S0-C0 2 4 cpu-migrations # 0.001 K/sec (100.00%) S0-C0 2 467 page-faults # 0.157 K/sec S0-C0 2 4,599,061,773 cycles # 1.551 GHz (100.00%) S0-C0 2 9,755,886,883 instructions # 2.12 insn per cycle (100.00%) S0-C0 2 1,906,272,125 branches # 642.704 M/sec (100.00%) S0-C0 2 81,180,867 branch-misses # 4.26% of all branches S0-C1 2 2965.995373 task-clock (msec) # 2.003 CPUs utilized (100.00%) S0-C1 2 62 context-switches # 0.021 K/sec (100.00%) S0-C1 2 8 cpu-migrations # 0.003 K/sec (100.00%) S0-C1 2 281 page-faults # 0.095 K/sec S0-C1 2 6,347,290 cycles # 0.002 GHz (100.00%) S0-C1 2 4,654,156 instructions # 0.73 insn per cycle (100.00%) S0-C1 2 947,121 branches # 0.319 M/sec (100.00%) S0-C1 2 37,322 branch-misses # 3.94% of all branches 1.480409747 seconds time elapsed v2: Rebase to older patches v3: Document shadow cpus. Fix aggr_get_id argument. 
Fix -A shadows (Jiri) Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/1456785386-19481-4-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 2ffb822..9b5089c 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -740,6 +740,8 @@ struct outstate { bool newline; const char *prefix; int nfields; + int id, nr; + struct perf_evsel *evsel; }; #define METRIC_LEN 35 @@ -755,12 +757,9 @@ static void do_new_line_std(struct outstate *os) { fputc('\n', os->fh); fputs(os->prefix, os->fh); + aggr_printout(os->evsel, os->id, os->nr); if (stat_config.aggr_mode == AGGR_NONE) fprintf(os->fh, " "); - if (stat_config.aggr_mode == AGGR_CORE) - fprintf(os->fh, " "); - if (stat_config.aggr_mode == AGGR_SOCKET) - fprintf(os->fh, " "); fprintf(os->fh, " "); } @@ -798,6 +797,7 @@ static void new_line_csv(void *ctx) fputc('\n', os->fh); if (os->prefix) fprintf(os->fh, "%s%s", os->prefix, csv_sep); + aggr_printout(os->evsel, os->id, os->nr); for (i = 0; i < os->nfields; i++) fputs(csv_sep, os->fh); } @@ -855,6 +855,28 @@ static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg) fprintf(output, "%s%s", csv_sep, evsel->cgrp->name); } +static int first_shadow_cpu(struct perf_evsel *evsel, int id) +{ + int i; + + if (!aggr_get_id) + return 0; + + if (stat_config.aggr_mode == AGGR_NONE) + return id; + + if (stat_config.aggr_mode == AGGR_GLOBAL) + return 0; + + for (i = 0; i < perf_evsel__nr_cpus(evsel); i++) { + int cpu2 = perf_evsel__cpus(evsel)->map[i]; + + if (aggr_get_id(evsel_list->cpus, cpu2) == id) + return cpu2; + } + return 0; +} + static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg) { FILE *output = stat_config.output; @@ -891,7 +913,10 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval, struct perf_stat_output_ctx out; struct outstate os = { .fh = stat_config.output, - .prefix = prefix ? prefix : "" + .prefix = prefix ? prefix : "", + .id = id, + .nr = nr, + .evsel = counter, }; print_metric_t pm = print_metric_std; void (*nl)(void *); @@ -958,16 +983,37 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval, } perf_stat__print_shadow_stats(counter, uval, - stat_config.aggr_mode == AGGR_GLOBAL ? 
0 : - cpu_map__id_to_cpu(id), + first_shadow_cpu(counter, id), &out); - if (!csv_output) { print_noise(counter, noise); print_running(run, ena); } } +static void aggr_update_shadow(void) +{ + int cpu, s2, id, s; + u64 val; + struct perf_evsel *counter; + + for (s = 0; s < aggr_map->nr; s++) { + id = aggr_map->map[s]; + evlist__for_each(evsel_list, counter) { + val = 0; + for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) { + s2 = aggr_get_id(evsel_list->cpus, cpu); + if (s2 != id) + continue; + val += perf_counts(counter->counts, cpu, 0)->val; + } + val = val * counter->scale; + perf_stat__update_shadow_stats(counter, &val, + first_shadow_cpu(counter, id)); + } + } +} + static void print_aggr(char *prefix) { FILE *output = stat_config.output; @@ -979,6 +1025,8 @@ static void print_aggr(char *prefix) if (!(aggr_map || aggr_get_id)) return; + aggr_update_shadow(); + for (s = 0; s < aggr_map->nr; s++) { id = aggr_map->map[s]; evlist__for_each(evsel_list, counter) { diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 367e220..5e2d2e3 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -14,6 +14,13 @@ enum { #define NUM_CTX CTX_BIT_MAX +/* + * AGGR_GLOBAL: Use CPU 0 + * AGGR_SOCKET: Use first CPU of socket + * AGGR_CORE: Use first CPU of core + * AGGR_NONE: Use matching CPU + * AGGR_THREAD: Not supported? + */ static struct stats runtime_nsecs_stats[MAX_NR_CPUS]; static struct stats runtime_cycles_stats[NUM_CTX][MAX_NR_CPUS]; static struct stats runtime_stalled_cycles_front_stats[NUM_CTX][MAX_NR_CPUS]; -- cgit v0.10.2 From 676787939ef8ccfcf8039104f766ebe5ebe23924 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Feb 2016 02:59:00 +0900 Subject: tools build: Use .s extension for preprocessed assembler code The "man gcc" says .i extension represents the file is C source code that should not be preprocessed. Here, .s should be used. For clarification, .c ---(preprocess)---> .i .S ---(preprocess)---> .s Signed-off-by: Masahiro Yamada Acked-by: Jiri Olsa Cc: Aaro Koskinen Cc: Adrian Hunter Cc: Lukas Wunner Link: http://lkml.kernel.org/r/1454263140-19670-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/build/Makefile.build b/tools/build/Makefile.build index 4a96473..ee566e8 100644 --- a/tools/build/Makefile.build +++ b/tools/build/Makefile.build @@ -85,7 +85,7 @@ $(OUTPUT)%.i: %.c FORCE $(call rule_mkdir) $(call if_changed_dep,cc_i_c) -$(OUTPUT)%.i: %.S FORCE +$(OUTPUT)%.s: %.S FORCE $(call rule_mkdir) $(call if_changed_dep,cc_i_c) -- cgit v0.10.2 From 979ac257b00c53aacec3eacf86142e8c00bee889 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 1 Mar 2016 23:46:20 +0000 Subject: perf script: Fix double free on command_line The 'command_line' variable is free'd twice if db_export__branch_types() fails. To avoid this, defer the free'ing of 'command_line' to after this call so that the error return path will just free 'command_line' once. 
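A small self-contained sketch of the deferred-free pattern the fix applies (the helpers are hypothetical stand-ins, not the perf code): the buffer is released exactly once, after the last call that can branch to the shared error path.

#include <stdlib.h>

static int do_setup(void)  { return 0; }	/* stand-ins for the real steps */
static int do_export(void) { return 0; }

int start_script(void)
{
	char *command_line = malloc(64);

	if (!command_line)
		return -1;

	if (do_setup() < 0)
		goto error;
	if (do_export() < 0)	/* freeing before this call is what  */
		goto error;	/* made the error path free twice    */

	free(command_line);	/* single success-path free */
	return 0;

error:
	free(command_line);	/* single failure-path free */
	return -1;
}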
Signed-off-by: Colin Ian King Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: He Kuang Cc: Javi Merino Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1456875980-25606-1-git-send-email-colin.king@canonical.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 309d90f..fbd0524 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -1094,8 +1094,6 @@ static int python_start_script(const char *script, int argc, const char **argv) goto error; } - free(command_line); - set_table_handlers(tables); if (tables->db_export_mode) { @@ -1104,6 +1102,8 @@ static int python_start_script(const char *script, int argc, const char **argv) goto error; } + free(command_line); + return err; error: Py_Finalize(); -- cgit v0.10.2 From 21a30100453516004905d4d5f0806ebaffa95131 Mon Sep 17 00:00:00 2001 From: "Chaos.Chen" Date: Tue, 9 Feb 2016 15:40:14 -0500 Subject: tools lib traceevent: Fix time stamp rounding issue When rounding to microseconds, if the timestamp subsecond is between .999999500 and .999999999, it is rounded to .1000000, when it should instead increment the second counter due to the overflow. For example, if the timestamp is 1234.999999501 instead of seeing: 1235.000000 we see: 1234.1000000 Signed-off-by: Chaos.Chen Cc: Andrew Morton Link: http://lkml.kernel.org/r/20160209204236.824426460@goodmis.org [ fixed incrementing "secs" instead of decrementing it ] Signed-off-by: Steven Rostedt Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 9a1e48a..ce59f48 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -5429,6 +5429,11 @@ void pevent_print_event_time(struct pevent *pevent, struct trace_seq *s, p = 9; } else { usecs = (nsecs + 500) / NSECS_PER_USEC; + /* To avoid usecs larger than 1 sec */ + if (usecs >= 1000000) { + usecs -= 1000000; + secs++; + } p = 6; } -- cgit v0.10.2 From 9ec72eafee61f68cd57310a99db129ffb71302f3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 9 Feb 2016 15:40:16 -0500 Subject: tools lib traceevent: Set int_array fields to NULL if freeing from error Had a bug where on error of parsing __print_array() where the fields are freed after they were allocated, but since they were not set to NULL, the freeing of the arg also tried to free the already freed fields causing a double free. Fix process_hex() while at it. 
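A compact sketch of the idiom behind both hunks (the structure and names are illustrative, not the traceevent ones): after a sub-field is freed on an error path, the pointer is cleared so that a later free of the containing argument cannot free the same memory again.

#include <stdlib.h>

struct arg_sketch {
	char *field;
	char *count;
};

int parse_sketch(struct arg_sketch *arg)
{
	arg->field = malloc(8);
	if (!arg->field)
		goto out;

	arg->count = malloc(8);
	if (!arg->count)
		goto free_field;

	return 0;

free_field:
	free(arg->field);
	arg->field = NULL;	/* without this, a later free of *arg   */
				/* would hit the same pointer again     */
out:
	return -1;
}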
Signed-off-by: Steven Rostedt Cc: Andrew Morton Link: http://lkml.kernel.org/r/20160209204237.188327674@goodmis.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index ce59f48..fb790aa 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -2635,6 +2635,7 @@ process_hex(struct event_format *event, struct print_arg *arg, char **tok) free_field: free_arg(arg->hex.field); + arg->hex.field = NULL; out: *tok = NULL; return EVENT_ERROR; @@ -2659,8 +2660,10 @@ process_int_array(struct event_format *event, struct print_arg *arg, char **tok) free_size: free_arg(arg->int_array.count); + arg->int_array.count = NULL; free_field: free_arg(arg->int_array.field); + arg->int_array.field = NULL; out: *tok = NULL; return EVENT_ERROR; -- cgit v0.10.2 From a66673a07e260807f570db8f08a9c1207932c665 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 9 Feb 2016 15:40:17 -0500 Subject: tools lib traceevent: Fix output of %llu for 64 bit values read on 32 bit machines When a long value is read on 32 bit machines for 64 bit output, the parsing needs to change "%lu" into "%llu", as the value is read natively. Unfortunately, if "%llu" is already there, the code will add another "l" to it and fail to parse it properly. Signed-off-by: Steven Rostedt Cc: Andrew Morton Link: http://lkml.kernel.org/r/20160209204237.337024613@goodmis.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index fb790aa..865dea5 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -4978,7 +4978,7 @@ static void pretty_print(struct trace_seq *s, void *data, int size, struct event break; } } - if (pevent->long_size == 8 && ls && + if (pevent->long_size == 8 && ls == 1 && sizeof(long) != 8) { char *p; -- cgit v0.10.2 From 9b240637eb9b9677a9e9bc2dc568f5e0811e04d6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 2 Mar 2016 09:58:00 -0300 Subject: perf test: Fix hists related entries That got broken by d3a72fd8187b ("perf report: Fix indentation of dynamic entries in hierarchy"), by using the evlist in setup_sorting() without checking if it is NULL, as done in some 'perf test' entries: $ find tools/ -name "*.c" | xargs grep 'setup_sorting(NULL);' tools/perf/tests/hists_output.c: setup_sorting(NULL); tools/perf/tests/hists_output.c: setup_sorting(NULL); tools/perf/tests/hists_output.c: setup_sorting(NULL); tools/perf/tests/hists_output.c: setup_sorting(NULL); tools/perf/tests/hists_output.c: setup_sorting(NULL); tools/perf/tests/hists_cumulate.c: setup_sorting(NULL); tools/perf/tests/hists_cumulate.c: setup_sorting(NULL); tools/perf/tests/hists_cumulate.c: setup_sorting(NULL); tools/perf/tests/hists_cumulate.c: setup_sorting(NULL); $ Fix it. Before: [root@jouet ~]# perf test 15: Test matching and linking multiple hists : FAILED! 16: Try 'import perf' in python, checking link problems : Ok 17: Test breakpoint overflow signal handler : Ok 18: Test breakpoint overflow sampling : Ok 19: Test number of exit event of a simple workload : Ok 20: Test software clock events have valid period values : Ok 21: Test object code reading : Ok 22: Test sample parsing : Ok 23: Test using a dummy software event to keep tracking : Ok 24: Test parsing with no sample_id_all bit set : Ok 25: Test filtering hist entries : FAILED! 
26: Test mmap thread lookup : Ok 27: Test thread mg sharing : Ok 28: Test output sorting of hist entries : FAILED! 29: Test cumulation of child hist entries : FAILED! After the patch the above failed tests complete successfully. Acked-by: Namhyung Kim Cc: David Ahern Cc: Jiri Olsa Cc: Wang Nan Fixes: d3a72fd8187b ("perf report: Fix indentation of dynamic entries in hierarchy") Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 5888bfe..4380a28 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2635,25 +2635,14 @@ out: return ret; } -int setup_sorting(struct perf_evlist *evlist) +static void evlist__set_hists_nr_sort_keys(struct perf_evlist *evlist) { - int err; - struct hists *hists; struct perf_evsel *evsel; - struct perf_hpp_fmt *fmt; - - err = __setup_sorting(evlist); - if (err < 0) - return err; - - if (parent_pattern != default_parent_pattern) { - err = sort_dimension__add("parent", evlist); - if (err < 0) - return err; - } evlist__for_each(evlist, evsel) { - hists = evsel__hists(evsel); + struct perf_hpp_fmt *fmt; + struct hists *hists = evsel__hists(evsel); + hists->nr_sort_keys = perf_hpp_list.nr_sort_keys; /* @@ -2667,6 +2656,24 @@ int setup_sorting(struct perf_evlist *evlist) hists->nr_sort_keys--; } } +} + +int setup_sorting(struct perf_evlist *evlist) +{ + int err; + + err = __setup_sorting(evlist); + if (err < 0) + return err; + + if (parent_pattern != default_parent_pattern) { + err = sort_dimension__add("parent", evlist); + if (err < 0) + return err; + } + + if (evlist != NULL) + evlist__set_hists_nr_sort_keys(evlist); reset_dimensions(); -- cgit v0.10.2 From e17a0e16ca3a63d1bafbcba313586cf137418f45 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 2 Mar 2016 12:55:22 +0000 Subject: perf tests: Initialize sa.sa_flags The sa_flags field is not being initialized, so a garbage value is being passed to sigaction. Initialize it to zero. 
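A stand-alone sketch of the kind of initialization the fix adds (the handler body is a placeholder, and unlike the rdpmc test, which sets sa_flags to 0, this sketch sets SA_SIGINFO because it installs a three-argument handler): zeroing the whole struct first guarantees that sa_flags, and any padding, start from a known value.

#include <signal.h>
#include <string.h>

static void segv_handler(int sig, siginfo_t *info, void *ctx)
{
	(void)sig; (void)info; (void)ctx;	/* placeholder */
}

void install_segv_handler(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));	/* no garbage in sa_flags or padding */
	sigfillset(&sa.sa_mask);
	sa.sa_sigaction = segv_handler;
	sa.sa_flags = SA_SIGINFO;	/* set explicitly, not left to chance */
	sigaction(SIGSEGV, &sa, NULL);
}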
Signed-off-by: Colin Ian King Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1456923322-29697-1-git-send-email-colin.king@canonical.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/arch/x86/tests/rdpmc.c b/tools/perf/arch/x86/tests/rdpmc.c index 7bb0d13..7945462 100644 --- a/tools/perf/arch/x86/tests/rdpmc.c +++ b/tools/perf/arch/x86/tests/rdpmc.c @@ -103,6 +103,7 @@ static int __test__rdpmc(void) sigfillset(&sa.sa_mask); sa.sa_sigaction = segfault_handler; + sa.sa_flags = 0; sigaction(SIGSEGV, &sa, NULL); fd = sys_perf_event_open(&attr, 0, -1, -1, -- cgit v0.10.2 From 1b69317d2dc80bc8e1d005e1a771c4f5bff3dabe Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 2 Mar 2016 13:50:25 +0000 Subject: tools/power turbostat: fix various build warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building with gcc 6 we're getting various build warnings that just require some trivial function declaration and call fixes: turbostat.c: In function ‘dump_cstate_pstate_config_info’: turbostat.c:1973:1: warning: type of ‘family’ defaults to ‘int’ dump_cstate_pstate_config_info(family, model) turbostat.c:1973:1: warning: type of ‘model’ defaults to ‘int’ turbostat.c: In function ‘get_tdp’: turbostat.c:2145:8: warning: type of ‘model’ defaults to ‘int’ double get_tdp(model) turbostat.c: In function ‘perf_limit_reasons_probe’: turbostat.c:2259:6: warning: type of ‘family’ defaults to ‘int’ void perf_limit_reasons_probe(family, model) turbostat.c:2259:6: warning: type of ‘model’ defaults to ‘int’ Signed-off-by: Colin Ian King Cc: Matt Fleming Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-wbicer8n0s9qe6ql8h9x478e@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 0dac7e0..3fa94e2 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1970,7 +1970,7 @@ int has_config_tdp(unsigned int family, unsigned int model) } static void -dump_cstate_pstate_config_info(family, model) +dump_cstate_pstate_config_info(unsigned int family, unsigned int model) { if (!do_nhm_platform_info) return; @@ -2142,7 +2142,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data #define RAPL_POWER_GRANULARITY 0x7FFF /* 15 bit power granularity */ #define RAPL_TIME_GRANULARITY 0x3F /* 6 bit time granularity */ -double get_tdp(model) +double get_tdp(unsigned int model) { unsigned long long msr; @@ -2256,7 +2256,7 @@ void rapl_probe(unsigned int family, unsigned int model) return; } -void perf_limit_reasons_probe(family, model) +void perf_limit_reasons_probe(unsigned int family, unsigned int model) { if (!genuine_intel) return; @@ -2792,7 +2792,7 @@ void process_cpuid() perf_limit_reasons_probe(family, model); if (debug) - dump_cstate_pstate_config_info(); + dump_cstate_pstate_config_info(family, model); if (has_skl_msrs(family, model)) calculate_tsc_tweak(); -- cgit v0.10.2 From fb4605ba47e772ff9d62d1d54218a832ec8b3e1d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 1 Mar 2016 10:57:52 -0800 Subject: perf stat: Check for frontend stalled for metrics Add an extra check for frontend stalled in the metrics. This avoids an extra column for the --metric-only case when the CPU does not support frontend stalled. 
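A runnable sketch of the probe-once shape this patch introduces (everything here is an illustrative stand-in; in perf the probe is pmu_have_event("cpu", "stalled-cycles-frontend") inside perf_stat__init_shadow_stats()): the optional event is detected a single time at startup, and the cached answer later decides whether the empty metric is printed at all.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for perf's PMU probe. */
static bool cpu_has_frontend_stalled(void)
{
	return false;
}

static bool have_frontend_stalled;

static void init_shadow_stats_sketch(void)
{
	have_frontend_stalled = cpu_has_frontend_stalled();	/* probe once */
}

static void print_stalled_metric_sketch(double ratio, bool have_sample)
{
	if (have_sample)
		printf("%7.2f  stalled cycles per insn\n", ratio);
	else if (have_frontend_stalled)
		/* print an empty metric only where the event exists, so
		 * --metric-only gains no useless column on other CPUs */
		printf("         stalled cycles per insn\n");
}

int main(void)
{
	init_shadow_stats_sketch();
	print_stalled_metric_sketch(0.86, true);
	return 0;
}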
v2: Add separate init function Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/1456858672-21594-8-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 9b5089c..baa8207 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1966,6 +1966,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) argc = parse_options_subcommand(argc, argv, stat_options, stat_subcommands, (const char **) stat_usage, PARSE_OPT_STOP_AT_NON_OPTION); + perf_stat__init_shadow_stats(); if (csv_sep) { csv_output = true; diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 5e2d2e3..b33ffb2 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -2,6 +2,7 @@ #include "evsel.h" #include "stat.h" #include "color.h" +#include "pmu.h" enum { CTX_BIT_USER = 1 << 0, @@ -35,9 +36,15 @@ static struct stats runtime_dtlb_cache_stats[NUM_CTX][MAX_NR_CPUS]; static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS]; static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS]; static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS]; +static bool have_frontend_stalled; struct stats walltime_nsecs_stats; +void perf_stat__init_shadow_stats(void) +{ + have_frontend_stalled = pmu_have_event("cpu", "stalled-cycles-frontend"); +} + static int evsel_context(struct perf_evsel *evsel) { int ctx = 0; @@ -323,7 +330,7 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel, print_metric(ctxp, NULL, "%7.2f ", "stalled cycles per insn", ratio); - } else { + } else if (have_frontend_stalled) { print_metric(ctxp, NULL, NULL, "stalled cycles per insn", 0); } diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index f02af68..0150e78 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -72,6 +72,7 @@ typedef void (*print_metric_t)(void *ctx, const char *color, const char *unit, const char *fmt, double val); typedef void (*new_line_t )(void *ctx); +void perf_stat__init_shadow_stats(void); void perf_stat__reset_shadow_stats(void); void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count, int cpu); -- cgit v0.10.2 From 85d9691ccc96d95629939a877fd6c1f8c4724f56 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Sun, 14 Feb 2016 18:35:51 +0200 Subject: IB/mlx5: Avoid using user-index for SRQs Normal SRQs, unlike XRC SRQs, don't have user-index, therefore avoid verifying it and using it. 
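A condensed user-space sketch of the gating this change introduces (all types and helpers below are hypothetical stand-ins for the mlx5 ones): the user-index lookup runs only for XRC SRQs, while plain SRQs keep the default index and never touch the XRC context.

#include <stdbool.h>

#define DEFAULT_UIDX 0xffffffu		/* stand-in for the driver default */

struct srq_ucmd_sketch {
	unsigned int uidx;
};

static int get_user_index_sketch(const struct srq_ucmd_sketch *ucmd,
				 unsigned int *uidx)
{
	*uidx = ucmd->uidx;		/* the real helper also validates it */
	return 0;
}

int create_srq_sketch(const struct srq_ucmd_sketch *ucmd, bool is_xrc,
		      unsigned int *uidx)
{
	*uidx = DEFAULT_UIDX;

	/* Only XRC SRQs carry a user index; plain SRQs skip both the
	 * verification and the later write into the XRC context. */
	if (is_xrc)
		return get_user_index_sketch(ucmd, uidx);

	return 0;
}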
Fixes: cfb5e088e26a ('IB/mlx5: Add CQE version 1 support to user QPs and SRQs') Signed-off-by: Majd Dibbiny Reviewed-by: Matan Barak Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 4659256..a1b7122 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -75,7 +75,8 @@ static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type) static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, struct mlx5_create_srq_mbox_in **in, - struct ib_udata *udata, int buf_size, int *inlen) + struct ib_udata *udata, int buf_size, int *inlen, + int is_xrc) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_create_srq ucmd = {}; @@ -108,10 +109,12 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, drv_data - sizeof(ucmd))) return -EINVAL; - err = get_srq_user_index(to_mucontext(pd->uobject->context), - &ucmd, udata->inlen, &uidx); - if (err) - return err; + if (is_xrc) { + err = get_srq_user_index(to_mucontext(pd->uobject->context), + &ucmd, udata->inlen, &uidx); + if (err) + return err; + } srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); @@ -151,7 +154,8 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); - if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) { + if ((MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) && + is_xrc){ xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, xrc_srq_context_entry); MLX5_SET(xrc_srqc, xsrqc, user_index, uidx); @@ -170,7 +174,7 @@ err_umem: static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, struct mlx5_create_srq_mbox_in **in, int buf_size, - int *inlen) + int *inlen, int is_xrc) { int err; int i; @@ -224,7 +228,8 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; - if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) { + if ((MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) && + is_xrc){ xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, xrc_srq_context_entry); /* 0xffffff means we ask to work with cqe version 0 */ @@ -302,10 +307,14 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs, srq->msrq.max_avail_gather); + is_xrc = (init_attr->srq_type == IB_SRQT_XRC); + if (pd->uobject) - err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen); + err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen, + is_xrc); else - err = create_srq_kernel(dev, srq, &in, buf_size, &inlen); + err = create_srq_kernel(dev, srq, &in, buf_size, &inlen, + is_xrc); if (err) { mlx5_ib_warn(dev, "create srq %s failed, err %d\n", @@ -313,7 +322,6 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, goto err_srq; } - is_xrc = (init_attr->srq_type == IB_SRQT_XRC); in->ctx.state_log_sz = ilog2(srq->msrq.max); flgs = ((srq->msrq.wqe_shift - 4) | (is_xrc << 5) | (srq->wq_sig << 7)) << 24; xrcdn = 0; -- cgit v0.10.2 From 3d943c9d1cc5ad1825e46291ef5ce627e1b6b660 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Sun, 14 Feb 2016 18:35:52 +0200 Subject: IB/{core, mlx5}: Fix input len in vendor part of create_qp/srq Currently, the inlen field of the vendor's part of the command doesn't match the command buffer. 
This happens because the inlen accommodates ib_uverbs_cmd_hdr which is deducted from the in buffer. This is problematic since the vendor function could be called either from the legacy verb (where the input length mismatches the actual length) or by the extended verb (where the length matches). The vendor has no idea which function calls it and therefore has no way to know how the length variable should be treated. Fixing this by aligning the inlen to the correct length. All vendor drivers either assumed that inlen >= sizeof(vendor_uhw_cmd) or just failed wrongly (mlx5) and fixed in this patch. Fixes: cfb5e088e26a ('IB/mlx5: Add CQE version 1 support to user QPs and SRQs') Signed-off-by: Majd Dibbiny Reviewed-by: Matan Barak Reviewed-by: Haggai Eran Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 6ffc9c4..6c6fbff 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1970,7 +1970,8 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, resp_size); INIT_UDATA(&uhw, buf + sizeof(cmd), (unsigned long)cmd.response + resp_size, - in_len - sizeof(cmd), out_len - resp_size); + in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), + out_len - resp_size); memset(&cmd_ex, 0, sizeof(cmd_ex)); cmd_ex.user_handle = cmd.user_handle; @@ -3413,7 +3414,8 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof resp); ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata); if (ret) @@ -3439,7 +3441,8 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, INIT_UDATA(&udata, buf + sizeof cmd, (unsigned long) cmd.response + sizeof resp, - in_len - sizeof cmd, out_len - sizeof resp); + in_len - sizeof cmd - sizeof(struct ib_uverbs_cmd_hdr), + out_len - sizeof resp); ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata); if (ret) diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index a1b7122..3b2ddd6 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -88,13 +88,8 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, int ncont; u32 offset; u32 uidx = MLX5_IB_DEFAULT_UIDX; - int drv_data = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); - if (drv_data < 0) - return -EINVAL; - - ucmdlen = (drv_data < sizeof(ucmd)) ? - drv_data : sizeof(ucmd); + ucmdlen = min(udata->inlen, sizeof(ucmd)); if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { mlx5_ib_dbg(dev, "failed copy udata\n"); @@ -104,9 +99,9 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, if (ucmd.reserved0 || ucmd.reserved1) return -EINVAL; - if (drv_data > sizeof(ucmd) && + if (udata->inlen > sizeof(ucmd) && !ib_is_udata_cleared(udata, sizeof(ucmd), - drv_data - sizeof(ucmd))) + udata->inlen - sizeof(ucmd))) return -EINVAL; if (is_xrc) { -- cgit v0.10.2 From f16921275cc3c2442d0b95225785a601603b990f Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Tue, 16 Feb 2016 22:54:02 +0100 Subject: dmaengine: pxa_dma: fix cyclic transfers While testing audio with pxa2xx-ac97, underrun were happening while the user application was correctly feeding the music. Debug proved that the cyclic transfer is not cyclic, ie. the last descriptor did not loop on the first. 
Another issue is that the descriptor length was always set to 8192, because of an trivial operator issue. This was tested on a pxa27x platform. Fixes: a57e16cf0333 ("dmaengine: pxa: add pxa dmaengine driver") Reported-by: Vasily Khoruzhick Tested-by: Vasily Khoruzhick Signed-off-by: Robert Jarzmik Signed-off-by: Vinod Koul diff --git a/drivers/dma/pxa_dma.c b/drivers/dma/pxa_dma.c index f2a0310..debca82 100644 --- a/drivers/dma/pxa_dma.c +++ b/drivers/dma/pxa_dma.c @@ -583,6 +583,8 @@ static void set_updater_desc(struct pxad_desc_sw *sw_desc, (PXA_DCMD_LENGTH & sizeof(u32)); if (flags & DMA_PREP_INTERRUPT) updater->dcmd |= PXA_DCMD_ENDIRQEN; + if (sw_desc->cyclic) + sw_desc->hw_desc[sw_desc->nb_desc - 2]->ddadr = sw_desc->first; } static bool is_desc_completed(struct virt_dma_desc *vd) @@ -673,6 +675,10 @@ static irqreturn_t pxad_chan_handler(int irq, void *dev_id) dev_dbg(&chan->vc.chan.dev->device, "%s(): checking txd %p[%x]: completed=%d\n", __func__, vd, vd->tx.cookie, is_desc_completed(vd)); + if (to_pxad_sw_desc(vd)->cyclic) { + vchan_cyclic_callback(vd); + break; + } if (is_desc_completed(vd)) { list_del(&vd->node); vchan_cookie_complete(vd); @@ -1080,7 +1086,7 @@ pxad_prep_dma_cyclic(struct dma_chan *dchan, return NULL; pxad_get_config(chan, dir, &dcmd, &dsadr, &dtadr); - dcmd |= PXA_DCMD_ENDIRQEN | (PXA_DCMD_LENGTH | period_len); + dcmd |= PXA_DCMD_ENDIRQEN | (PXA_DCMD_LENGTH & period_len); dev_dbg(&chan->vc.chan.dev->device, "%s(): buf_addr=0x%lx len=%zu period=%zu dir=%d flags=%lx\n", __func__, (unsigned long)buf_addr, len, period_len, dir, flags); -- cgit v0.10.2 From 11d8d645343efba0c975aefe7c2cf3b33c836c75 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 1 Mar 2016 18:52:23 +0200 Subject: IB/core: Use GRH when the path hop-limit > 0 According to IBTA spec v1.3 section 12.7.19, QPs should use GRH when the path returned by the SA has hop-limit > 0. Currently, we do that only for the > 1 case, fix that. Fixes: 6d969a471ba1 ('IB/sa: Add ib_init_ah_from_path()') Signed-off-by: Or Gerlitz Reviewed-by: Jason Gunthorpe Signed-off-by: Doug Ledford diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index f334090..1e37f35 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1071,7 +1071,7 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, } } - if (rec->hop_limit > 1 || use_roce) { + if (rec->hop_limit > 0 || use_roce) { ah_attr->ah_flags = IB_AH_GRH; ah_attr->grh.dgid = rec->dgid; -- cgit v0.10.2 From b5891cfab08fe3144a616e8e734df7749fb3b7d0 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sun, 31 Jan 2016 16:22:16 +0300 Subject: ovl: fix working on distributed fs as lower layer This adds missing .d_select_inode into alternative dentry_operations. 
Signed-off-by: Konstantin Khlebnikov Fixes: 7c03b5d45b8e ("ovl: allow distributed fs as lower layer") Fixes: 4bacc9c9234c ("overlayfs: Make f_path always point to the overlay and f_inode to the underlay") Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Signed-off-by: Miklos Szeredi Cc: # 4.2+ diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 8d826bd..588a4b5 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -341,6 +341,7 @@ static const struct dentry_operations ovl_dentry_operations = { static const struct dentry_operations ovl_reval_dentry_operations = { .d_release = ovl_dentry_release, + .d_select_inode = ovl_d_select_inode, .d_revalidate = ovl_dentry_revalidate, .d_weak_revalidate = ovl_dentry_weak_revalidate, }; -- cgit v0.10.2 From ce9113bbcbf45a57c082d6603b9a9f342be3ef74 Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Fri, 8 Jan 2016 23:09:59 +0800 Subject: ovl: fix getcwd() failure after unsuccessful rmdir ovl_remove_upper() should do d_drop() only after it successfully removes the dir, otherwise a subsequent getcwd() system call will fail, breaking userspace programs. This is to fix: https://bugzilla.kernel.org/show_bug.cgi?id=110491 Signed-off-by: Rui Wang Reviewed-by: Konstantin Khlebnikov Signed-off-by: Miklos Szeredi Cc: diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index ed95272..795ab65 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -618,7 +618,8 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) * sole user of this dentry. Too tricky... Just unhash for * now. */ - d_drop(dentry); + if (!err) + d_drop(dentry); inode_unlock(dir); return err; -- cgit v0.10.2 From 45d11738969633ec07ca35d75d486bf2d8918df6 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sun, 31 Jan 2016 16:17:53 +0300 Subject: ovl: ignore lower entries when checking purity of non-directory entries After rename file dentry still holds reference to lower dentry from previous location. This doesn't matter for data access because data comes from upper dentry. But this stale lower dentry taints dentry at new location and turns it into non-pure upper. Such file leaves visible whiteout entry after remove in directory which shouldn't have whiteouts at all. Overlayfs already tracks pureness of file location in oe->opaque. This patch just uses that for detecting actual path type. Comment from Vivek Goyal's patch: Here are the details of the problem. Do following. $ mkdir upper lower work merged upper/dir/ $ touch lower/test $ sudo mount -t overlay overlay -olowerdir=lower,upperdir=upper,workdir= work merged $ mv merged/test merged/dir/ $ rm merged/dir/test $ ls -l merged/dir/ /usr/bin/ls: cannot access merged/dir/test: No such file or directory total 0 c????????? ? ? ? ? ? test Basic problem seems to be that once a file has been unlinked, a whiteout has been left behind which was not needed and hence it becomes visible. Whiteout is visible because parent dir is of not type MERGE, hence od->is_real is set during ovl_dir_open(). And that means ovl_iterate() passes on iterate handling directly to underlying fs. Underlying fs does not know/filter whiteouts so it becomes visible to user. Why did we leave a whiteout to begin with when we should not have. ovl_do_remove() checks for OVL_TYPE_PURE_UPPER() and does not leave whiteout if file is pure upper. In this case file is not found to be pure upper hence whiteout is left. So why file was not PURE_UPPER in this case? 
I think because dentry is still carrying some leftover state which was valid before rename. For example, od->numlower was set to 1 as it was a lower file. After rename, this state is not valid anymore as there is no such file in lower. Signed-off-by: Konstantin Khlebnikov Reported-by: Viktor Stanchev Suggested-by: Vivek Goyal Link: https://bugzilla.kernel.org/show_bug.cgi?id=109611 Acked-by: Vivek Goyal Signed-off-by: Miklos Szeredi Cc: diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 795ab65..52f6de5 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -904,6 +904,13 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, if (!overwrite && new_is_dir && !old_opaque && new_opaque) ovl_remove_opaque(newdentry); + /* + * Old dentry now lives in different location. Dentries in + * lowerstack are stale. We cannot drop them here because + * access to them is lockless. This could be only pure upper + * or opaque directory - numlower is zero. Or upper non-dir + * entry - its pureness is tracked by flag opaque. + */ if (old_opaque != new_opaque) { ovl_dentry_set_opaque(old, new_opaque); if (!overwrite) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 588a4b5..619ad4b 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -76,12 +76,14 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry) if (oe->__upperdentry) { type = __OVL_PATH_UPPER; - if (oe->numlower) { - if (S_ISDIR(dentry->d_inode->i_mode)) - type |= __OVL_PATH_MERGE; - } else if (!oe->opaque) { + /* + * Non-dir dentry can hold lower dentry from previous + * location. Its purity depends only on opaque flag. + */ + if (oe->numlower && S_ISDIR(dentry->d_inode->i_mode)) + type |= __OVL_PATH_MERGE; + else if (!oe->opaque) type |= __OVL_PATH_PURE; - } } else { if (oe->numlower > 1) type |= __OVL_PATH_MERGE; -- cgit v0.10.2 From b81de061fa59f17d2730aabb1b84419ef3913810 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sun, 31 Jan 2016 16:21:29 +0300 Subject: ovl: copy new uid/gid into overlayfs runtime inode Overlayfs must update uid/gid after chown, otherwise functions like inode_owner_or_capable() will check user against stale uid. Catched by xfstests generic/087, it chowns file and calls utimes. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Miklos Szeredi Cc: diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 49e2045..a4ff5d0 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -65,6 +65,8 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) inode_lock(upperdentry->d_inode); err = notify_change(upperdentry, attr, NULL); + if (!err) + ovl_copyattr(upperdentry->d_inode, dentry->d_inode); inode_unlock(upperdentry->d_inode); } ovl_drop_write(dentry); -- cgit v0.10.2 From 93125094c07d8c9ec25dff5869f191b33eb9dd6e Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 3 Mar 2016 14:52:51 -0300 Subject: [media] media.h: postpone connectors entities The representation of external connections got some heated discussions recently. As we're too close to the merge window, let's not set those entities into a stone. 
Signed-off-by: Mauro Carvalho Chehab diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h index 13e19a1..323f1af 100644 --- a/include/uapi/linux/media.h +++ b/include/uapi/linux/media.h @@ -82,10 +82,18 @@ struct media_device_info { * Connectors */ /* It is a responsibility of the entity drivers to add connectors and links */ +#ifdef __KERNEL__ + /* + * For now, it should not be used in userspace, as some + * definitions may change + */ + #define MEDIA_ENT_F_CONN_RF (MEDIA_ENT_F_BASE + 0x30001) #define MEDIA_ENT_F_CONN_SVIDEO (MEDIA_ENT_F_BASE + 0x30002) #define MEDIA_ENT_F_CONN_COMPOSITE (MEDIA_ENT_F_BASE + 0x30003) +#endif + /* * Don't touch on those. The ranges MEDIA_ENT_F_OLD_BASE and * MEDIA_ENT_F_OLD_SUBDEV_BASE are kept to keep backward compatibility -- cgit v0.10.2 From 88f8b1bb41c6208f81b6a480244533ded7b59493 Mon Sep 17 00:00:00 2001 From: Gabriel Fernandez Date: Mon, 29 Feb 2016 17:18:22 +0100 Subject: stmmac: Fix 'eth0: No PHY found' regression This patch manages the case when you have an Ethernet MAC with a "fixed link", and not connected to a normal MDIO-managed PHY device. The test of phy_bus_name was not helpful because it was never affected and replaced by the mdio test node. Signed-off-by: Gabriel Fernandez Acked-by: Giuseppe Cavallaro Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c index 0faf163..efb54f3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c @@ -199,21 +199,12 @@ int stmmac_mdio_register(struct net_device *ndev) struct stmmac_priv *priv = netdev_priv(ndev); struct stmmac_mdio_bus_data *mdio_bus_data = priv->plat->mdio_bus_data; int addr, found; - struct device_node *mdio_node = NULL; - struct device_node *child_node = NULL; + struct device_node *mdio_node = priv->plat->mdio_node; if (!mdio_bus_data) return 0; if (IS_ENABLED(CONFIG_OF)) { - for_each_child_of_node(priv->device->of_node, child_node) { - if (of_device_is_compatible(child_node, - "snps,dwmac-mdio")) { - mdio_node = child_node; - break; - } - } - if (mdio_node) { netdev_dbg(ndev, "FOUND MDIO subnode\n"); } else { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 6a52fa1..4514ba7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -110,6 +110,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac) struct device_node *np = pdev->dev.of_node; struct plat_stmmacenet_data *plat; struct stmmac_dma_cfg *dma_cfg; + struct device_node *child_node = NULL; plat = devm_kzalloc(&pdev->dev, sizeof(*plat), GFP_KERNEL); if (!plat) @@ -140,13 +141,19 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac) plat->phy_node = of_node_get(np); } + for_each_child_of_node(np, child_node) + if (of_device_is_compatible(child_node, "snps,dwmac-mdio")) { + plat->mdio_node = child_node; + break; + } + /* "snps,phy-addr" is not a standard property. Mark it as deprecated * and warn of its use. Remove this when phy node support is added. 
*/ if (of_property_read_u32(np, "snps,phy-addr", &plat->phy_addr) == 0) dev_warn(&pdev->dev, "snps,phy-addr property is deprecated\n"); - if ((plat->phy_node && !of_phy_is_fixed_link(np)) || plat->phy_bus_name) + if ((plat->phy_node && !of_phy_is_fixed_link(np)) || !plat->mdio_node) plat->mdio_bus_data = NULL; else plat->mdio_bus_data = diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index eead8ab..881a79d 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -100,6 +100,7 @@ struct plat_stmmacenet_data { int interface; struct stmmac_mdio_bus_data *mdio_bus_data; struct device_node *phy_node; + struct device_node *mdio_node; struct stmmac_dma_cfg *dma_cfg; int clk_csr; int has_gmac; -- cgit v0.10.2 From 878e3c1be2dc8a759c18bb280a7390f9ed0bd544 Mon Sep 17 00:00:00 2001 From: Igal Liberman Date: Sun, 28 Feb 2016 23:59:53 +0200 Subject: fsl/fman: Initialize fman->dev earlier Currently, in a case of error, dev_err is using fman->dev before its initialization and "(NULL device *)" is printed. This patch fixes this issue. Signed-off-by: Igal Liberman Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/freescale/fman/fman.c b/drivers/net/ethernet/freescale/fman/fman.c index 623aa1c..79a210a 100644 --- a/drivers/net/ethernet/freescale/fman/fman.c +++ b/drivers/net/ethernet/freescale/fman/fman.c @@ -2791,6 +2791,8 @@ static struct fman *read_dts_node(struct platform_device *of_dev) goto fman_free; } + fman->dev = &of_dev->dev; + return fman; fman_node_put: @@ -2845,8 +2847,6 @@ static int fman_probe(struct platform_device *of_dev) dev_set_drvdata(dev, fman); - fman->dev = dev; - dev_dbg(dev, "FMan%d probed\n", fman->dts_params.id); return 0; -- cgit v0.10.2 From 1837b2e2bcd23137766555a63867e649c0b637f0 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Mon, 29 Feb 2016 15:03:33 -0800 Subject: mld, igmp: Fix reserved tailroom calculation The current reserved_tailroom calculation fails to take hlen and tlen into account. skb: [__hlen__|__data____________|__tlen___|__extra__] ^ ^ head skb_end_offset In this representation, hlen + data + tlen is the size passed to alloc_skb. "extra" is the extra space made available in __alloc_skb because of rounding up by kmalloc. We can reorder the representation like so: [__hlen__|__data____________|__extra__|__tlen___] ^ ^ head skb_end_offset The maximum space available for ip headers and payload without fragmentation is min(mtu, data + extra). Therefore, reserved_tailroom = data + extra + tlen - min(mtu, data + extra) = skb_end_offset - hlen - min(mtu, skb_end_offset - hlen - tlen) = skb_tailroom - min(mtu, skb_tailroom - tlen) ; after skb_reserve(hlen) Compare the second line to the current expression: reserved_tailroom = skb_end_offset - min(mtu, skb_end_offset) and we can see that hlen and tlen are not taken into account. The min() in the third line can be expanded into: if mtu < skb_tailroom - tlen: reserved_tailroom = skb_tailroom - mtu else: reserved_tailroom = tlen Depending on hlen, tlen, mtu and the number of multicast address records, the current code may output skbs that have less tailroom than dev->needed_tailroom or it may output more skbs than needed because not all space available is used. Fixes: 4c672e4b ("ipv6: mld: fix add_grhead skb_over_panic for devs with large MTUs") Signed-off-by: Benjamin Poirier Acked-by: Hannes Frederic Sowa Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4ce9ff7..d3fcd45 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1985,6 +1985,30 @@ static inline void skb_reserve(struct sk_buff *skb, int len) skb->tail += len; } +/** + * skb_tailroom_reserve - adjust reserved_tailroom + * @skb: buffer to alter + * @mtu: maximum amount of headlen permitted + * @needed_tailroom: minimum amount of reserved_tailroom + * + * Set reserved_tailroom so that headlen can be as large as possible but + * not larger than mtu and tailroom cannot be smaller than + * needed_tailroom. + * The required headroom should already have been reserved before using + * this function. + */ +static inline void skb_tailroom_reserve(struct sk_buff *skb, unsigned int mtu, + unsigned int needed_tailroom) +{ + SKB_LINEAR_ASSERT(skb); + if (mtu < skb_tailroom(skb) - needed_tailroom) + /* use at most mtu */ + skb->reserved_tailroom = skb_tailroom(skb) - mtu; + else + /* use up to all available space */ + skb->reserved_tailroom = needed_tailroom; +} + #define ENCAP_TYPE_ETHER 0 #define ENCAP_TYPE_IPPROTO 1 diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 05e4cba..b3086cf 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -356,9 +356,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) skb_dst_set(skb, &rt->dst); skb->dev = dev; - skb->reserved_tailroom = skb_end_offset(skb) - - min(mtu, skb_end_offset(skb)); skb_reserve(skb, hlen); + skb_tailroom_reserve(skb, mtu, tlen); skb_reset_network_header(skb); pip = ip_hdr(skb); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 5ee56d0..d64ee7e 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1574,9 +1574,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) return NULL; skb->priority = TC_PRIO_CONTROL; - skb->reserved_tailroom = skb_end_offset(skb) - - min(mtu, skb_end_offset(skb)); skb_reserve(skb, hlen); + skb_tailroom_reserve(skb, mtu, tlen); if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) { /* : -- cgit v0.10.2 From fbe093ac9f0201939279cdfe8b0fce20ce5ef7a9 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Thu, 3 Mar 2016 14:20:14 -0300 Subject: [media] media: Sanitise the reserved fields of the G_TOPOLOGY IOCTL arguments The argument structs are used in arrays for G_TOPOLOGY IOCTL. The arguments themselves do not need to be aligned to a power of two, but aligning them up to the largest basic type alignment (u64) on common ABIs is a good thing to do. The patch changes the size of the reserved fields to 5 or 6 u32's and aligns the size of the struct to 8 bytes so we do no longer depend on the compiler to perform the alignment. While at it, add __attribute__ ((packed)) to these structs as well. Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h index 323f1af..625b38f 100644 --- a/include/uapi/linux/media.h +++ b/include/uapi/linux/media.h @@ -297,14 +297,14 @@ struct media_v2_entity { __u32 id; char name[64]; /* FIXME: move to a property? 
(RFC says so) */ __u32 function; /* Main function of the entity */ - __u16 reserved[12]; -}; + __u32 reserved[6]; +} __attribute__ ((packed)); /* Should match the specific fields at media_intf_devnode */ struct media_v2_intf_devnode { __u32 major; __u32 minor; -}; +} __attribute__ ((packed)); struct media_v2_interface { __u32 id; @@ -316,22 +316,22 @@ struct media_v2_interface { struct media_v2_intf_devnode devnode; __u32 raw[16]; }; -}; +} __attribute__ ((packed)); struct media_v2_pad { __u32 id; __u32 entity_id; __u32 flags; - __u16 reserved[9]; -}; + __u32 reserved[5]; +} __attribute__ ((packed)); struct media_v2_link { __u32 id; __u32 source_id; __u32 sink_id; __u32 flags; - __u32 reserved[5]; -}; + __u32 reserved[6]; +} __attribute__ ((packed)); struct media_v2_topology { __u64 topology_version; @@ -351,7 +351,7 @@ struct media_v2_topology { __u32 num_links; __u32 reserved4; __u64 ptr_links; -}; +} __attribute__ ((packed)); static inline void __user *media_get_uptr(__u64 arg) { -- cgit v0.10.2 From f214fc402967e1bc94ad7f39faa03db5813d6849 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Tue, 1 Mar 2016 11:07:09 +0100 Subject: tipc: Revert "tipc: use existing sk_write_queue for outgoing packet chain" reverts commit 94153e36e709e ("tipc: use existing sk_write_queue for outgoing packet chain") In Commit 94153e36e709e, we assume that we fill & empty the socket's sk_write_queue within the same lock_sock() session. This is not true if the link is congested. During congestion, the socket lock is released while we wait for the congestion to cease. This implementation causes a nullptr exception, if the user space program has several threads accessing the same socket descriptor. Consider two threads of the same program performing the following: Thread1 Thread2 -------------------- ---------------------- Enter tipc_sendmsg() Enter tipc_sendmsg() lock_sock() lock_sock() Enter tipc_link_xmit(), ret=ELINKCONG spin on socket lock.. sk_wait_event() : release_sock() grab socket lock : Enter tipc_link_xmit(), ret=0 : release_sock() Wakeup after congestion lock_sock() skb = skb_peek(pktchain); !! TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong; In this case, the second thread transmits the buffers belonging to both thread1 and thread2 successfully. When the first thread wakeup after the congestion it assumes that the pktchain is intact and operates on the skb's in it, which leads to the following exception: [2102.439969] BUG: unable to handle kernel NULL pointer dereference at 00000000000000d0 [2102.440074] IP: [] __tipc_link_xmit+0x2b0/0x4d0 [tipc] [2102.440074] PGD 3fa3f067 PUD 3fa6b067 PMD 0 [2102.440074] Oops: 0000 [#1] SMP [2102.440074] CPU: 2 PID: 244 Comm: sender Not tainted 3.12.28 #1 [2102.440074] RIP: 0010:[] [] __tipc_link_xmit+0x2b0/0x4d0 [tipc] [...] [2102.440074] Call Trace: [2102.440074] [] ? schedule+0x29/0x70 [2102.440074] [] ? tipc_node_unlock+0x46/0x170 [tipc] [2102.440074] [] tipc_link_xmit+0x51/0xf0 [tipc] [2102.440074] [] tipc_send_stream+0x11e/0x4f0 [tipc] [2102.440074] [] ? __wake_up_sync+0x20/0x20 [2102.440074] [] tipc_send_packet+0x1c/0x20 [tipc] [2102.440074] [] sock_sendmsg+0xa8/0xd0 [2102.440074] [] ? release_sock+0x145/0x170 [2102.440074] [] ___sys_sendmsg+0x3d8/0x3e0 [2102.440074] [] ? _raw_spin_unlock+0xe/0x10 [2102.440074] [] ? handle_mm_fault+0x6ca/0x9d0 [2102.440074] [] ? set_next_entity+0x85/0xa0 [2102.440074] [] ? _raw_spin_unlock_irq+0xe/0x20 [2102.440074] [] ? finish_task_switch+0x5c/0xc0 [2102.440074] [] ? 
__schedule+0x34c/0x950 [2102.440074] [] __sys_sendmsg+0x42/0x80 [2102.440074] [] SyS_sendmsg+0x12/0x20 [2102.440074] [] system_call_fastpath+0x16/0x1b In this commit, we maintain the skb list always in the stack. Signed-off-by: Parthasarathy Bhuvaragan Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: David S. Miller diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 69c2905..4d420bb 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -673,7 +673,7 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, struct tipc_sock *tsk = tipc_sk(sk); struct net *net = sock_net(sk); struct tipc_msg *mhdr = &tsk->phdr; - struct sk_buff_head *pktchain = &sk->sk_write_queue; + struct sk_buff_head pktchain; struct iov_iter save = msg->msg_iter; uint mtu; int rc; @@ -687,14 +687,16 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, msg_set_nameupper(mhdr, seq->upper); msg_set_hdr_sz(mhdr, MCAST_H_SIZE); + skb_queue_head_init(&pktchain); + new_mtu: mtu = tipc_bcast_get_mtu(net); - rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, pktchain); + rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, &pktchain); if (unlikely(rc < 0)) return rc; do { - rc = tipc_bcast_xmit(net, pktchain); + rc = tipc_bcast_xmit(net, &pktchain); if (likely(!rc)) return dsz; @@ -704,7 +706,7 @@ new_mtu: if (!rc) continue; } - __skb_queue_purge(pktchain); + __skb_queue_purge(&pktchain); if (rc == -EMSGSIZE) { msg->msg_iter = save; goto new_mtu; @@ -863,7 +865,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz) struct net *net = sock_net(sk); struct tipc_msg *mhdr = &tsk->phdr; u32 dnode, dport; - struct sk_buff_head *pktchain = &sk->sk_write_queue; + struct sk_buff_head pktchain; struct sk_buff *skb; struct tipc_name_seq *seq; struct iov_iter save; @@ -924,17 +926,18 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz) msg_set_hdr_sz(mhdr, BASIC_H_SIZE); } + skb_queue_head_init(&pktchain); save = m->msg_iter; new_mtu: mtu = tipc_node_get_mtu(net, dnode, tsk->portid); - rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, pktchain); + rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, &pktchain); if (rc < 0) return rc; do { - skb = skb_peek(pktchain); + skb = skb_peek(&pktchain); TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong; - rc = tipc_node_xmit(net, pktchain, dnode, tsk->portid); + rc = tipc_node_xmit(net, &pktchain, dnode, tsk->portid); if (likely(!rc)) { if (sock->state != SS_READY) sock->state = SS_CONNECTING; @@ -946,7 +949,7 @@ new_mtu: if (!rc) continue; } - __skb_queue_purge(pktchain); + __skb_queue_purge(&pktchain); if (rc == -EMSGSIZE) { m->msg_iter = save; goto new_mtu; @@ -1016,7 +1019,7 @@ static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz) struct net *net = sock_net(sk); struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *mhdr = &tsk->phdr; - struct sk_buff_head *pktchain = &sk->sk_write_queue; + struct sk_buff_head pktchain; DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); u32 portid = tsk->portid; int rc = -EINVAL; @@ -1044,17 +1047,19 @@ static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz) timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); dnode = tsk_peer_node(tsk); + skb_queue_head_init(&pktchain); next: save = m->msg_iter; mtu = tsk->max_pkt; send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE); - rc = tipc_msg_build(mhdr, m, sent, send, mtu, pktchain); + rc = tipc_msg_build(mhdr, m, sent, send, mtu, &pktchain); if (unlikely(rc < 0)) return rc; + do 
{ if (likely(!tsk_conn_cong(tsk))) { - rc = tipc_node_xmit(net, pktchain, dnode, portid); + rc = tipc_node_xmit(net, &pktchain, dnode, portid); if (likely(!rc)) { tsk->sent_unacked++; sent += send; @@ -1063,7 +1068,7 @@ next: goto next; } if (rc == -EMSGSIZE) { - __skb_queue_purge(pktchain); + __skb_queue_purge(&pktchain); tsk->max_pkt = tipc_node_get_mtu(net, dnode, portid); m->msg_iter = save; @@ -1077,7 +1082,7 @@ next: rc = tipc_wait_for_sndpkt(sock, &timeo); } while (!rc); - __skb_queue_purge(pktchain); + __skb_queue_purge(&pktchain); return sent ? sent : rc; } -- cgit v0.10.2 From bf13c94ccb33c3182efc92ce4989506a0f541243 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Tue, 1 Mar 2016 14:31:02 +0100 Subject: qmi_wwan: add Sierra Wireless EM74xx device ID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MC74xx and EM74xx modules use different IDs by default, according to the Lenovo EM7455 driver for Windows. Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 4ce4ec1..a3a4ccf 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -861,8 +861,10 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x1199, 0x9056, 8)}, /* Sierra Wireless Modem */ {QMI_FIXED_INTF(0x1199, 0x9057, 8)}, {QMI_FIXED_INTF(0x1199, 0x9061, 8)}, /* Sierra Wireless Modem */ - {QMI_FIXED_INTF(0x1199, 0x9071, 8)}, /* Sierra Wireless MC74xx/EM74xx */ - {QMI_FIXED_INTF(0x1199, 0x9071, 10)}, /* Sierra Wireless MC74xx/EM74xx */ + {QMI_FIXED_INTF(0x1199, 0x9071, 8)}, /* Sierra Wireless MC74xx */ + {QMI_FIXED_INTF(0x1199, 0x9071, 10)}, /* Sierra Wireless MC74xx */ + {QMI_FIXED_INTF(0x1199, 0x9079, 8)}, /* Sierra Wireless EM74xx */ + {QMI_FIXED_INTF(0x1199, 0x9079, 10)}, /* Sierra Wireless EM74xx */ {QMI_FIXED_INTF(0x1bbb, 0x011e, 4)}, /* Telekom Speedstick LTE II (Alcatel One Touch L100V LTE) */ {QMI_FIXED_INTF(0x1bbb, 0x0203, 2)}, /* Alcatel L800MA */ {QMI_FIXED_INTF(0x2357, 0x0201, 4)}, /* TP-LINK HSUPA Modem MA180 */ -- cgit v0.10.2 From 5d150a985520bbe3cb2aa1ceef24a7e32f20c15f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 1 Mar 2016 16:15:16 +0100 Subject: ipv6: re-enable fragment header matching in ipv6_find_hdr When ipv6_find_hdr is used to find a fragment header (caller specifies target NEXTHDR_FRAGMENT) we erronously return -ENOENT for all fragments with nonzero offset. Before commit 9195bb8e381d, when target was specified, we did not enter the exthdr walk loop as nexthdr == target so this used to work. Now we do (so we can skip empty route headers). When we then stumble upon a frag with nonzero frag_off we must return -ENOENT ("header not found") only if the caller did not specifically request NEXTHDR_FRAGMENT. This allows nfables exthdr expression to match ipv6 fragments, e.g. via nft add rule ip6 filter input frag frag-off gt 0 Fixes: 9195bb8e381d ("ipv6: improve ipv6_find_hdr() to skip empty routing headers") Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index 5c5d23e..9508a20 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -257,7 +257,11 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, *fragoff = _frag_off; return hp->nexthdr; } - return -ENOENT; + if (!found) + return -ENOENT; + if (fragoff) + *fragoff = _frag_off; + break; } hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { -- cgit v0.10.2 From de89e854bcc71ebaf30d415a0c015d1cb6c68856 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Tue, 1 Mar 2016 10:20:09 -0600 Subject: ibmvnic: Fix ibmvnic_capability struct The ibmvnic_capability struct was defined incorrectly. The last two elements of the struct are in the wrong order. In addition, the number element should be 64-bit. Byteswapping functions are updated as well. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 7d657084..6e9e16ee 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1348,44 +1348,44 @@ static void init_sub_crqs(struct ibmvnic_adapter *adapter, int retry) crq.request_capability.cmd = REQUEST_CAPABILITY; crq.request_capability.capability = cpu_to_be16(REQ_TX_QUEUES); - crq.request_capability.number = cpu_to_be32(adapter->req_tx_queues); + crq.request_capability.number = cpu_to_be64(adapter->req_tx_queues); ibmvnic_send_crq(adapter, &crq); crq.request_capability.capability = cpu_to_be16(REQ_RX_QUEUES); - crq.request_capability.number = cpu_to_be32(adapter->req_rx_queues); + crq.request_capability.number = cpu_to_be64(adapter->req_rx_queues); ibmvnic_send_crq(adapter, &crq); crq.request_capability.capability = cpu_to_be16(REQ_RX_ADD_QUEUES); - crq.request_capability.number = cpu_to_be32(adapter->req_rx_add_queues); + crq.request_capability.number = cpu_to_be64(adapter->req_rx_add_queues); ibmvnic_send_crq(adapter, &crq); crq.request_capability.capability = cpu_to_be16(REQ_TX_ENTRIES_PER_SUBCRQ); crq.request_capability.number = - cpu_to_be32(adapter->req_tx_entries_per_subcrq); + cpu_to_be64(adapter->req_tx_entries_per_subcrq); ibmvnic_send_crq(adapter, &crq); crq.request_capability.capability = cpu_to_be16(REQ_RX_ADD_ENTRIES_PER_SUBCRQ); crq.request_capability.number = - cpu_to_be32(adapter->req_rx_add_entries_per_subcrq); + cpu_to_be64(adapter->req_rx_add_entries_per_subcrq); ibmvnic_send_crq(adapter, &crq); crq.request_capability.capability = cpu_to_be16(REQ_MTU); - crq.request_capability.number = cpu_to_be32(adapter->req_mtu); + crq.request_capability.number = cpu_to_be64(adapter->req_mtu); ibmvnic_send_crq(adapter, &crq); if (adapter->netdev->flags & IFF_PROMISC) { if (adapter->promisc_supported) { crq.request_capability.capability = cpu_to_be16(PROMISC_REQUESTED); - crq.request_capability.number = cpu_to_be32(1); + crq.request_capability.number = cpu_to_be64(1); ibmvnic_send_crq(adapter, &crq); } } else { crq.request_capability.capability = cpu_to_be16(PROMISC_REQUESTED); - crq.request_capability.number = cpu_to_be32(0); + crq.request_capability.number = cpu_to_be64(0); ibmvnic_send_crq(adapter, &crq); } @@ -2312,93 +2312,93 @@ static void handle_query_cap_rsp(union ibmvnic_crq *crq, switch (be16_to_cpu(crq->query_capability.capability)) { case MIN_TX_QUEUES: adapter->min_tx_queues = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "min_tx_queues = %lld\n", adapter->min_tx_queues); break; case 
MIN_RX_QUEUES: adapter->min_rx_queues = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "min_rx_queues = %lld\n", adapter->min_rx_queues); break; case MIN_RX_ADD_QUEUES: adapter->min_rx_add_queues = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "min_rx_add_queues = %lld\n", adapter->min_rx_add_queues); break; case MAX_TX_QUEUES: adapter->max_tx_queues = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "max_tx_queues = %lld\n", adapter->max_tx_queues); break; case MAX_RX_QUEUES: adapter->max_rx_queues = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "max_rx_queues = %lld\n", adapter->max_rx_queues); break; case MAX_RX_ADD_QUEUES: adapter->max_rx_add_queues = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "max_rx_add_queues = %lld\n", adapter->max_rx_add_queues); break; case MIN_TX_ENTRIES_PER_SUBCRQ: adapter->min_tx_entries_per_subcrq = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "min_tx_entries_per_subcrq = %lld\n", adapter->min_tx_entries_per_subcrq); break; case MIN_RX_ADD_ENTRIES_PER_SUBCRQ: adapter->min_rx_add_entries_per_subcrq = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "min_rx_add_entrs_per_subcrq = %lld\n", adapter->min_rx_add_entries_per_subcrq); break; case MAX_TX_ENTRIES_PER_SUBCRQ: adapter->max_tx_entries_per_subcrq = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "max_tx_entries_per_subcrq = %lld\n", adapter->max_tx_entries_per_subcrq); break; case MAX_RX_ADD_ENTRIES_PER_SUBCRQ: adapter->max_rx_add_entries_per_subcrq = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "max_rx_add_entrs_per_subcrq = %lld\n", adapter->max_rx_add_entries_per_subcrq); break; case TCP_IP_OFFLOAD: adapter->tcp_ip_offload = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "tcp_ip_offload = %lld\n", adapter->tcp_ip_offload); break; case PROMISC_SUPPORTED: adapter->promisc_supported = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "promisc_supported = %lld\n", adapter->promisc_supported); break; case MIN_MTU: - adapter->min_mtu = be32_to_cpu(crq->query_capability.number); + adapter->min_mtu = be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "min_mtu = %lld\n", adapter->min_mtu); break; case MAX_MTU: - adapter->max_mtu = be32_to_cpu(crq->query_capability.number); + adapter->max_mtu = be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "max_mtu = %lld\n", adapter->max_mtu); break; case MAX_MULTICAST_FILTERS: adapter->max_multicast_filters = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "max_multicast_filters = %lld\n", adapter->max_multicast_filters); break; case VLAN_HEADER_INSERTION: adapter->vlan_header_insertion = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); if (adapter->vlan_header_insertion) netdev->features |= NETIF_F_HW_VLAN_STAG_TX; netdev_dbg(netdev, "vlan_header_insertion = %lld\n", @@ -2406,43 
+2406,43 @@ static void handle_query_cap_rsp(union ibmvnic_crq *crq, break; case MAX_TX_SG_ENTRIES: adapter->max_tx_sg_entries = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "max_tx_sg_entries = %lld\n", adapter->max_tx_sg_entries); break; case RX_SG_SUPPORTED: adapter->rx_sg_supported = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "rx_sg_supported = %lld\n", adapter->rx_sg_supported); break; case OPT_TX_COMP_SUB_QUEUES: adapter->opt_tx_comp_sub_queues = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "opt_tx_comp_sub_queues = %lld\n", adapter->opt_tx_comp_sub_queues); break; case OPT_RX_COMP_QUEUES: adapter->opt_rx_comp_queues = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "opt_rx_comp_queues = %lld\n", adapter->opt_rx_comp_queues); break; case OPT_RX_BUFADD_Q_PER_RX_COMP_Q: adapter->opt_rx_bufadd_q_per_rx_comp_q = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "opt_rx_bufadd_q_per_rx_comp_q = %lld\n", adapter->opt_rx_bufadd_q_per_rx_comp_q); break; case OPT_TX_ENTRIES_PER_SUBCRQ: adapter->opt_tx_entries_per_subcrq = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "opt_tx_entries_per_subcrq = %lld\n", adapter->opt_tx_entries_per_subcrq); break; case OPT_RXBA_ENTRIES_PER_SUBCRQ: adapter->opt_rxba_entries_per_subcrq = - be32_to_cpu(crq->query_capability.number); + be64_to_cpu(crq->query_capability.number); netdev_dbg(netdev, "opt_rxba_entries_per_subcrq = %lld\n", adapter->opt_rxba_entries_per_subcrq); break; diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index 1242925..1a9993c 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -319,10 +319,8 @@ struct ibmvnic_capability { u8 first; u8 cmd; __be16 capability; /* one of ibmvnic_capabilities */ + __be64 number; struct ibmvnic_rc rc; - __be32 number; /*FIX: should be __be64, but I'm getting the least - * significant word first - */ } __packed __aligned(8); struct ibmvnic_login { -- cgit v0.10.2 From e866863066a4e6c42117434dd5cd35c9842f5b82 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 1 Mar 2016 17:37:58 +0100 Subject: net: ethernet: renesas: ravb_main: don't open code of_device_get_match_data() This change will also make Coverity happy by avoiding a theoretical NULL pointer dereference; yet another reason is to use the above helper function to tighten the code and make it more readable. Signed-off-by: Wolfram Sang Reviewed-by: Simon Horman Acked-by: Geert Uytterhoeven Signed-off-by: David S. 
Miller diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 744d780..86449c3 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1722,7 +1722,6 @@ static int ravb_set_gti(struct net_device *ndev) static int ravb_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; - const struct of_device_id *match; struct ravb_private *priv; enum ravb_chip_id chip_id; struct net_device *ndev; @@ -1754,8 +1753,7 @@ static int ravb_probe(struct platform_device *pdev) ndev->base_addr = res->start; ndev->dma = -1; - match = of_match_device(of_match_ptr(ravb_match_table), &pdev->dev); - chip_id = (enum ravb_chip_id)match->data; + chip_id = (enum ravb_chip_id)of_device_get_match_data(&pdev->dev); if (chip_id == RCAR_GEN3) irq = platform_get_irq_byname(pdev, "ch22"); -- cgit v0.10.2 From 42a67c9b0c0853f6a1159bd54ed0ba50df9f9ad8 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 1 Mar 2016 17:37:59 +0100 Subject: net: ethernet: renesas: sh_eth: don't open code of_device_get_match_data() This change will also make Coverity happy by avoiding a theoretical NULL pointer dereference; yet another reason is to use the above helper function to tighten the code and make it more readable. Signed-off-by: Wolfram Sang Reviewed-by: Simon Horman Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index dfa9e59..7384499 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -3061,15 +3061,11 @@ static int sh_eth_drv_probe(struct platform_device *pdev) mdp->ether_link_active_low = pd->ether_link_active_low; /* set cpu data */ - if (id) { + if (id) mdp->cd = (struct sh_eth_cpu_data *)id->driver_data; - } else { - const struct of_device_id *match; + else + mdp->cd = (struct sh_eth_cpu_data *)of_device_get_match_data(&pdev->dev); - match = of_match_device(of_match_ptr(sh_eth_match_table), - &pdev->dev); - mdp->cd = (struct sh_eth_cpu_data *)match->data; - } mdp->reg_offset = sh_eth_get_register_offset(mdp->cd->register_type); if (!mdp->reg_offset) { dev_err(&pdev->dev, "Unknown register type (%d)\n", -- cgit v0.10.2 From 7bcd79ac50d9d83350a835bdb91c04ac9e098412 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 26 Feb 2016 23:40:50 +0800 Subject: block: bio: introduce helpers to get the 1st and last bvec The bio passed to bio_will_gap() may be fast cloned from upper layer(dm, md, bcache, fs, ...), or from bio splitting in block core. Unfortunately bio_will_gap() just figures out the last bvec via 'bi_io_vec[prev->bi_vcnt - 1]' directly, and this way is obviously wrong. This patch introduces two helpers for getting the first and last bvec of one bio for fixing the issue. 
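The underlying problem is that a fast-cloned or split bio shares its parent's bi_io_vec, so bi_io_vec[bi_vcnt - 1] describes the parent's last segment rather than the clone's; the clone's real last segment must be derived from its iterator. For illustration, a minimal userspace sketch of that derivation, using invented stand-in types rather than the kernel's bio structures:

    #include <stdio.h>

    /* A data segment; only the length matters for this illustration. */
    struct seg {
        unsigned int len;
    };

    /*
     * A clone's view of a shared segment array: it starts part-way into
     * segment start_idx (start_done bytes already consumed) and covers
     * only size bytes in total.
     */
    struct clone_view {
        const struct seg *vec;
        unsigned int start_idx;
        unsigned int start_done;
        unsigned int size;
    };

    /*
     * Advance the view to its end to find the segment backing its last
     * byte, trimming the length if the view ends mid-segment.  Same idea
     * as walking bi_iter instead of trusting bi_vcnt.
     */
    static void clone_last_seg(const struct clone_view *v,
                               unsigned int *idx, unsigned int *len)
    {
        unsigned int i = v->start_idx;
        unsigned int avail = v->vec[i].len - v->start_done;
        unsigned int left = v->size;

        while (left > avail) {
            left -= avail;
            i++;
            avail = v->vec[i].len;
        }
        *idx = i;
        *len = left;
    }

    int main(void)
    {
        struct seg vec[] = { { 4096 }, { 4096 }, { 4096 } };
        /* clone skips the first 512 bytes and covers 6144 bytes */
        struct clone_view v = { vec, 0, 512, 6144 };
        unsigned int idx, len;

        clone_last_seg(&v, &idx, &len);
        printf("last segment: index %u, %u bytes\n", idx, len); /* 1, 2560 */
        return 0;
    }
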
Cc: stable@vger.kernel.org Reported-by: Sagi Grimberg Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/include/linux/bio.h b/include/linux/bio.h index 5349e68..cb68888 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -310,6 +310,43 @@ static inline void bio_clear_flag(struct bio *bio, unsigned int bit) bio->bi_flags &= ~(1U << bit); } +static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv) +{ + *bv = bio_iovec(bio); +} + +static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) +{ + struct bvec_iter iter = bio->bi_iter; + int idx; + + if (!bio_flagged(bio, BIO_CLONED)) { + *bv = bio->bi_io_vec[bio->bi_vcnt - 1]; + return; + } + + if (unlikely(!bio_multiple_segments(bio))) { + *bv = bio_iovec(bio); + return; + } + + bio_advance_iter(bio, &iter, iter.bi_size); + + if (!iter.bi_bvec_done) + idx = iter.bi_idx - 1; + else /* in the middle of bvec */ + idx = iter.bi_idx; + + *bv = bio->bi_io_vec[idx]; + + /* + * iter.bi_bvec_done records actual length of the last bvec + * if this bio ends in the middle of one io vector + */ + if (iter.bi_bvec_done) + bv->bv_len = iter.bi_bvec_done; +} + enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ -- cgit v0.10.2 From e0af29171aa8912e1ca95023b75ef336cd70d661 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 26 Feb 2016 23:40:51 +0800 Subject: block: check virt boundary in bio_will_gap() In the following patch, the way for figuring out the last bvec will be changed with a bit cost introduced, so return immediately if the queue doesn't have virt boundary limit. Actually most of devices have not this limit. Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4571ef1..cd06a41 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1372,6 +1372,13 @@ static inline void put_dev_sector(Sector p) page_cache_release(p.v); } +static inline bool __bvec_gap_to_prev(struct request_queue *q, + struct bio_vec *bprv, unsigned int offset) +{ + return offset || + ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); +} + /* * Check if adding a bio_vec after bprv with offset would create a gap in * the SG list. Most drivers don't care about this, but some do. @@ -1381,18 +1388,17 @@ static inline bool bvec_gap_to_prev(struct request_queue *q, { if (!queue_virt_boundary(q)) return false; - return offset || - ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); + return __bvec_gap_to_prev(q, bprv, offset); } static inline bool bio_will_gap(struct request_queue *q, struct bio *prev, struct bio *next) { - if (!bio_has_data(prev)) + if (!bio_has_data(prev) || !queue_virt_boundary(q)) return false; - return bvec_gap_to_prev(q, &prev->bi_io_vec[prev->bi_vcnt - 1], - next->bi_io_vec[0].bv_offset); + return __bvec_gap_to_prev(q, &prev->bi_io_vec[prev->bi_vcnt - 1], + next->bi_io_vec[0].bv_offset); } static inline bool req_gap_back_merge(struct request *req, struct bio *bio) -- cgit v0.10.2 From 25e71a99f10e444cd00bb2ebccb11e1c9fb672b1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 26 Feb 2016 23:40:52 +0800 Subject: block: get the 1st and last bvec via helpers This patch applies the two introduced helpers to figure out the 1st and last bvec, and fixes the original way after bio splitting. 
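The virt-boundary rule these changes preserve is compact enough to show on its own: two adjacent data segments need separate scatter/gather entries unless the first ends exactly on the device's virtual boundary and the second starts at offset zero. A standalone sketch of that check, with plain integers in place of bio_vecs and a boundary value chosen only for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * A device with a "virt boundary" of (mask + 1) bytes can only merge
     * two segments if the first ends on that boundary and the second
     * starts at offset 0; otherwise the pair creates an SG gap.  Mirrors
     * the shape of __bvec_gap_to_prev().
     */
    static bool creates_gap(unsigned long boundary_mask,
                            unsigned int prev_offset, unsigned int prev_len,
                            unsigned int next_offset)
    {
        if (!boundary_mask)             /* no boundary limit: never a gap */
            return false;
        return next_offset || ((prev_offset + prev_len) & boundary_mask);
    }

    int main(void)
    {
        unsigned long mask = 4096 - 1;  /* e.g. a 4 KiB virt boundary */

        printf("%d\n", creates_gap(mask, 0, 4096, 0));   /* 0: mergeable */
        printf("%d\n", creates_gap(mask, 0, 2048, 0));   /* 1: prev ends mid-page */
        printf("%d\n", creates_gap(mask, 0, 4096, 512)); /* 1: next starts mid-page */
        return 0;
    }
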
Cc: stable@vger.kernel.org Reported-by: Sagi Grimberg Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cd06a41..d7f6bca 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1394,11 +1394,16 @@ static inline bool bvec_gap_to_prev(struct request_queue *q, static inline bool bio_will_gap(struct request_queue *q, struct bio *prev, struct bio *next) { - if (!bio_has_data(prev) || !queue_virt_boundary(q)) - return false; + if (bio_has_data(prev) && queue_virt_boundary(q)) { + struct bio_vec pb, nb; + + bio_get_last_bvec(prev, &pb); + bio_get_first_bvec(next, &nb); - return __bvec_gap_to_prev(q, &prev->bi_io_vec[prev->bi_vcnt - 1], - next->bi_io_vec[0].bv_offset); + return __bvec_gap_to_prev(q, &pb, nb.bv_offset); + } + + return false; } static inline bool req_gap_back_merge(struct request *req, struct bio *bio) -- cgit v0.10.2 From e827091cb1bcd8e718ac3657845fb809c0b93324 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 26 Feb 2016 23:40:53 +0800 Subject: block: merge: get the 1st and last bvec via helpers This patch applies the two introduced helpers to figure out the 1st and last bvec. Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/block/blk-merge.c b/block/blk-merge.c index 888a7fe..2613531 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -304,7 +304,6 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, struct bio *nxt) { struct bio_vec end_bv = { NULL }, nxt_bv; - struct bvec_iter iter; if (!blk_queue_cluster(q)) return 0; @@ -316,11 +315,8 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, if (!bio_has_data(bio)) return 1; - bio_for_each_segment(end_bv, bio, iter) - if (end_bv.bv_len == iter.bi_size) - break; - - nxt_bv = bio_iovec(nxt); + bio_get_last_bvec(bio, &end_bv); + bio_get_first_bvec(nxt, &nxt_bv); if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv)) return 0; -- cgit v0.10.2 From b00a726a9fd82ddd4c10344e46f0d371e1674303 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 24 Feb 2016 09:15:52 -0700 Subject: NVMe: Don't unmap controller registers on reset Unmapping the registers on reset or shutdown is not necessary. Keeping the mapping simplifies reset handling. 
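The structural point is a lifetime split: the BAR mapping gets probe-to-remove lifetime, while the PCI enable/interrupt state keeps reset-to-reset lifetime. A toy userspace model of that split, with invented names that do not correspond to the driver's functions:

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Two lifetimes: "mapped" is acquired once at probe and survives every
     * reset; "enabled" is torn down and rebuilt by each reset.  Purely a
     * shape illustration, not the NVMe driver's code.
     */
    struct toy_dev {
        bool mapped;
        bool enabled;
    };

    static void toy_probe(struct toy_dev *d)  { d->mapped = true; d->enabled = true; }

    static void toy_reset(struct toy_dev *d)
    {
        d->enabled = false;             /* disable, but keep registers mapped */
        d->enabled = true;              /* re-enable using the same mapping */
    }

    static void toy_remove(struct toy_dev *d) { d->enabled = false; d->mapped = false; }

    int main(void)
    {
        struct toy_dev d = { false, false };

        toy_probe(&d);
        toy_reset(&d);                  /* no unmap/ioremap churn in this path */
        toy_remove(&d);
        printf("mapped=%d enabled=%d\n", d.mapped, d.enabled);
        return 0;
    }
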
Signed-off-by: Keith Busch Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index a128672..2ea3e39 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1694,10 +1694,10 @@ static int nvme_dev_add(struct nvme_dev *dev) return 0; } -static int nvme_dev_map(struct nvme_dev *dev) +static int nvme_pci_enable(struct nvme_dev *dev) { u64 cap; - int bars, result = -ENOMEM; + int result = -ENOMEM; struct pci_dev *pdev = to_pci_dev(dev->dev); if (pci_enable_device_mem(pdev)) @@ -1705,24 +1705,14 @@ static int nvme_dev_map(struct nvme_dev *dev) dev->entry[0].vector = pdev->irq; pci_set_master(pdev); - bars = pci_select_bars(pdev, IORESOURCE_MEM); - if (!bars) - goto disable_pci; - - if (pci_request_selected_regions(pdev, bars, "nvme")) - goto disable_pci; if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) goto disable; - dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); - if (!dev->bar) - goto disable; - if (readl(dev->bar + NVME_REG_CSTS) == -1) { result = -ENODEV; - goto unmap; + goto disable; } /* @@ -1732,7 +1722,7 @@ static int nvme_dev_map(struct nvme_dev *dev) if (!pdev->irq) { result = pci_enable_msix(pdev, dev->entry, 1); if (result < 0) - goto unmap; + goto disable; } cap = lo_hi_readq(dev->bar + NVME_REG_CAP); @@ -1759,18 +1749,20 @@ static int nvme_dev_map(struct nvme_dev *dev) pci_save_state(pdev); return 0; - unmap: - iounmap(dev->bar); - dev->bar = NULL; disable: - pci_release_regions(pdev); - disable_pci: pci_disable_device(pdev); return result; } static void nvme_dev_unmap(struct nvme_dev *dev) { + if (dev->bar) + iounmap(dev->bar); + pci_release_regions(to_pci_dev(dev->dev)); +} + +static void nvme_pci_disable(struct nvme_dev *dev) +{ struct pci_dev *pdev = to_pci_dev(dev->dev); if (pdev->msi_enabled) @@ -1778,12 +1770,6 @@ static void nvme_dev_unmap(struct nvme_dev *dev) else if (pdev->msix_enabled) pci_disable_msix(pdev); - if (dev->bar) { - iounmap(dev->bar); - dev->bar = NULL; - pci_release_regions(pdev); - } - if (pci_is_enabled(pdev)) { pci_disable_pcie_error_reporting(pdev); pci_disable_device(pdev); @@ -1842,7 +1828,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) nvme_dev_list_remove(dev); mutex_lock(&dev->shutdown_lock); - if (dev->bar) { + if (pci_is_enabled(to_pci_dev(dev->dev))) { nvme_stop_queues(&dev->ctrl); csts = readl(dev->bar + NVME_REG_CSTS); } @@ -1855,7 +1841,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) nvme_disable_io_queues(dev); nvme_disable_admin_queue(dev, shutdown); } - nvme_dev_unmap(dev); + nvme_pci_disable(dev); for (i = dev->queue_count - 1; i >= 0; i--) nvme_clear_queue(dev->queues[i]); @@ -1911,12 +1897,12 @@ static void nvme_reset_work(struct work_struct *work) * If we're called to reset a live controller first shut it down before * moving on. 
*/ - if (dev->bar) + if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) nvme_dev_disable(dev, false); set_bit(NVME_CTRL_RESETTING, &dev->flags); - result = nvme_dev_map(dev); + result = nvme_pci_enable(dev); if (result) goto out; @@ -2042,6 +2028,27 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { .free_ctrl = nvme_pci_free_ctrl, }; +static int nvme_dev_map(struct nvme_dev *dev) +{ + int bars; + struct pci_dev *pdev = to_pci_dev(dev->dev); + + bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (!bars) + return -ENODEV; + if (pci_request_selected_regions(pdev, bars, "nvme")) + return -ENODEV; + + dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); + if (!dev->bar) + goto release; + + return 0; + release: + pci_release_regions(pdev); + return -ENODEV; +} + static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { int node, result = -ENOMEM; @@ -2066,6 +2073,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev->dev = get_device(&pdev->dev); pci_set_drvdata(pdev, dev); + result = nvme_dev_map(dev); + if (result) + goto free; + INIT_LIST_HEAD(&dev->node); INIT_WORK(&dev->scan_work, nvme_dev_scan); INIT_WORK(&dev->reset_work, nvme_reset_work); @@ -2089,6 +2100,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) nvme_release_prp_pools(dev); put_pci: put_device(dev->dev); + nvme_dev_unmap(dev); free: kfree(dev->queues); kfree(dev->entry); @@ -2126,6 +2138,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_free_queues(dev, 0); nvme_release_cmb(dev); nvme_release_prp_pools(dev); + nvme_dev_unmap(dev); nvme_put_ctrl(&dev->ctrl); } -- cgit v0.10.2 From 075790ebba4a1eb297f9875e581b55c0382b1f3d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 24 Feb 2016 09:15:53 -0700 Subject: NVMe: Use IDA for namespace disk naming A namespace may be detached from a controller, but a user may be holding a reference to it. Attaching a new namespace with the same NSID will create duplicate names when using the NSID to name the disk. This patch uses an IDA that is released only when the last reference is released instead of using the namespace ID. 
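The naming scheme amounts to a small allocator with release-on-last-put semantics: as long as any holder of the old gendisk keeps its reference, the instance number stays reserved, so a re-attached namespace with the same NSID gets a fresh name. A bitmap-based userspace stand-in for the ida_simple_get()/ida_simple_remove() pair, for illustration only:

    #include <stdio.h>

    #define MAX_IDS 64

    static unsigned char used[MAX_IDS];

    /* Hand out the lowest free slot, starting at 1 like the driver does. */
    static int toy_ida_get(void)
    {
        for (int i = 1; i < MAX_IDS; i++) {
            if (!used[i]) {
                used[i] = 1;
                return i;
            }
        }
        return -1;
    }

    /* Called only when the last reference to the namespace is dropped. */
    static void toy_ida_put(int id) { used[id] = 0; }

    int main(void)
    {
        int a = toy_ida_get();          /* instance 1 -> nvme0n1 */
        int b = toy_ida_get();          /* instance 2 -> nvme0n2 */

        /* detach a, but a user still holds a reference: no put yet */
        int c = toy_ida_get();          /* gets 3, never a duplicate of 1 */

        printf("a=%d b=%d c=%d\n", a, b, c);
        toy_ida_put(a);                 /* last reference finally dropped */
        toy_ida_put(b);
        toy_ida_put(c);
        return 0;
    }
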
Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3cd921e..6c39dbf 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -55,8 +55,9 @@ static void nvme_free_ns(struct kref *kref) ns->disk->private_data = NULL; spin_unlock(&dev_list_lock); - nvme_put_ctrl(ns->ctrl); put_disk(ns->disk); + ida_simple_remove(&ns->ctrl->ns_ida, ns->instance); + nvme_put_ctrl(ns->ctrl); kfree(ns); } @@ -1118,9 +1119,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (!ns) return; + ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL); + if (ns->instance < 0) + goto out_free_ns; + ns->queue = blk_mq_init_queue(ctrl->tagset); if (IS_ERR(ns->queue)) - goto out_free_ns; + goto out_release_instance; queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); ns->queue->queuedata = ns; ns->ctrl = ctrl; @@ -1153,7 +1158,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) disk->queue = ns->queue; disk->driverfs_dev = ctrl->device; disk->flags = GENHD_FL_EXT_DEVT; - sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid); + sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance); if (nvme_revalidate_disk(ns->disk)) goto out_free_disk; @@ -1173,6 +1178,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) kfree(disk); out_free_queue: blk_cleanup_queue(ns->queue); + out_release_instance: + ida_simple_remove(&ctrl->ns_ida, ns->instance); out_free_ns: kfree(ns); } @@ -1350,6 +1357,7 @@ static void nvme_free_ctrl(struct kref *kref) put_device(ctrl->device); nvme_release_instance(ctrl); + ida_destroy(&ctrl->ns_ida); ctrl->ops->free_ctrl(ctrl); } @@ -1390,6 +1398,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, } get_device(ctrl->device); dev_set_drvdata(ctrl->device, ctrl); + ida_init(&ctrl->ns_ida); spin_lock(&dev_list_lock); list_add_tail(&ctrl->node, &nvme_ctrl_list); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9664d07..9407f2f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -72,6 +72,7 @@ struct nvme_ctrl { struct mutex namespaces_mutex; struct device *device; /* char device */ struct list_head node; + struct ida ns_ida; char name[12]; char serial[20]; @@ -102,6 +103,7 @@ struct nvme_ns { struct request_queue *queue; struct gendisk *disk; struct kref kref; + int instance; u8 eui[8]; u8 uuid[16]; -- cgit v0.10.2 From 646017a612e72f19bd9f991fe25287a149c5f627 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 24 Feb 2016 09:15:54 -0700 Subject: NVMe: Fix namespace removal deadlock This patch makes nvme namespace removal lockless. It is up to the caller to ensure no active namespace scanning is occuring. To ensure no scan work occurs, the nvme pci driver adds a removing state to the controller device to avoid queueing scan work during removal. The work is flushed after setting the state, so no new scan work can be queued. The lockless removal allows the driver to cleanup a namespace request_queue if the controller fails during removal. Previously this could deadlock trying to acquire the namespace mutex in order to handle such events. 
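The removal path reduces to two ingredients: an atomic "removing" bit that makes repeated calls harmless, and a lock taken only around the final list manipulation instead of around the whole teardown. A userspace sketch of that shape, with placeholder locking comments rather than the driver's namespaces mutex:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct toy_ns {
        atomic_flag removing;
        bool on_list;
    };

    static void toy_ns_remove(struct toy_ns *ns)
    {
        if (atomic_flag_test_and_set(&ns->removing))
            return;                     /* someone else is already removing it */

        /* ... tear down the queue; no namespaces lock held here ... */

        /* lock(namespaces_mutex); */
        ns->on_list = false;            /* list_del_init() equivalent */
        /* unlock(namespaces_mutex); */
    }

    int main(void)
    {
        struct toy_ns ns = { .removing = ATOMIC_FLAG_INIT, .on_list = true };

        toy_ns_remove(&ns);
        toy_ns_remove(&ns);             /* second call is a harmless no-op */
        printf("on_list=%d\n", ns.on_list);
        return 0;
    }
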
Signed-off-by: Keith Busch Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6c39dbf..8c2ddd5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1186,11 +1186,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) static void nvme_ns_remove(struct nvme_ns *ns) { - bool kill = nvme_io_incapable(ns->ctrl) && - !blk_queue_dying(ns->queue); + bool kill; - lockdep_assert_held(&ns->ctrl->namespaces_mutex); + if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) + return; + kill = nvme_io_incapable(ns->ctrl) && + !blk_queue_dying(ns->queue); if (kill) { blk_set_queue_dying(ns->queue); @@ -1213,7 +1215,9 @@ static void nvme_ns_remove(struct nvme_ns *ns) blk_mq_abort_requeue_list(ns->queue); blk_cleanup_queue(ns->queue); } + mutex_lock(&ns->ctrl->namespaces_mutex); list_del_init(&ns->list); + mutex_unlock(&ns->ctrl->namespaces_mutex); nvme_put_ns(ns); } @@ -1307,10 +1311,8 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) { struct nvme_ns *ns, *next; - mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) nvme_ns_remove(ns); - mutex_unlock(&ctrl->namespaces_mutex); } static DEFINE_IDA(nvme_instance_ida); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9407f2f..4075fa9 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -114,6 +114,10 @@ struct nvme_ns { bool ext; u8 pi_type; int type; + unsigned long flags; + +#define NVME_NS_REMOVING 0 + u64 mode_select_num_blocks; u32 mode_select_block_len; }; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2ea3e39..122f803 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -120,6 +120,7 @@ struct nvme_dev { unsigned long flags; #define NVME_CTRL_RESETTING 0 +#define NVME_CTRL_REMOVING 1 struct nvme_ctrl ctrl; struct completion ioq_wait; @@ -286,6 +287,17 @@ static int nvme_init_request(void *data, struct request *req, return 0; } +static void nvme_queue_scan(struct nvme_dev *dev) +{ + /* + * Do not queue new scan work when a controller is reset during + * removal. + */ + if (test_bit(NVME_CTRL_REMOVING, &dev->flags)) + return; + queue_work(nvme_workq, &dev->scan_work); +} + static void nvme_complete_async_event(struct nvme_dev *dev, struct nvme_completion *cqe) { @@ -300,7 +312,7 @@ static void nvme_complete_async_event(struct nvme_dev *dev, switch (result & 0xff07) { case NVME_AER_NOTICE_NS_CHANGED: dev_info(dev->dev, "rescanning\n"); - queue_work(nvme_workq, &dev->scan_work); + nvme_queue_scan(dev); default: dev_warn(dev->dev, "async event result %08x\n", result); } @@ -1690,7 +1702,7 @@ static int nvme_dev_add(struct nvme_dev *dev) return 0; dev->ctrl.tagset = &dev->tagset; } - queue_work(nvme_workq, &dev->scan_work); + nvme_queue_scan(dev); return 0; } @@ -2128,6 +2140,7 @@ static void nvme_remove(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); + set_bit(NVME_CTRL_REMOVING, &dev->flags); pci_set_drvdata(pdev, NULL); flush_work(&dev->scan_work); nvme_remove_namespaces(&dev->ctrl); -- cgit v0.10.2 From f58944e265d4ebe47216a5d7488aee3928823d30 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 24 Feb 2016 09:15:55 -0700 Subject: NVMe: Simplify device reset failure A reset failure schedules the device to unbind from the driver through the pci driver's remove. This cleans up all intialization, so there is no need to duplicate the potentially racy cleanup. 
To help understand why a reset failed, the status is logged with the existing warning message. Signed-off-by: Keith Busch Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 122f803..6d2e425 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -86,7 +86,6 @@ struct nvme_queue; static int nvme_reset(struct nvme_dev *dev); static void nvme_process_cq(struct nvme_queue *nvmeq); -static void nvme_remove_dead_ctrl(struct nvme_dev *dev); static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); /* @@ -1897,10 +1896,19 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) kfree(dev); } +static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) +{ + dev_warn(dev->dev, "Removing after probe failure status: %d\n", status); + + kref_get(&dev->ctrl.kref); + if (!schedule_work(&dev->remove_work)) + nvme_put_ctrl(&dev->ctrl); +} + static void nvme_reset_work(struct work_struct *work) { struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); - int result; + int result = -ENODEV; if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags))) goto out; @@ -1920,26 +1928,26 @@ static void nvme_reset_work(struct work_struct *work) result = nvme_configure_admin_queue(dev); if (result) - goto unmap; + goto out; nvme_init_queue(dev->queues[0], 0); result = nvme_alloc_admin_tags(dev); if (result) - goto disable; + goto out; result = nvme_init_identify(&dev->ctrl); if (result) - goto free_tags; + goto out; result = nvme_setup_io_queues(dev); if (result) - goto free_tags; + goto out; dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS; result = nvme_dev_list_add(dev); if (result) - goto remove; + goto out; /* * Keep the controller around but remove all namespaces if we don't have @@ -1956,19 +1964,8 @@ static void nvme_reset_work(struct work_struct *work) clear_bit(NVME_CTRL_RESETTING, &dev->flags); return; - remove: - nvme_dev_list_remove(dev); - free_tags: - nvme_dev_remove_admin(dev); - blk_put_queue(dev->ctrl.admin_q); - dev->ctrl.admin_q = NULL; - dev->queues[0]->tags = NULL; - disable: - nvme_disable_admin_queue(dev, false); - unmap: - nvme_dev_unmap(dev); out: - nvme_remove_dead_ctrl(dev); + nvme_remove_dead_ctrl(dev, result); } static void nvme_remove_dead_ctrl_work(struct work_struct *work) @@ -1981,14 +1978,6 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work) nvme_put_ctrl(&dev->ctrl); } -static void nvme_remove_dead_ctrl(struct nvme_dev *dev) -{ - dev_warn(dev->dev, "Removing after probe failure\n"); - kref_get(&dev->ctrl.kref); - if (!schedule_work(&dev->remove_work)) - nvme_put_ctrl(&dev->ctrl); -} - static int nvme_reset(struct nvme_dev *dev) { if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q)) @@ -2136,6 +2125,11 @@ static void nvme_shutdown(struct pci_dev *pdev) nvme_dev_disable(dev, true); } +/* + * The driver's remove may be called on a device in a partially initialized + * state. This function must not have any dependencies on the device state in + * order to proceed. 
+ */ static void nvme_remove(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); -- cgit v0.10.2 From 69d9a99c258eb1d6478fd9608a2070890797eed7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 24 Feb 2016 09:15:56 -0700 Subject: NVMe: Move error handling to failed reset handler This moves failed queue handling out of the namespace removal path and into the reset failure path, fixing a hanging condition if the controller fails or link down during del_gendisk. Previously the driver had to see the controller as degraded prior to calling del_gendisk to setup the queues to fail. But, if the controller happened to fail after this, there was no task to end outstanding requests. On failure, all namespace states are set to dead. This has capacity revalidate to 0, and ends all new requests with error status. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8c2ddd5..7fd5a7a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -557,6 +557,10 @@ static int nvme_revalidate_disk(struct gendisk *disk) u16 old_ms; unsigned short bs; + if (test_bit(NVME_NS_DEAD, &ns->flags)) { + set_capacity(disk, 0); + return -ENODEV; + } if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) { dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n", __func__, ns->ctrl->instance, ns->ns_id); @@ -1186,32 +1190,15 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) static void nvme_ns_remove(struct nvme_ns *ns) { - bool kill; - if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) return; - kill = nvme_io_incapable(ns->ctrl) && - !blk_queue_dying(ns->queue); - if (kill) { - blk_set_queue_dying(ns->queue); - - /* - * The controller was shutdown first if we got here through - * device removal. The shutdown may requeue outstanding - * requests. These need to be aborted immediately so - * del_gendisk doesn't block indefinitely for their completion. - */ - blk_mq_abort_requeue_list(ns->queue); - } if (ns->disk->flags & GENHD_FL_UP) { if (blk_get_integrity(ns->disk)) blk_integrity_unregister(ns->disk); sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, &nvme_ns_attr_group); del_gendisk(ns->disk); - } - if (kill || !blk_queue_dying(ns->queue)) { blk_mq_abort_requeue_list(ns->queue); blk_cleanup_queue(ns->queue); } @@ -1413,6 +1400,38 @@ out: return ret; } +/** + * nvme_kill_queues(): Ends all namespace queues + * @ctrl: the dead controller that needs to end + * + * Call this function when the driver determines it is unable to get the + * controller in a state capable of servicing IO. + */ +void nvme_kill_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (!kref_get_unless_zero(&ns->kref)) + continue; + + /* + * Revalidating a dead namespace sets capacity to 0. This will + * end buffered writers dirtying pages that can't be synced. 
+ */ + if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + revalidate_disk(ns->disk); + + blk_set_queue_dying(ns->queue); + blk_mq_abort_requeue_list(ns->queue); + blk_mq_start_stopped_hw_queues(ns->queue, true); + + nvme_put_ns(ns); + } + mutex_unlock(&ctrl->namespaces_mutex); +} + void nvme_stop_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 4075fa9..fb15ba5 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -117,6 +117,7 @@ struct nvme_ns { unsigned long flags; #define NVME_NS_REMOVING 0 +#define NVME_NS_DEAD 1 u64 mode_select_num_blocks; u32 mode_select_block_len; @@ -246,6 +247,7 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl); void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); +void nvme_kill_queues(struct nvme_ctrl *ctrl); struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6d2e425..680f578 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -690,7 +690,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, spin_lock_irq(&nvmeq->q_lock); if (unlikely(nvmeq->cq_vector < 0)) { - ret = BLK_MQ_RQ_QUEUE_BUSY; + if (ns && !test_bit(NVME_NS_DEAD, &ns->flags)) + ret = BLK_MQ_RQ_QUEUE_BUSY; + else + ret = BLK_MQ_RQ_QUEUE_ERROR; spin_unlock_irq(&nvmeq->q_lock); goto out; } @@ -1261,6 +1264,12 @@ static struct blk_mq_ops nvme_mq_ops = { static void nvme_dev_remove_admin(struct nvme_dev *dev) { if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) { + /* + * If the controller was reset during removal, it's possible + * user requests may be waiting on a stopped queue. Start the + * queue to flush these to completion. + */ + blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true); blk_cleanup_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); } @@ -1901,6 +1910,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) dev_warn(dev->dev, "Removing after probe failure status: %d\n", status); kref_get(&dev->ctrl.kref); + nvme_dev_disable(dev, false); if (!schedule_work(&dev->remove_work)) nvme_put_ctrl(&dev->ctrl); } @@ -1973,6 +1983,7 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work) struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work); struct pci_dev *pdev = to_pci_dev(dev->dev); + nvme_kill_queues(&dev->ctrl); if (pci_get_drvdata(pdev)) pci_stop_and_remove_bus_device_locked(pdev); nvme_put_ctrl(&dev->ctrl); -- cgit v0.10.2 From 63088ec7c8eadfe08b96127a41b385ec9742dace Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 24 Feb 2016 09:15:57 -0700 Subject: NVMe: Don't allow unsupported flags The command flags can change the meaning of other fields in the command that the driver is not prepared to handle. Specifically, the user could passthrough an SGL flag, causing the controller to misinterpret the PRP list the driver created, potentially corrupting memory or data. 
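The fix reduces to an early validation step: anything the user sets in the flags byte of a passthrough command is refused before the command is built. A small sketch of that pattern, where the struct is a hypothetical stand-in for the uapi passthrough layouts rather than the real ones:

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    struct toy_passthru {
        uint8_t opcode;
        uint8_t flags;                  /* e.g. SGL vs PRP selection bits */
    };

    /* Reject any user-supplied flags; the driver always builds PRP lists. */
    static int toy_validate(const struct toy_passthru *cmd)
    {
        if (cmd->flags)
            return -EINVAL;
        return 0;
    }

    int main(void)
    {
        struct toy_passthru ok  = { .opcode = 0x02, .flags = 0 };
        struct toy_passthru bad = { .opcode = 0x02, .flags = 0x40 };

        printf("%d %d\n", toy_validate(&ok), toy_validate(&bad));
        return 0;
    }
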
Signed-off-by: Keith Busch Reviewed-by: Jon Derrick Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7fd5a7a..ba15015 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -374,6 +374,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) if (copy_from_user(&io, uio, sizeof(io))) return -EFAULT; + if (io.flags) + return -EINVAL; switch (io.opcode) { case nvme_cmd_write: @@ -425,6 +427,8 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return -EACCES; if (copy_from_user(&cmd, ucmd, sizeof(cmd))) return -EFAULT; + if (cmd.flags) + return -EINVAL; memset(&c, 0, sizeof(c)); c.common.opcode = cmd.opcode; -- cgit v0.10.2 From e9fc63d682dbbef17921aeb00d03fd52d6735ffd Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 24 Feb 2016 09:15:58 -0700 Subject: NVMe: Fix 0-length integrity payload A user could send a passthrough IO command with a metadata pointer to a namespace without metadata. With metadata length of 0, kmalloc returns ZERO_SIZE_PTR. Since that is not NULL, the driver would have set this as the bio's integrity payload, which causes an access fault on completion. This patch ignores the users metadata buffer if the namespace format does not support separate metadata. Reported-by: Stephen Bates Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ba15015..470d4f3 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -184,7 +184,7 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, goto out_unmap; } - if (meta_buffer) { + if (meta_buffer && meta_len) { struct bio_integrity_payload *bip; meta = kmalloc(meta_len, GFP_KERNEL); -- cgit v0.10.2 From a1a0e23e49037c23ea84bc8cc146a03584d13577 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 29 Feb 2016 18:28:53 -0500 Subject: writeback: flush inode cgroup wb switches instead of pinning super_block If cgroup writeback is in use, inodes can be scheduled for asynchronous wb switching. Before 5ff8eaac1636 ("writeback: keep superblock pinned during cgroup writeback association switches"), this could race with umount leading to super_block being destroyed while inodes are pinned for wb switching. 5ff8eaac1636 fixed it by bumping s_active while wb switches are in flight; however, this allowed in-flight wb switches to make umounts asynchronous when the userland expected synchronosity - e.g. fsck immediately following umount may fail because the device is still busy. This patch removes the problematic super_block pinning and instead makes generic_shutdown_super() flush in-flight wb switches. wb switches are now executed on a dedicated isw_wq so that they can be flushed and isw_nr_in_flight keeps track of the number of in-flight wb switches so that flushing can be avoided in most cases. v2: Move cgroup_writeback_umount() further below and add MS_ACTIVE check in inode_switch_wbs() as Jan an Al suggested. 
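The umount-side idea, in miniature: count work items as they are queued, decrement as they finish, and pay for the expensive flush only when the counter says something is actually in flight. A userspace sketch where flush_everything() is just a placeholder for the synchronize_rcu() + flush_workqueue() pair in the real change:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int in_flight;

    static void queue_switch(void)  { atomic_fetch_add(&in_flight, 1); }
    static void finish_switch(void) { atomic_fetch_sub(&in_flight, 1); }

    static void flush_everything(void) { puts("flushing"); }

    static void toy_umount_flush(void)
    {
        if (atomic_load(&in_flight))
            flush_everything();         /* rare case: switches pending */
        /* common case: nothing in flight, umount stays cheap */
    }

    int main(void)
    {
        toy_umount_flush();             /* nothing queued: no flush */
        queue_switch();
        toy_umount_flush();             /* prints "flushing" */
        finish_switch();
        return 0;
    }
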
Signed-off-by: Tejun Heo Reported-by: Tahsin Erdogan Cc: Jan Kara Cc: Al Viro Link: http://lkml.kernel.org/g/CAAeU0aNCq7LGODvVGRU-oU_o-6enii5ey0p1c26D1ZzYwkDc5A@mail.gmail.com Fixes: 5ff8eaac1636 ("writeback: keep superblock pinned during cgroup writeback association switches") Cc: stable@vger.kernel.org #v4.5 Reviewed-by: Jan Kara Tested-by: Tahsin Erdogan Signed-off-by: Jens Axboe diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1f76d89..5c46ed9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -223,6 +223,9 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi, #define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) /* one round can affect upto 5 slots */ +static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); +static struct workqueue_struct *isw_wq; + void __inode_attach_wb(struct inode *inode, struct page *page) { struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -317,7 +320,6 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) struct inode_switch_wbs_context *isw = container_of(work, struct inode_switch_wbs_context, work); struct inode *inode = isw->inode; - struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; struct bdi_writeback *old_wb = inode->i_wb; struct bdi_writeback *new_wb = isw->new_wb; @@ -424,8 +426,9 @@ skip_switch: wb_put(new_wb); iput(inode); - deactivate_super(sb); kfree(isw); + + atomic_dec(&isw_nr_in_flight); } static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) @@ -435,7 +438,7 @@ static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) /* needs to grab bh-unsafe locks, bounce to work item */ INIT_WORK(&isw->work, inode_switch_wbs_work_fn); - schedule_work(&isw->work); + queue_work(isw_wq, &isw->work); } /** @@ -471,20 +474,20 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); - - if (inode->i_state & (I_WB_SWITCH | I_FREEING) || - inode_to_wb(inode) == isw->new_wb) - goto out_unlock; - - if (!atomic_inc_not_zero(&inode->i_sb->s_active)) - goto out_unlock; - + if (!(inode->i_sb->s_flags & MS_ACTIVE) || + inode->i_state & (I_WB_SWITCH | I_FREEING) || + inode_to_wb(inode) == isw->new_wb) { + spin_unlock(&inode->i_lock); + goto out_free; + } inode->i_state |= I_WB_SWITCH; spin_unlock(&inode->i_lock); ihold(inode); isw->inode = inode; + atomic_inc(&isw_nr_in_flight); + /* * In addition to synchronizing among switchers, I_WB_SWITCH tells * the RCU protected stat update paths to grab the mapping's @@ -494,8 +497,6 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); return; -out_unlock: - spin_unlock(&inode->i_lock); out_free: if (isw->new_wb) wb_put(isw->new_wb); @@ -847,6 +848,33 @@ restart: wb_put(last_wb); } +/** + * cgroup_writeback_umount - flush inode wb switches for umount + * + * This function is called when a super_block is about to be destroyed and + * flushes in-flight inode wb switches. An inode wb switch goes through + * RCU and then workqueue, so the two need to be flushed in order to ensure + * that all previously scheduled switches are finished. As wb switches are + * rare occurrences and synchronize_rcu() can take a while, perform + * flushing iff wb switches are in flight. 
+ */ +void cgroup_writeback_umount(void) +{ + if (atomic_read(&isw_nr_in_flight)) { + synchronize_rcu(); + flush_workqueue(isw_wq); + } +} + +static int __init cgroup_writeback_init(void) +{ + isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0); + if (!isw_wq) + return -ENOMEM; + return 0; +} +fs_initcall(cgroup_writeback_init); + #else /* CONFIG_CGROUP_WRITEBACK */ static struct bdi_writeback * diff --git a/fs/super.c b/fs/super.c index 1182af8..74914b1 100644 --- a/fs/super.c +++ b/fs/super.c @@ -415,6 +415,7 @@ void generic_shutdown_super(struct super_block *sb) sb->s_flags &= ~MS_ACTIVE; fsnotify_unmount_inodes(sb); + cgroup_writeback_umount(); evict_inodes(sb); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index b333c94..d0b5ca5 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -198,6 +198,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, void wbc_detach_inode(struct writeback_control *wbc); void wbc_account_io(struct writeback_control *wbc, struct page *page, size_t bytes); +void cgroup_writeback_umount(void); /** * inode_attach_wb - associate an inode with its wb @@ -301,6 +302,10 @@ static inline void wbc_account_io(struct writeback_control *wbc, { } +static inline void cgroup_writeback_umount(void) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /* -- cgit v0.10.2 From da35825d9a091a7a1d5824c8468168e2658333ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Mar 2016 18:07:11 +0100 Subject: nvme: set queue limits for the admin queue Factor out a helper to set all the device specific queue limits and apply them to the admin queue in addition to the I/O queues. Without this the command size on the admin queue is arbitrarily low, and the missing other limitations are just minefields waiting for victims. Signed-off-by: Christoph Hellwig Reported-by: Jeff Lien Tested-by: Jeff Lien Reviewed-by: Keith Busch Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 470d4f3..cfee6ac 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -840,6 +840,21 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) return ret; } +static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, + struct request_queue *q) +{ + if (ctrl->max_hw_sectors) { + blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); + blk_queue_max_segments(q, + (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1); + } + if (ctrl->stripe_size) + blk_queue_chunk_sectors(q, ctrl->stripe_size >> 9); + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) + blk_queue_flush(q, REQ_FLUSH | REQ_FUA); + blk_queue_virt_boundary(q, ctrl->page_size - 1); +} + /* * Initialize the cached copies of the Identify data and various controller * register in our nvme_ctrl structure. 
This should be called as soon as @@ -897,6 +912,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } } + nvme_set_queue_limits(ctrl, ctrl->admin_q); + kfree(id); return 0; } @@ -1147,17 +1164,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) ns->disk = disk; ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); - if (ctrl->max_hw_sectors) { - blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors); - blk_queue_max_segments(ns->queue, - (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1); - } - if (ctrl->stripe_size) - blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9); - if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) - blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); - blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1); + nvme_set_queue_limits(ctrl, ns->queue); disk->major = nvme_major; disk->first_minor = 0; -- cgit v0.10.2 From 45686b6198bd824f083ff5293f191d78db9d708a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Mar 2016 18:07:12 +0100 Subject: nvme: fix max_segments integer truncation The block layer uses an unsigned short for max_segments. The way we calculate the value for NVMe tends to generate very large 32-bit values, which after integer truncation may lead to a zero value instead of the desired outcome. Signed-off-by: Christoph Hellwig Reported-by: Jeff Lien Tested-by: Jeff Lien Reviewed-by: Keith Busch Signed-off-by: Jens Axboe diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cfee6ac..03c4641 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -844,9 +844,11 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, struct request_queue *q) { if (ctrl->max_hw_sectors) { + u32 max_segments = + (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1; + blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); - blk_queue_max_segments(q, - (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1); + blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); } if (ctrl->stripe_size) blk_queue_chunk_sectors(q, ctrl->stripe_size >> 9); -- cgit v0.10.2 From f21018427cb007a0894c36ad702990ab639cbbb4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2016 14:43:45 -0700 Subject: block: fix blk_rq_get_max_sectors for driver private requests Driver private request types should not get the artifical cap for the FS requests. This is important to use the full device capabilities for internal command or NVMe pass through commands. Signed-off-by: Christoph Hellwig Reported-by: Jeff Lien Tested-by: Jeff Lien Reviewed-by: Keith Busch Updated by me to use an explicit check for the one command type that does support extended checking, instead of relying on the ordering of the enum command values - as suggested by Keith. 
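Reduced to its decision logic: only filesystem requests get the chunk-based cap, while block-PC and driver-private commands go straight to the hardware limit. A sketch of that decision, where the enum and the numbers are made up for the example and are not the block layer's real values:

    #include <stdio.h>

    enum toy_cmd_type { TOY_FS, TOY_BLOCK_PC, TOY_DRV_PRIV };

    static unsigned int toy_max_sectors(enum toy_cmd_type type,
                                        unsigned int max_hw,
                                        unsigned int fs_cap)
    {
        if (type != TOY_FS)             /* was: type == TOY_BLOCK_PC */
            return max_hw;
        return fs_cap < max_hw ? fs_cap : max_hw;
    }

    int main(void)
    {
        printf("%u\n", toy_max_sectors(TOY_FS, 65536, 2048));       /* 2048 */
        printf("%u\n", toy_max_sectors(TOY_DRV_PRIV, 65536, 2048)); /* 65536 */
        return 0;
    }
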
Signed-off-by: Jens Axboe diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d7f6bca..413c84f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -895,7 +895,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq) { struct request_queue *q = rq->q; - if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC)) + if (unlikely(rq->cmd_type != REQ_TYPE_FS)) return q->limits.max_hw_sectors; if (!q->limits.chunk_sectors || (rq->cmd_flags & REQ_DISCARD)) -- cgit v0.10.2 From 4d6af73d9e43f78651a43ee4c5ad221107ac8365 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Mar 2016 18:07:14 +0100 Subject: block: support large requests in blk_rq_map_user_iov This patch adds support for larger requests in blk_rq_map_user_iov by allowing it to build multiple bios for a request. This functionality used to exist for the non-vectored blk_rq_map_user in the past, and this patch reuses the existing functionality for it on the unmap side, which stuck around. Thanks to the iov_iter API supporting multiple bios is fairly trivial, as we can just iterate the iov until we've consumed the whole iov_iter. Signed-off-by: Christoph Hellwig Reported-by: Jeff Lien Tested-by: Jeff Lien Reviewed-by: Keith Busch Signed-off-by: Jens Axboe diff --git a/block/blk-map.c b/block/blk-map.c index f565e11..a54f054 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -57,6 +57,49 @@ static int __blk_rq_unmap_user(struct bio *bio) return ret; } +static int __blk_rq_map_user_iov(struct request *rq, + struct rq_map_data *map_data, struct iov_iter *iter, + gfp_t gfp_mask, bool copy) +{ + struct request_queue *q = rq->q; + struct bio *bio, *orig_bio; + int ret; + + if (copy) + bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); + else + bio = bio_map_user_iov(q, iter, gfp_mask); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + if (map_data && map_data->null_mapped) + bio_set_flag(bio, BIO_NULL_MAPPED); + + iov_iter_advance(iter, bio->bi_iter.bi_size); + if (map_data) + map_data->offset += bio->bi_iter.bi_size; + + orig_bio = bio; + blk_queue_bounce(q, &bio); + + /* + * We link the bounce buffer in and could have to traverse it + * later so we have to get a ref to prevent it from being freed + */ + bio_get(bio); + + ret = blk_rq_append_bio(q, rq, bio); + if (ret) { + bio_endio(bio); + __blk_rq_unmap_user(orig_bio); + bio_put(bio); + return ret; + } + + return 0; +} + /** * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage * @q: request queue where request should be inserted @@ -82,10 +125,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, const struct iov_iter *iter, gfp_t gfp_mask) { - struct bio *bio; - int unaligned = 0; - struct iov_iter i; struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0}; + bool copy = (q->dma_pad_mask & iter->count) || map_data; + struct bio *bio = NULL; + struct iov_iter i; + int ret; if (!iter || !iter->count) return -EINVAL; @@ -101,42 +145,29 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, */ if ((uaddr & queue_dma_alignment(q)) || iovec_gap_to_prv(q, &prv, &iov)) - unaligned = 1; + copy = true; prv.iov_base = iov.iov_base; prv.iov_len = iov.iov_len; } - if (unaligned || (q->dma_pad_mask & iter->count) || map_data) - bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); - else - bio = bio_map_user_iov(q, iter, gfp_mask); - - if (IS_ERR(bio)) - return PTR_ERR(bio); - - if (map_data && map_data->null_mapped) - bio_set_flag(bio, BIO_NULL_MAPPED); - - if 
(bio->bi_iter.bi_size != iter->count) { - /* - * Grab an extra reference to this bio, as bio_unmap_user() - * expects to be able to drop it twice as it happens on the - * normal IO completion path - */ - bio_get(bio); - bio_endio(bio); - __blk_rq_unmap_user(bio); - return -EINVAL; - } + i = *iter; + do { + ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy); + if (ret) + goto unmap_rq; + if (!bio) + bio = rq->bio; + } while (iov_iter_count(&i)); if (!bio_flagged(bio, BIO_USER_MAPPED)) rq->cmd_flags |= REQ_COPY_USER; - - blk_queue_bounce(q, &bio); - bio_get(bio); - blk_rq_bio_prep(q, rq, bio); return 0; + +unmap_rq: + __blk_rq_unmap_user(bio); + rq->bio = NULL; + return -EINVAL; } EXPORT_SYMBOL(blk_rq_map_user_iov); -- cgit v0.10.2 From 4024fcf70556311521e7b6cf79fa50e16f31013a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 2 Mar 2016 02:32:08 +0100 Subject: vxlan: fix missing options_len update on RX with collect metadata When signalling to metadata consumers that the metadata_dst entry carries additional GBP extension data for vxlan (TUNNEL_VXLAN_OPT), the dst's vxlan_metadata information is populated, but options_len is left to zero. F.e. in ovs, ovs_flow_key_extract() checks for options_len before extracting the data through ip_tunnel_info_opts_get(). Geneve uses ip_tunnel_info_opts_set() helper in receive path, which sets options_len internally, vxlan however uses ip_tunnel_info_opts(), so when filling vxlan_metadata, we do need to update options_len. Fixes: 4c22279848c5 ("ip-tunnel: Use API to access tunnel metadata options.") Signed-off-by: Daniel Borkmann Acked-by: Thomas Graf Signed-off-by: David S. Miller diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index b601139..1c32bd1 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1308,8 +1308,10 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) gbp = (struct vxlanhdr_gbp *)vxh; md->gbp = ntohs(gbp->policy_id); - if (tun_dst) + if (tun_dst) { tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT; + tun_dst->u.tun_info.options_len = sizeof(*md); + } if (gbp->dont_learn) md->gbp |= VXLAN_GBP_DONT_LEARN; -- cgit v0.10.2 From 63a0a00b90b5193173bcf76ef24afdda8eb05d3a Mon Sep 17 00:00:00 2001 From: Nimrod Andy Date: Wed, 2 Mar 2016 17:24:53 +0800 Subject: MAINTAINERS: add maintainer entry for FREESCALE FEC ethernet driver Add a maintainer entry for FREESCALE FEC ethernet driver and add myself as a maintainer. Signed-off-by: Fugang Duan Acked-by: Frank Li Signed-off-by: David S. Miller diff --git a/MAINTAINERS b/MAINTAINERS index e6b14f2..bcda623 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4542,6 +4542,15 @@ S: Maintained F: drivers/net/ethernet/freescale/fs_enet/ F: include/linux/fs_enet_pd.h +FREESCALE IMX / MXC FEC DRIVER +M: Fugang Duan +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/ethernet/freescale/fec_main.c +F: drivers/net/ethernet/freescale/fec_ptp.c +F: drivers/net/ethernet/freescale/fec.h +F: Documentation/devicetree/bindings/net/fsl-fec.txt + FREESCALE QUICC ENGINE LIBRARY L: linuxppc-dev@lists.ozlabs.org S: Orphan -- cgit v0.10.2 From 1d3cd1773fddfdc9ffb0c2dec9a954c7a54bc207 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 2 Mar 2016 13:11:10 +0300 Subject: net: moxa: fix an error code We accidentally return IS_ERR(priv->base) which is 1 instead of PTR_ERR(priv->base) which is the error code. Fixes: 6c821bd9edc9 ('net: Add MOXA ART SoCs ethernet driver') Signed-off-by: Dan Carpenter Acked-by: Arnd Bergmann Signed-off-by: David S. 
Miller diff --git a/drivers/net/ethernet/moxa/moxart_ether.c b/drivers/net/ethernet/moxa/moxart_ether.c index 00cfd95..3e67f45 100644 --- a/drivers/net/ethernet/moxa/moxart_ether.c +++ b/drivers/net/ethernet/moxa/moxart_ether.c @@ -474,9 +474,9 @@ static int moxart_mac_probe(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 0); ndev->base_addr = res->start; priv->base = devm_ioremap_resource(p_dev, res); - ret = IS_ERR(priv->base); - if (ret) { + if (IS_ERR(priv->base)) { dev_err(p_dev, "devm_ioremap_resource failed\n"); + ret = PTR_ERR(priv->base); goto init_fail; } -- cgit v0.10.2 From c1bb0a5588816ec26abc08aa9adcf90e32656781 Mon Sep 17 00:00:00 2001 From: Venkat Duvvuru Date: Wed, 2 Mar 2016 06:00:28 -0500 Subject: be2net: don't enable multicast flag in be_enable_if_filters() routine When the interface is opened (in be_open()) the routine be_enable_if_filters() must be called to switch on the basic filtering capabilities of an interface that are not changed at run-time. These include the flags UNTAGGED, BROADCAST and PASS_L3L4_ERRORS. Other flags such as MULTICAST and PROMISC must be enabled later by be_set_rx_mode() based on the state in the netdev/adapter struct. be_enable_if_filters() routine is wrongly trying to enable MULTICAST flag without checking the current adapter state. This can cause the RX_FILTER cmds to the FW to fail. This patch fixes this problem by only enabling the basic filtering flags in be_enable_if_filters(). The VF must be able to issue RX_FILTER cmd with any filter flag, as long as the PF allowed those flags (if_cap_flags) in the iface it provisioned for the VF. This rule is applicable even when the VF doesn't have the FILTMGMT privilege. There is a bug in BE3 FW that wrongly fails RX_FILTER multicast programming cmds on VFs that don't have FILTMGMT privilege. This patch also helps in insulating the VF driver from be_open failures due to the FW bug. A fix for the BE3 FW issue will be available in versions >= 11.0.283.0 and 10.6.334.0 Reported-by: Ivan Vecera Signed-off-by: Venkat Duvvuru Signed-off-by: Sathya Perla Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.h b/drivers/net/ethernet/emulex/benet/be_cmds.h index 241819b..6d9a8d7 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.h +++ b/drivers/net/ethernet/emulex/benet/be_cmds.h @@ -622,10 +622,13 @@ enum be_if_flags { BE_IF_FLAGS_VLAN_PROMISCUOUS |\ BE_IF_FLAGS_MCAST_PROMISCUOUS) -#define BE_IF_EN_FLAGS (BE_IF_FLAGS_BROADCAST | BE_IF_FLAGS_PASS_L3L4_ERRORS |\ - BE_IF_FLAGS_MULTICAST | BE_IF_FLAGS_UNTAGGED) +#define BE_IF_FILT_FLAGS_BASIC (BE_IF_FLAGS_BROADCAST | \ + BE_IF_FLAGS_PASS_L3L4_ERRORS | \ + BE_IF_FLAGS_UNTAGGED) -#define BE_IF_ALL_FILT_FLAGS (BE_IF_EN_FLAGS | BE_IF_FLAGS_ALL_PROMISCUOUS) +#define BE_IF_ALL_FILT_FLAGS (BE_IF_FILT_FLAGS_BASIC | \ + BE_IF_FLAGS_MULTICAST | \ + BE_IF_FLAGS_ALL_PROMISCUOUS) /* An RX interface is an object with one or more MAC addresses and * filtering capabilities. 
*/ diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index f99de36..db81e3d 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -125,6 +125,11 @@ static const char * const ue_status_hi_desc[] = { "Unknown" }; +#define BE_VF_IF_EN_FLAGS (BE_IF_FLAGS_UNTAGGED | \ + BE_IF_FLAGS_BROADCAST | \ + BE_IF_FLAGS_MULTICAST | \ + BE_IF_FLAGS_PASS_L3L4_ERRORS) + static void be_queue_free(struct be_adapter *adapter, struct be_queue_info *q) { struct be_dma_mem *mem = &q->dma_mem; @@ -3537,7 +3542,7 @@ static int be_enable_if_filters(struct be_adapter *adapter) { int status; - status = be_cmd_rx_filter(adapter, BE_IF_EN_FLAGS, ON); + status = be_cmd_rx_filter(adapter, BE_IF_FILT_FLAGS_BASIC, ON); if (status) return status; @@ -3857,8 +3862,7 @@ static int be_vfs_if_create(struct be_adapter *adapter) int status; /* If a FW profile exists, then cap_flags are updated */ - cap_flags = BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_BROADCAST | - BE_IF_FLAGS_MULTICAST | BE_IF_FLAGS_PASS_L3L4_ERRORS; + cap_flags = BE_VF_IF_EN_FLAGS; for_all_vfs(adapter, vf_cfg, vf) { if (!BE3_chip(adapter)) { @@ -3874,10 +3878,8 @@ static int be_vfs_if_create(struct be_adapter *adapter) } } - en_flags = cap_flags & (BE_IF_FLAGS_UNTAGGED | - BE_IF_FLAGS_BROADCAST | - BE_IF_FLAGS_MULTICAST | - BE_IF_FLAGS_PASS_L3L4_ERRORS); + /* PF should enable IF flags during proxy if_create call */ + en_flags = cap_flags & BE_VF_IF_EN_FLAGS; status = be_cmd_if_create(adapter, cap_flags, en_flags, &vf_cfg->if_handle, vf + 1); if (status) -- cgit v0.10.2 From f9677e0f83080bb4186865868c359e72e1fac1ea Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 29 Feb 2016 06:33:47 -0800 Subject: x86/tsc: Always Running Timer (ART) correlated clocksource MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On modern Intel systems TSC is derived from the new Always Running Timer (ART). ART can be captured simultaneous to the capture of audio and network device clocks, allowing a correlation between timebases to be constructed. Upon capture, the driver converts the captured ART value to the appropriate system clock using the correlated clocksource mechanism. On systems that support ART a new CPUID leaf (0x15) returns parameters “m” and “n” such that: TSC_value = (ART_value * m) / n + k [n >= 1] [k is an offset that can adjusted by a privileged agent. The IA32_TSC_ADJUST MSR is an example of an interface to adjust k. See 17.14.4 of the Intel SDM for more details] Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Reviewed-by: Thomas Gleixner Signed-off-by: Christopher S. 
Hall [jstultz: Tweaked to fix build issue, also reworked math for 64bit division on 32bit systems, as well as !CONFIG_CPU_FREQ build fixes] Signed-off-by: John Stultz diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 7ad8c94..ff557b4 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -85,7 +85,7 @@ #define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ #define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ #define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ -/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */ +#define X86_FEATURE_ART (3*32+10) /* Platform has always running timer (ART) */ #define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ #define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ #define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 6d7c547..174c421 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -29,6 +29,8 @@ static inline cycles_t get_cycles(void) return rdtsc(); } +extern struct system_counterval_t convert_art_to_tsc(cycle_t art); + extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3d743da..80d761e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -43,6 +43,11 @@ static DEFINE_STATIC_KEY_FALSE(__use_tsc); int tsc_clocksource_reliable; +static u32 art_to_tsc_numerator; +static u32 art_to_tsc_denominator; +static u64 art_to_tsc_offset; +struct clocksource *art_related_clocksource; + /* * Use a ring-buffer like data structure, where a writer advances the head by * writing a new data entry and a reader advances the tail when it observes a @@ -964,6 +969,37 @@ core_initcall(cpufreq_tsc); #endif /* CONFIG_CPU_FREQ */ +#define ART_CPUID_LEAF (0x15) +#define ART_MIN_DENOMINATOR (1) + + +/* + * If ART is present detect the numerator:denominator to convert to TSC + */ +static void detect_art(void) +{ + unsigned int unused[2]; + + if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) + return; + + cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, + &art_to_tsc_numerator, unused, unused+1); + + /* Don't enable ART in a VM, non-stop TSC required */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || + !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || + art_to_tsc_denominator < ART_MIN_DENOMINATOR) + return; + + if (rdmsrl_safe(MSR_IA32_TSC_ADJUST, &art_to_tsc_offset)) + return; + + /* Make this sticky over multiple CPU init calls */ + setup_force_cpu_cap(X86_FEATURE_ART); +} + + /* clocksource code */ static struct clocksource clocksource_tsc; @@ -1071,6 +1107,25 @@ int unsynchronized_tsc(void) return 0; } +/* + * Convert ART to TSC given numerator/denominator found in detect_art() + */ +struct system_counterval_t convert_art_to_tsc(cycle_t art) +{ + u64 tmp, res, rem; + + rem = do_div(art, art_to_tsc_denominator); + + res = art * art_to_tsc_numerator; + tmp = rem * art_to_tsc_numerator; + + do_div(tmp, art_to_tsc_denominator); + res += tmp + art_to_tsc_offset; + + return (struct system_counterval_t) {.cs = art_related_clocksource, + .cycles = res}; +} +EXPORT_SYMBOL(convert_art_to_tsc); static void tsc_refine_calibration_work(struct work_struct *work); static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); @@ -1142,6 +1197,8 @@ static void 
tsc_refine_calibration_work(struct work_struct *work) (unsigned long)tsc_khz % 1000); out: + if (boot_cpu_has(X86_FEATURE_ART)) + art_related_clocksource = &clocksource_tsc; clocksource_register_khz(&clocksource_tsc, tsc_khz); } @@ -1235,6 +1292,8 @@ void __init tsc_init(void) mark_tsc_unstable("TSCs unsynchronized"); check_system_tsc_reliable(); + + detect_art(); } #ifdef CONFIG_SMP -- cgit v0.10.2 From 719f1aa4a67199a3c4c68a03f94e5ec44d9d5f82 Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:25 -0800 Subject: ptp: Add PTP_SYS_OFFSET_PRECISE for driver crosstimestamping Currently, network /system cross-timestamping is performed in the PTP_SYS_OFFSET ioctl. The PTP clock driver reads gettimeofday() and the gettime64() callback provided by the driver. The cross-timestamp is best effort where the latency between the capture of system time (getnstimeofday()) and the device time (driver callback) may be significant. The getcrosststamp() callback and corresponding PTP_SYS_OFFSET_PRECISE ioctl allows the driver to perform this device/system correlation when for example cross timestamp hardware is available. Modern Intel systems can do this for onboard Ethernet controllers using the ART counter. There is virtually zero latency between captures of the ART and network device clock. The capabilities ioctl (PTP_CLOCK_GETCAPS), is augmented allowing applications to query whether or not drivers implement the getcrosststamp callback, providing more precise cross timestamping. Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Acked-by: Richard Cochran Signed-off-by: Christopher S. 
Hall [jstultz: Commit subject tweaks] Signed-off-by: John Stultz diff --git a/Documentation/ptp/testptp.c b/Documentation/ptp/testptp.c index 6c6247a..d99012f 100644 --- a/Documentation/ptp/testptp.c +++ b/Documentation/ptp/testptp.c @@ -277,13 +277,15 @@ int main(int argc, char *argv[]) " %d external time stamp channels\n" " %d programmable periodic signals\n" " %d pulse per second\n" - " %d programmable pins\n", + " %d programmable pins\n" + " %d cross timestamping\n", caps.max_adj, caps.n_alarm, caps.n_ext_ts, caps.n_per_out, caps.pps, - caps.n_pins); + caps.n_pins, + caps.cross_timestamping); } } diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index da7bae9..579fd65 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "ptp_private.h" @@ -120,11 +121,13 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) struct ptp_clock_caps caps; struct ptp_clock_request req; struct ptp_sys_offset *sysoff = NULL; + struct ptp_sys_offset_precise precise_offset; struct ptp_pin_desc pd; struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock); struct ptp_clock_info *ops = ptp->info; struct ptp_clock_time *pct; struct timespec64 ts; + struct system_device_crosststamp xtstamp; int enable, err = 0; unsigned int i, pin_index; @@ -138,6 +141,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) caps.n_per_out = ptp->info->n_per_out; caps.pps = ptp->info->pps; caps.n_pins = ptp->info->n_pins; + caps.cross_timestamping = ptp->info->getcrosststamp != NULL; if (copy_to_user((void __user *)arg, &caps, sizeof(caps))) err = -EFAULT; break; @@ -180,6 +184,29 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) err = ops->enable(ops, &req, enable); break; + case PTP_SYS_OFFSET_PRECISE: + if (!ptp->info->getcrosststamp) { + err = -EOPNOTSUPP; + break; + } + err = ptp->info->getcrosststamp(ptp->info, &xtstamp); + if (err) + break; + + ts = ktime_to_timespec64(xtstamp.device); + precise_offset.device.sec = ts.tv_sec; + precise_offset.device.nsec = ts.tv_nsec; + ts = ktime_to_timespec64(xtstamp.sys_realtime); + precise_offset.sys_realtime.sec = ts.tv_sec; + precise_offset.sys_realtime.nsec = ts.tv_nsec; + ts = ktime_to_timespec64(xtstamp.sys_monoraw); + precise_offset.sys_monoraw.sec = ts.tv_sec; + precise_offset.sys_monoraw.nsec = ts.tv_nsec; + if (copy_to_user((void __user *)arg, &precise_offset, + sizeof(precise_offset))) + err = -EFAULT; + break; + case PTP_SYS_OFFSET: sysoff = kmalloc(sizeof(*sysoff), GFP_KERNEL); if (!sysoff) { diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index b8b7306..6b15e16 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -38,6 +38,7 @@ struct ptp_clock_request { }; }; +struct system_device_crosststamp; /** * struct ptp_clock_info - decribes a PTP hardware clock * @@ -67,6 +68,11 @@ struct ptp_clock_request { * @gettime64: Reads the current time from the hardware clock. * parameter ts: Holds the result. * + * @getcrosststamp: Reads the current time from the hardware clock and + * system clock simultaneously. + * parameter cts: Contains timestamp (device,system) pair, + * where system time is realtime and monotonic. + * * @settime64: Set the current time on the hardware clock. * parameter ts: Time value to set. 
* @@ -105,6 +111,8 @@ struct ptp_clock_info { int (*adjfreq)(struct ptp_clock_info *ptp, s32 delta); int (*adjtime)(struct ptp_clock_info *ptp, s64 delta); int (*gettime64)(struct ptp_clock_info *ptp, struct timespec64 *ts); + int (*getcrosststamp)(struct ptp_clock_info *ptp, + struct system_device_crosststamp *cts); int (*settime64)(struct ptp_clock_info *p, const struct timespec64 *ts); int (*enable)(struct ptp_clock_info *ptp, struct ptp_clock_request *request, int on); diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index f0b7bfe..ac6dded 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -51,7 +51,9 @@ struct ptp_clock_caps { int n_per_out; /* Number of programmable periodic signals. */ int pps; /* Whether the clock supports a PPS callback. */ int n_pins; /* Number of input/output pins. */ - int rsv[14]; /* Reserved for future use. */ + /* Whether the clock supports precise system-device cross timestamps */ + int cross_timestamping; + int rsv[13]; /* Reserved for future use. */ }; struct ptp_extts_request { @@ -81,6 +83,13 @@ struct ptp_sys_offset { struct ptp_clock_time ts[2 * PTP_MAX_SAMPLES + 1]; }; +struct ptp_sys_offset_precise { + struct ptp_clock_time device; + struct ptp_clock_time sys_realtime; + struct ptp_clock_time sys_monoraw; + unsigned int rsv[4]; /* Reserved for future use. */ +}; + enum ptp_pin_function { PTP_PF_NONE, PTP_PF_EXTTS, @@ -124,6 +133,8 @@ struct ptp_pin_desc { #define PTP_SYS_OFFSET _IOW(PTP_CLK_MAGIC, 5, struct ptp_sys_offset) #define PTP_PIN_GETFUNC _IOWR(PTP_CLK_MAGIC, 6, struct ptp_pin_desc) #define PTP_PIN_SETFUNC _IOW(PTP_CLK_MAGIC, 7, struct ptp_pin_desc) +#define PTP_SYS_OFFSET_PRECISE \ + _IOWR(PTP_CLK_MAGIC, 8, struct ptp_sys_offset_precise) struct ptp_extts_event { struct ptp_clock_time t; /* Time event occured. */ -- cgit v0.10.2 From 01d7ada57ee9c735bd71fbe44ec0bcb70847afd4 Mon Sep 17 00:00:00 2001 From: "Christopher S. Hall" Date: Mon, 22 Feb 2016 03:15:26 -0800 Subject: e1000e: Adds hardware supported cross timestamp on e1000e nic Modern Intel systems supports cross timestamping of the network device clock and Always Running Timer (ART) in hardware. This allows the device time and system time to be precisely correlated. The timestamp pair is returned through e1000e_phc_get_syncdevicetime() used by get_system_device_crosststamp(). The hardware cross-timestamp result is made available to applications through the PTP_SYS_OFFSET_PRECISE ioctl which calls e1000e_phc_getcrosststamp(). Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: Jeff Kirsher Cc: kevin.b.stanton@intel.com Cc: kevin.j.clarke@intel.com Cc: hpa@zytor.com Cc: jeffrey.t.kirsher@intel.com Cc: netdev@vger.kernel.org Acked-by: Jeff Kirsher Signed-off-by: Christopher S. Hall [jstultz: Reworked to use new interface, commit message tweaks] Signed-off-by: John Stultz diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index fa593dd..3772f3a 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -83,6 +83,15 @@ config E1000E To compile this driver as a module, choose M here. The module will be called e1000e. +config E1000E_HWTS + bool "Support HW cross-timestamp on PCH devices" + default y + depends on E1000E && X86 + ---help--- + Say Y to enable hardware supported cross-timestamping on PCH + devices. 
The cross-timestamp is available through the PTP clock + driver precise cross-timestamp ioctl (PTP_SYS_OFFSET_PRECISE). + config IGB tristate "Intel(R) 82575/82576 PCI-Express Gigabit Ethernet support" depends on PCI diff --git a/drivers/net/ethernet/intel/e1000e/defines.h b/drivers/net/ethernet/intel/e1000e/defines.h index f7c7804..0641c00 100644 --- a/drivers/net/ethernet/intel/e1000e/defines.h +++ b/drivers/net/ethernet/intel/e1000e/defines.h @@ -528,6 +528,11 @@ #define E1000_RXCW_C 0x20000000 /* Receive config */ #define E1000_RXCW_SYNCH 0x40000000 /* Receive config synch */ +/* HH Time Sync */ +#define E1000_TSYNCTXCTL_MAX_ALLOWED_DLY_MASK 0x0000F000 /* max delay */ +#define E1000_TSYNCTXCTL_SYNC_COMP 0x40000000 /* sync complete */ +#define E1000_TSYNCTXCTL_START_SYNC 0x80000000 /* initiate sync */ + #define E1000_TSYNCTXCTL_VALID 0x00000001 /* Tx timestamp valid */ #define E1000_TSYNCTXCTL_ENABLED 0x00000010 /* enable Tx timestamping */ diff --git a/drivers/net/ethernet/intel/e1000e/ptp.c b/drivers/net/ethernet/intel/e1000e/ptp.c index 25a0ad5..e2ff3ef 100644 --- a/drivers/net/ethernet/intel/e1000e/ptp.c +++ b/drivers/net/ethernet/intel/e1000e/ptp.c @@ -26,6 +26,12 @@ #include "e1000.h" +#ifdef CONFIG_E1000E_HWTS +#include +#include +#include +#endif + /** * e1000e_phc_adjfreq - adjust the frequency of the hardware clock * @ptp: ptp clock structure @@ -98,6 +104,78 @@ static int e1000e_phc_adjtime(struct ptp_clock_info *ptp, s64 delta) return 0; } +#ifdef CONFIG_E1000E_HWTS +#define MAX_HW_WAIT_COUNT (3) + +/** + * e1000e_phc_get_syncdevicetime - Callback given to timekeeping code reads system/device registers + * @device: current device time + * @system: system counter value read synchronously with device time + * @ctx: context provided by timekeeping code + * + * Read device and system (ART) clock simultaneously and return the corrected + * clock values in ns. + **/ +static int e1000e_phc_get_syncdevicetime(ktime_t *device, + struct system_counterval_t *system, + void *ctx) +{ + struct e1000_adapter *adapter = (struct e1000_adapter *)ctx; + struct e1000_hw *hw = &adapter->hw; + unsigned long flags; + int i; + u32 tsync_ctrl; + cycle_t dev_cycles; + cycle_t sys_cycles; + + tsync_ctrl = er32(TSYNCTXCTL); + tsync_ctrl |= E1000_TSYNCTXCTL_START_SYNC | + E1000_TSYNCTXCTL_MAX_ALLOWED_DLY_MASK; + ew32(TSYNCTXCTL, tsync_ctrl); + for (i = 0; i < MAX_HW_WAIT_COUNT; ++i) { + udelay(1); + tsync_ctrl = er32(TSYNCTXCTL); + if (tsync_ctrl & E1000_TSYNCTXCTL_SYNC_COMP) + break; + } + + if (i == MAX_HW_WAIT_COUNT) + return -ETIMEDOUT; + + dev_cycles = er32(SYSSTMPH); + dev_cycles <<= 32; + dev_cycles |= er32(SYSSTMPL); + spin_lock_irqsave(&adapter->systim_lock, flags); + *device = ns_to_ktime(timecounter_cyc2time(&adapter->tc, dev_cycles)); + spin_unlock_irqrestore(&adapter->systim_lock, flags); + + sys_cycles = er32(PLTSTMPH); + sys_cycles <<= 32; + sys_cycles |= er32(PLTSTMPL); + *system = convert_art_to_tsc(sys_cycles); + + return 0; +} + +/** + * e1000e_phc_getsynctime - Reads the current system/device cross timestamp + * @ptp: ptp clock structure + * @cts: structure containing timestamp + * + * Read device and system (ART) clock simultaneously and return the scaled + * clock values in ns. 
+ **/ +static int e1000e_phc_getcrosststamp(struct ptp_clock_info *ptp, + struct system_device_crosststamp *xtstamp) +{ + struct e1000_adapter *adapter = container_of(ptp, struct e1000_adapter, + ptp_clock_info); + + return get_device_system_crosststamp(e1000e_phc_get_syncdevicetime, + adapter, NULL, xtstamp); +} +#endif/*CONFIG_E1000E_HWTS*/ + /** * e1000e_phc_gettime - Reads the current time from the hardware clock * @ptp: ptp clock structure @@ -236,6 +314,13 @@ void e1000e_ptp_init(struct e1000_adapter *adapter) break; } +#ifdef CONFIG_E1000E_HWTS + /* CPU must have ART and GBe must be from Sunrise Point or greater */ + if (hw->mac.type >= e1000_pch_spt && boot_cpu_has(X86_FEATURE_ART)) + adapter->ptp_clock_info.getcrosststamp = + e1000e_phc_getcrosststamp; +#endif/*CONFIG_E1000E_HWTS*/ + INIT_DELAYED_WORK(&adapter->systim_overflow_work, e1000e_systim_overflow_work); diff --git a/drivers/net/ethernet/intel/e1000e/regs.h b/drivers/net/ethernet/intel/e1000e/regs.h index 1d5e0b7..0cb4d36 100644 --- a/drivers/net/ethernet/intel/e1000e/regs.h +++ b/drivers/net/ethernet/intel/e1000e/regs.h @@ -245,6 +245,10 @@ #define E1000_SYSTIML 0x0B600 /* System time register Low - RO */ #define E1000_SYSTIMH 0x0B604 /* System time register High - RO */ #define E1000_TIMINCA 0x0B608 /* Increment attributes register - RW */ +#define E1000_SYSSTMPL 0x0B648 /* HH Timesync system stamp low register */ +#define E1000_SYSSTMPH 0x0B64C /* HH Timesync system stamp hi register */ +#define E1000_PLTSTMPL 0x0B640 /* HH Timesync platform stamp low register */ +#define E1000_PLTSTMPH 0x0B644 /* HH Timesync platform stamp hi register */ #define E1000_RXMTRL 0x0B634 /* Time sync Rx EtherType and Msg Type - RW */ #define E1000_RXUDP 0x0B638 /* Time Sync Rx UDP Port - RW */ -- cgit v0.10.2 From 909c3a22da3b8d2cfd3505ca5658f0176859d400 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 2 Mar 2016 15:49:38 +0000 Subject: Btrfs: fix loading of orphan roots leading to BUG_ON When looking for orphan roots during mount we can end up hitting a BUG_ON() (at root-item.c:btrfs_find_orphan_roots()) if a log tree is replayed and qgroups are enabled. This is because after a log tree is replayed, a transaction commit is made, which triggers qgroup extent accounting which in turn does backref walking which ends up reading and inserting all roots in the radix tree fs_info->fs_root_radix, including orphan roots (deleted snapshots). So after the log tree is replayed, when finding orphan roots we hit the BUG_ON with the following trace: [118209.182438] ------------[ cut here ]------------ [118209.183279] kernel BUG at fs/btrfs/root-tree.c:314! 
[118209.184074] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [118209.185123] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic ppdev xor raid6_pq evdev sg parport_pc parport acpi_cpufreq tpm_tis tpm psmouse processor i2c_piix4 serio_raw pcspkr i2c_core button loop autofs4 ext4 crc16 mbcache jbd2 sd_mod sr_mod cdrom ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring virtio scsi_mod e1000 floppy [last unloaded: btrfs] [118209.186318] CPU: 14 PID: 28428 Comm: mount Tainted: G W 4.5.0-rc5-btrfs-next-24+ #1 [118209.186318] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014 [118209.186318] task: ffff8801ec131040 ti: ffff8800af34c000 task.ti: ffff8800af34c000 [118209.186318] RIP: 0010:[] [] btrfs_find_orphan_roots+0x1fc/0x244 [btrfs] [118209.186318] RSP: 0018:ffff8800af34faa8 EFLAGS: 00010246 [118209.186318] RAX: 00000000ffffffef RBX: 00000000ffffffef RCX: 0000000000000001 [118209.186318] RDX: 0000000080000000 RSI: 0000000000000001 RDI: 00000000ffffffff [118209.186318] RBP: ffff8800af34fb08 R08: 0000000000000001 R09: 0000000000000000 [118209.186318] R10: ffff8800af34f9f0 R11: 6db6db6db6db6db7 R12: ffff880171b97000 [118209.186318] R13: ffff8801ca9d65e0 R14: ffff8800afa2e000 R15: 0000160000000000 [118209.186318] FS: 00007f5bcb914840(0000) GS:ffff88023edc0000(0000) knlGS:0000000000000000 [118209.186318] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [118209.186318] CR2: 00007f5bcaceb5d9 CR3: 00000000b49b5000 CR4: 00000000000006e0 [118209.186318] Stack: [118209.186318] fffffbffffffffff 010230ffffffffff 0101000000000000 ff84000000000000 [118209.186318] fbffffffffffffff 30ffffffffffffff 0000000000000101 ffff880082348000 [118209.186318] 0000000000000000 ffff8800afa2e000 ffff8800afa2e000 0000000000000000 [118209.186318] Call Trace: [118209.186318] [] open_ctree+0x1e37/0x21b9 [btrfs] [118209.186318] [] btrfs_mount+0x97e/0xaed [btrfs] [118209.186318] [] ? trace_hardirqs_on+0xd/0xf [118209.186318] [] mount_fs+0x67/0x131 [118209.186318] [] vfs_kern_mount+0x6c/0xde [118209.186318] [] btrfs_mount+0x1ac/0xaed [btrfs] [118209.186318] [] ? trace_hardirqs_on+0xd/0xf [118209.186318] [] ? lockdep_init_map+0xb9/0x1b3 [118209.186318] [] mount_fs+0x67/0x131 [118209.186318] [] vfs_kern_mount+0x6c/0xde [118209.186318] [] do_mount+0x8a6/0x9e8 [118209.186318] [] SyS_mount+0x77/0x9f [118209.186318] [] entry_SYSCALL_64_fastpath+0x12/0x6b [118209.186318] Code: 64 00 00 85 c0 89 c3 75 24 f0 41 80 4c 24 20 20 49 8b bc 24 f0 01 00 00 4c 89 e6 e8 e8 65 00 00 85 c0 89 c3 74 11 83 f8 ef 75 02 <0f> 0b 4c 89 e7 e8 da 72 00 00 eb 1c 41 83 bc 24 00 01 00 00 00 [118209.186318] RIP [] btrfs_find_orphan_roots+0x1fc/0x244 [btrfs] [118209.186318] RSP [118209.230735] ---[ end trace 83938f987d85d477 ]--- So fix this by not treating the error -EEXIST, returned when attempting to insert a root already inserted by the backref walking code, as an error. The following test case for xfstests reproduces the bug: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { _cleanup_flakey cd / rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter . 
./common/dmflakey # real QA test starts here _supported_fs btrfs _supported_os Linux _require_scratch _require_dm_target flakey _require_metadata_journaling $SCRATCH_DEV rm -f $seqres.full _scratch_mkfs >>$seqres.full 2>&1 _init_flakey _mount_flakey _run_btrfs_util_prog quota enable $SCRATCH_MNT # Create 2 directories with one file in one of them. # We use these just to trigger a transaction commit later, moving the file from # directory a to directory b and doing an fsync against directory a. mkdir $SCRATCH_MNT/a mkdir $SCRATCH_MNT/b touch $SCRATCH_MNT/a/f sync # Create our test file with 2 4K extents. $XFS_IO_PROG -f -s -c "pwrite -S 0xaa 0 8K" $SCRATCH_MNT/foobar | _filter_xfs_io # Create a snapshot and delete it. This doesn't really delete the snapshot # immediately, just makes it inaccessible and invisible to user space, the # snapshot is deleted later by a dedicated kernel thread (cleaner kthread) # which is woke up at the next transaction commit. # A root orphan item is inserted into the tree of tree roots, so that if a # power failure happens before the dedicated kernel thread does the snapshot # deletion, the next time the filesystem is mounted it resumes the snapshot # deletion. _run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT $SCRATCH_MNT/snap _run_btrfs_util_prog subvolume delete $SCRATCH_MNT/snap # Now overwrite half of the extents we wrote before. Because we made a snapshpot # before, which isn't really deleted yet (since no transaction commit happened # after we did the snapshot delete request), the non overwritten extents get # referenced twice, once by the default subvolume and once by the snapshot. $XFS_IO_PROG -c "pwrite -S 0xbb 4K 8K" $SCRATCH_MNT/foobar | _filter_xfs_io # Now move file f from directory a to directory b and fsync directory a. # The fsync on the directory a triggers a transaction commit (because a file # was moved from it to another directory) and the file fsync leaves a log tree # with file extent items to replay. mv $SCRATCH_MNT/a/f $SCRATCH_MNT/a/b $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/a $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foobar echo "File digest before power failure:" md5sum $SCRATCH_MNT/foobar | _filter_scratch # Now simulate a power failure and mount the filesystem to replay the log tree. # After the log tree was replayed, we used to hit a BUG_ON() when processing # the root orphan item for the deleted snapshot. This is because when processing # an orphan root the code expected to be the first code inserting the root into # the fs_info->fs_root_radix radix tree, while in reallity it was the second # caller attempting to do it - the first caller was the transaction commit that # took place after replaying the log tree, when updating the qgroup counters. _flakey_drop_and_remount echo "File digest before after failure:" # Must match what he got before the power failure. 
md5sum $SCRATCH_MNT/foobar | _filter_scratch _unmount_flakey status=0 exit Fixes: 2d9e97761087 ("Btrfs: use btrfs_get_fs_root in resolve_indirect_ref") Cc: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana Reviewed-by: Qu Wenruo Signed-off-by: Chris Mason diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 7cf8509..2c849b0 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -310,8 +310,16 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); err = btrfs_insert_fs_root(root->fs_info, root); + /* + * The root might have been inserted already, as before we look + * for orphan roots, log replay might have happened, which + * triggers a transaction commit and qgroup accounting, which + * in turn reads and inserts fs roots while doing backref + * walking. + */ + if (err == -EEXIST) + err = 0; if (err) { - BUG_ON(err == -EEXIST); btrfs_free_fs_root(root); break; } -- cgit v0.10.2 From 37c5e942bb2eedd98c1cd1fa1f94d79f6b830c38 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Wed, 2 Mar 2016 22:51:04 -0600 Subject: powerpc/fsl-book3e: Avoid lbarx on e5500 lbarx/stbcx. are implemented on e6500, but not on e5500. Likewise, SMT is on e6500, but not on e5500. So, avoid executing an unimplemented instruction by only locking when needed (i.e. in the presence of SMT). Signed-off-by: Scott Wood diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c index 7e6d088..83a8be7 100644 --- a/arch/powerpc/mm/hugetlbpage-book3e.c +++ b/arch/powerpc/mm/hugetlbpage-book3e.c @@ -8,6 +8,8 @@ #include #include +#include + #ifdef CONFIG_PPC_FSL_BOOK3E #ifdef CONFIG_PPC64 static inline int tlb1_next(void) @@ -60,6 +62,14 @@ static inline void book3e_tlb_lock(void) unsigned long tmp; int token = smp_processor_id() + 1; + /* + * Besides being unnecessary in the absence of SMT, this + * check prevents trying to do lbarx/stbcx. on e5500 which + * doesn't implement either feature. + */ + if (!cpu_has_feature(CPU_FTR_SMT)) + return; + asm volatile("1: lbarx %0, 0, %1;" "cmpwi %0, 0;" "bne 2f;" @@ -80,6 +90,9 @@ static inline void book3e_tlb_unlock(void) { struct paca_struct *paca = get_paca(); + if (!cpu_has_feature(CPU_FTR_SMT)) + return; + isync(); paca->tcd_ptr->lock = 0; } -- cgit v0.10.2 From ec75a940b1037e877efd9a5a9e94eab1e464f73b Mon Sep 17 00:00:00 2001 From: Libin Yang Date: Fri, 4 Mar 2016 14:33:06 +0800 Subject: ALSA: hda - hdmi add wmb barrier for audio component To make sure audio_ptr is set before intel_audio_codec_enable() or intel_audio_codec_disable() calling pin_eld_notify(), this patch adds wmb barrier to prevent optimizing. 
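The ordering requirement is the usual publish pattern: once pin_eld_notify is visible to the gfx side it may be invoked at any time, and the callback dereferences audio_ptr, so the pointer store has to become visible first. A minimal sketch of that pattern, using a hypothetical ops structure that only stands in for the driver's i915 audio ops (not a drop-in for the patch below):

    struct audio_ops_sketch {                     /* illustrative only */
            void *audio_ptr;                      /* data the callback uses */
            void (*pin_eld_notify)(void *audio_ptr, int port);
    };

    static void publish_notifier(struct audio_ops_sketch *ops, void *codec,
                                 void (*notify)(void *audio_ptr, int port))
    {
            ops->audio_ptr = codec;               /* 1: publish the data      */
            wmb();                                /* order the two stores     */
            ops->pin_eld_notify = notify;         /* 2: only now expose hook  */
    }

The barrier only constrains the writer; the consumer is assumed to observe the callback pointer before it fetches audio_ptr.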
Signed-off-by: Libin Yang Signed-off-by: Takashi Iwai diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 8ee78db..6858e88 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -2480,6 +2480,11 @@ static int patch_generic_hdmi(struct hda_codec *codec) if (codec_has_acomp(codec)) { codec->depop_delay = 0; spec->i915_audio_ops.audio_ptr = codec; + /* intel_audio_codec_enable() or intel_audio_codec_disable() + * will call pin_eld_notify with using audio_ptr pointer + * We need make sure audio_ptr is really setup + */ + wmb(); spec->i915_audio_ops.pin_eld_notify = intel_pin_eld_notify; snd_hdac_i915_register_notifier(&spec->i915_audio_ops); } -- cgit v0.10.2 From 790b415c98de62602810b0eedce26f0f9d6ddd78 Mon Sep 17 00:00:00 2001 From: Libin Yang Date: Fri, 4 Mar 2016 14:33:43 +0800 Subject: ALSA: hda - hdmi defer to register acomp eld notifier Defer to register acomp eld notifier until hdmi audio driver is fully ready. After registering eld notifier, gfx driver can use this callback function to notify audio driver the monitor connection event. However this action may happen when audio driver is adding the pins or doing other initialization. This is not always safe, however. For example, using per_pin->lock before the lock is initialized. Let's register the eld notifier after the initialization is done. Signed-off-by: Libin Yang Signed-off-by: Takashi Iwai diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 6858e88..bcbc4ee 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -2477,18 +2477,6 @@ static int patch_generic_hdmi(struct hda_codec *codec) is_broxton(codec)) codec->core.link_power_control = 1; - if (codec_has_acomp(codec)) { - codec->depop_delay = 0; - spec->i915_audio_ops.audio_ptr = codec; - /* intel_audio_codec_enable() or intel_audio_codec_disable() - * will call pin_eld_notify with using audio_ptr pointer - * We need make sure audio_ptr is really setup - */ - wmb(); - spec->i915_audio_ops.pin_eld_notify = intel_pin_eld_notify; - snd_hdac_i915_register_notifier(&spec->i915_audio_ops); - } - if (hdmi_parse_codec(codec) < 0) { if (spec->i915_bound) snd_hdac_i915_exit(&codec->bus->core); @@ -2510,6 +2498,18 @@ static int patch_generic_hdmi(struct hda_codec *codec) init_channel_allocations(); + if (codec_has_acomp(codec)) { + codec->depop_delay = 0; + spec->i915_audio_ops.audio_ptr = codec; + /* intel_audio_codec_enable() or intel_audio_codec_disable() + * will call pin_eld_notify with using audio_ptr pointer + * We need make sure audio_ptr is really setup + */ + wmb(); + spec->i915_audio_ops.pin_eld_notify = intel_pin_eld_notify; + snd_hdac_i915_register_notifier(&spec->i915_audio_ops); + } + return 0; } -- cgit v0.10.2 From d694b06c42a56c9610861ee0474b4d441af8ada6 Mon Sep 17 00:00:00 2001 From: Ed Spiridonov Date: Fri, 4 Mar 2016 09:07:27 +0300 Subject: can: mcp251x: avoid write to error flag register if it's unnecessary Only two bits (RX0OVR and RX1OVR) are writable in EFLG, write is useless if these bits aren't set. 
Signed-off-by: Ed Spiridonov Signed-off-by: Marc Kleine-Budde diff --git a/drivers/net/can/spi/mcp251x.c b/drivers/net/can/spi/mcp251x.c index 575790e..74a7dfe 100644 --- a/drivers/net/can/spi/mcp251x.c +++ b/drivers/net/can/spi/mcp251x.c @@ -843,7 +843,7 @@ static irqreturn_t mcp251x_can_ist(int irq, void *dev_id) if (clear_intf) mcp251x_write_bits(spi, CANINTF, clear_intf, 0x00); - if (eflag) + if (eflag & (EFLG_RX0OVR | EFLG_RX1OVR)) mcp251x_write_bits(spi, EFLG, eflag, 0x00); /* Update can state */ -- cgit v0.10.2 From e57cbaf0eb006eaa207395f3bfd7ce52c1b5539c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 3 Mar 2016 17:18:20 -0500 Subject: tracing: Do not have 'comm' filter override event 'comm' field Commit 9f61668073a8d "tracing: Allow triggers to filter for CPU ids and process names" added a 'comm' filter that will filter events based on the current tasks struct 'comm'. But this now hides the ability to filter events that have a 'comm' field too. For example, sched_migrate_task trace event. That has a 'comm' field of the task to be migrated. echo 'comm == "bash"' > events/sched_migrate_task/filter will now filter all sched_migrate_task events for tasks named "bash" that migrates other tasks (in interrupt context), instead of seeing when "bash" itself gets migrated. This fix requires a couple of changes. 1) Change the look up order for filter predicates to look at the events fields before looking at the generic filters. 2) Instead of basing the filter function off of the "comm" name, have the generic "comm" filter have its own filter_type (FILTER_COMM). Test against the type instead of the name to assign the filter function. 3) Add a new "COMM" filter that works just like "comm" but will filter based on the current task, even if the trace event contains a "comm" field. Do the same for "cpu" field, adding a FILTER_CPU and a filter "CPU". 
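With this change the two spellings select different predicates; as a usage sketch (paths relative to the tracing directory, mirroring the command quoted above): echo 'comm == "bash"' > events/sched_migrate_task/filter once again matches events whose own comm field is "bash", i.e. bash being the task that is migrated, while echo 'COMM == "bash"' > events/sched_migrate_task/filter matches events emitted while the current task is named "bash". The "cpu"/"CPU" pair behaves analogously for the CPU id.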
Cc: stable@vger.kernel.org # v4.3+ Fixes: 9f61668073a8d "tracing: Allow triggers to filter for CPU ids and process names" Reported-by: Matt Fleming Signed-off-by: Steven Rostedt diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 429fdfc..925730b 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -568,6 +568,8 @@ enum { FILTER_DYN_STRING, FILTER_PTR_STRING, FILTER_TRACE_FN, + FILTER_COMM, + FILTER_CPU, }; extern int trace_event_raw_init(struct trace_event_call *call); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ab09829..05ddc08 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -97,16 +97,16 @@ trace_find_event_field(struct trace_event_call *call, char *name) struct ftrace_event_field *field; struct list_head *head; - field = __find_event_field(&ftrace_generic_fields, name); + head = trace_get_fields(call); + field = __find_event_field(head, name); if (field) return field; - field = __find_event_field(&ftrace_common_fields, name); + field = __find_event_field(&ftrace_generic_fields, name); if (field) return field; - head = trace_get_fields(call); - return __find_event_field(head, name); + return __find_event_field(&ftrace_common_fields, name); } static int __trace_define_field(struct list_head *head, const char *type, @@ -171,8 +171,10 @@ static int trace_define_generic_fields(void) { int ret; - __generic_field(int, cpu, FILTER_OTHER); - __generic_field(char *, comm, FILTER_PTR_STRING); + __generic_field(int, CPU, FILTER_CPU); + __generic_field(int, cpu, FILTER_CPU); + __generic_field(char *, COMM, FILTER_COMM); + __generic_field(char *, comm, FILTER_COMM); return ret; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f93a219..6816302 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1043,13 +1043,14 @@ static int init_pred(struct filter_parse_state *ps, return -EINVAL; } - if (is_string_field(field)) { + if (field->filter_type == FILTER_COMM) { + filter_build_regex(pred); + fn = filter_pred_comm; + pred->regex.field_len = TASK_COMM_LEN; + } else if (is_string_field(field)) { filter_build_regex(pred); - if (!strcmp(field->name, "comm")) { - fn = filter_pred_comm; - pred->regex.field_len = TASK_COMM_LEN; - } else if (field->filter_type == FILTER_STATIC_STRING) { + if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) @@ -1072,7 +1073,7 @@ static int init_pred(struct filter_parse_state *ps, } pred->val = val; - if (!strcmp(field->name, "cpu")) + if (field->filter_type == FILTER_CPU) fn = filter_pred_cpu; else fn = select_comparison_fn(pred->op, field->size, -- cgit v0.10.2 From 097452e61366a939a4772332181cea7cdcc74760 Mon Sep 17 00:00:00 2001 From: Alexandre Courbot Date: Fri, 26 Feb 2016 18:06:52 +0900 Subject: gpu: host1x: Set DMA mask The default DMA mask covers a 32 bits address range, but host1x devices can address a larger range on TK1 and TX1. Set the DMA mask to the range addressable when we use the IOMMU to prevent the use of bounce buffers. 
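For scale, DMA_BIT_MASK(34) expands to ((1ULL << 34) - 1), so the host1x04/host1x05 entries below advertise a 16 GiB addressable range instead of the 4 GiB default, and dma_set_mask_and_coherent() applies it to both the streaming and coherent masks. The call in the patch ignores the return value; a hedged sketch of the stricter form (illustrative fragment, not what the patch does):

    /* Ask for the wider mask; if the platform rejects it, the previously
     * configured (32-bit default) mask stays in effect. */
    err = dma_set_mask_and_coherent(host->dev, host->info->dma_mask);
    if (err)
            dev_warn(host->dev, "DMA mask %llx not usable, keeping default\n",
                     (unsigned long long)host->info->dma_mask);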
Signed-off-by: Alexandre Courbot Signed-off-by: Thierry Reding diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c index 314bf37..ff34869 100644 --- a/drivers/gpu/host1x/dev.c +++ b/drivers/gpu/host1x/dev.c @@ -23,6 +23,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -68,6 +69,7 @@ static const struct host1x_info host1x01_info = { .nb_bases = 8, .init = host1x01_init, .sync_offset = 0x3000, + .dma_mask = DMA_BIT_MASK(32), }; static const struct host1x_info host1x02_info = { @@ -77,6 +79,7 @@ static const struct host1x_info host1x02_info = { .nb_bases = 12, .init = host1x02_init, .sync_offset = 0x3000, + .dma_mask = DMA_BIT_MASK(32), }; static const struct host1x_info host1x04_info = { @@ -86,6 +89,7 @@ static const struct host1x_info host1x04_info = { .nb_bases = 64, .init = host1x04_init, .sync_offset = 0x2100, + .dma_mask = DMA_BIT_MASK(34), }; static const struct host1x_info host1x05_info = { @@ -95,6 +99,7 @@ static const struct host1x_info host1x05_info = { .nb_bases = 64, .init = host1x05_init, .sync_offset = 0x2100, + .dma_mask = DMA_BIT_MASK(34), }; static struct of_device_id host1x_of_match[] = { @@ -148,6 +153,8 @@ static int host1x_probe(struct platform_device *pdev) if (IS_ERR(host->regs)) return PTR_ERR(host->regs); + dma_set_mask_and_coherent(host->dev, host->info->dma_mask); + if (host->info->init) { err = host->info->init(host); if (err) diff --git a/drivers/gpu/host1x/dev.h b/drivers/gpu/host1x/dev.h index 0b6e8e9..dace124 100644 --- a/drivers/gpu/host1x/dev.h +++ b/drivers/gpu/host1x/dev.h @@ -96,6 +96,7 @@ struct host1x_info { int nb_mlocks; /* host1x: number of mlocks */ int (*init)(struct host1x *); /* initialize per SoC ops */ int sync_offset; + u64 dma_mask; /* mask of addressable memory */ }; struct host1x { -- cgit v0.10.2 From c95469aa5a188384ccf8ac520ece931c66caf8aa Mon Sep 17 00:00:00 2001 From: Alexandre Courbot Date: Fri, 26 Feb 2016 18:06:53 +0900 Subject: gpu: host1x: Set DMA ops on device creation Currently host1x-instanciated devices have their dma_ops left to NULL, which makes any DMA operation (like buffer import) on ARM64 fallback to the dummy_dma_ops and fail with an error. This patch calls of_dma_configure() with the host1x node when creating such a device, so the proper DMA operations are set. Suggested-by: Thierry Reding Signed-off-by: Alexandre Courbot Signed-off-by: Thierry Reding diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c index da462af..dd2dbb9 100644 --- a/drivers/gpu/host1x/bus.c +++ b/drivers/gpu/host1x/bus.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "bus.h" #include "dev.h" @@ -394,6 +395,7 @@ static int host1x_device_add(struct host1x *host1x, device->dev.coherent_dma_mask = host1x->dev->coherent_dma_mask; device->dev.dma_mask = &device->dev.coherent_dma_mask; dev_set_name(&device->dev, "%s", driver->driver.name); + of_dma_configure(&device->dev, host1x->dev->of_node); device->dev.release = host1x_device_release; device->dev.bus = &host1x_bus_type; device->dev.parent = host1x->dev; -- cgit v0.10.2 From 5ea5c5e0a7f70b256417d3b6e36bd9851504babd Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 14 Feb 2016 18:06:41 +0800 Subject: ceph: initial CEPH_FEATURE_FS_FILE_LAYOUT_V2 support Add support for the format change of MClientReply/MclientCaps. Also add code that denies access to inodes with pool_ns layouts. 
Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c222137..19adeb0 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1756,6 +1756,10 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) u32 pool; int ret, flags; + /* does not support pool namespace yet */ + if (ci->i_pool_ns_len) + return -EIO; + if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), NOPOOLPERM)) return 0; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index cdbf8cf..6fe0ad2 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2753,7 +2753,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, void *inline_data, int inline_len, struct ceph_buffer *xattr_buf, struct ceph_mds_session *session, - struct ceph_cap *cap, int issued) + struct ceph_cap *cap, int issued, + u32 pool_ns_len) __releases(ci->i_ceph_lock) __releases(mdsc->snap_rwsem) { @@ -2873,6 +2874,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { /* file layout may have changed */ ci->i_layout = grant->layout; + ci->i_pool_ns_len = pool_ns_len; + /* size/truncate_seq? */ queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(grant->truncate_seq), @@ -3411,6 +3414,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, u32 inline_len = 0; void *snaptrace; size_t snaptrace_len; + u32 pool_ns_len = 0; void *p, *end; dout("handle_caps from mds%d\n", mds); @@ -3463,6 +3467,21 @@ void ceph_handle_caps(struct ceph_mds_session *session, p += inline_len; } + if (le16_to_cpu(msg->hdr.version) >= 8) { + u64 flush_tid; + u32 caller_uid, caller_gid; + u32 osd_epoch_barrier; + /* version >= 5 */ + ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad); + /* version >= 6 */ + ceph_decode_64_safe(&p, end, flush_tid, bad); + /* version >= 7 */ + ceph_decode_32_safe(&p, end, caller_uid, bad); + ceph_decode_32_safe(&p, end, caller_gid, bad); + /* version >= 8 */ + ceph_decode_32_safe(&p, end, pool_ns_len, bad); + } + /* lookup ino */ inode = ceph_find_inode(sb, vino); ci = ceph_inode(inode); @@ -3518,7 +3537,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, &cap, &issued); handle_cap_grant(mdsc, inode, h, inline_version, inline_data, inline_len, - msg->middle, session, cap, issued); + msg->middle, session, cap, issued, + pool_ns_len); if (realm) ceph_put_snap_realm(mdsc, realm); goto done_unlocked; @@ -3542,7 +3562,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, issued |= __ceph_caps_dirty(ci); handle_cap_grant(mdsc, inode, h, inline_version, inline_data, inline_len, - msg->middle, session, cap, issued); + msg->middle, session, cap, issued, + pool_ns_len); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index fb4ba2e..5849b88 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -396,6 +396,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_symlink = NULL; memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); + ci->i_pool_ns_len = 0; ci->i_fragtree = RB_ROOT; mutex_init(&ci->i_fragtree_mutex); @@ -756,6 +757,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool) ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; ci->i_layout = info->layout; + ci->i_pool_ns_len = iinfo->pool_ns_len; queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(info->truncate_seq), diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index e7b130a..911d64d 100644 --- 
a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -100,6 +100,14 @@ static int parse_reply_info_in(void **p, void *end, } else info->inline_version = CEPH_INLINE_NONE; + if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { + ceph_decode_32_safe(p, end, info->pool_ns_len, bad); + ceph_decode_need(p, end, info->pool_ns_len, bad); + *p += info->pool_ns_len; + } else { + info->pool_ns_len = 0; + } + return 0; bad: return err; @@ -2298,6 +2306,14 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); + /* deny access to directories with pool_ns layouts */ + if (req->r_inode && S_ISDIR(req->r_inode->i_mode) && + ceph_inode(req->r_inode)->i_pool_ns_len) + return -EIO; + if (req->r_locked_dir && + ceph_inode(req->r_locked_dir)->i_pool_ns_len) + return -EIO; + /* issue */ mutex_lock(&mdsc->mutex); __register_request(mdsc, req, dir); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ccf11ef..37712cc 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -44,6 +44,7 @@ struct ceph_mds_reply_info_in { u64 inline_version; u32 inline_len; char *inline_data; + u32 pool_ns_len; }; /* diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 75b7d12..9c458eb 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -287,6 +287,7 @@ struct ceph_inode_info { struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; + size_t i_pool_ns_len; char *i_symlink; /* for dirs */ diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index c1ef6f1..15151f3 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -75,6 +75,7 @@ #define CEPH_FEATURE_CRUSH_TUNABLES5 (1ULL<<58) /* chooseleaf stable mode */ // duplicated since it was introduced at the same time as CEPH_FEATURE_CRUSH_TUNABLES5 #define CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING (1ULL<<58) /* New, v7 encoding */ +#define CEPH_FEATURE_FS_FILE_LAYOUT_V2 (1ULL<<58) /* file_layout_t */ /* * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature -- cgit v0.10.2 From e723e3f7f9591b79e8c56b3d7c5a204a9c571b55 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Fri, 4 Mar 2016 01:42:49 +0000 Subject: MIPS: traps: Fix SIGFPE information leak from `do_ov' and `do_trap_or_bp' Avoid sending a partially initialised `siginfo_t' structure along SIGFPE signals issued from `do_ov' and `do_trap_or_bp', leading to information leaking from the kernel stack. Signed-off-by: Maciej W. 
Rozycki Cc: stable@vger.kernel.org Signed-off-by: Ralf Baechle diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index ae790c5..bf14da9 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -690,15 +690,15 @@ static int simulate_sync(struct pt_regs *regs, unsigned int opcode) asmlinkage void do_ov(struct pt_regs *regs) { enum ctx_state prev_state; - siginfo_t info; + siginfo_t info = { + .si_signo = SIGFPE, + .si_code = FPE_INTOVF, + .si_addr = (void __user *)regs->cp0_epc, + }; prev_state = exception_enter(); die_if_kernel("Integer overflow", regs); - info.si_code = FPE_INTOVF; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_addr = (void __user *) regs->cp0_epc; force_sig_info(SIGFPE, &info, current); exception_exit(prev_state); } @@ -874,7 +874,7 @@ out: void do_trap_or_bp(struct pt_regs *regs, unsigned int code, const char *str) { - siginfo_t info; + siginfo_t info = { 0 }; char b[40]; #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP @@ -903,7 +903,6 @@ void do_trap_or_bp(struct pt_regs *regs, unsigned int code, else info.si_code = FPE_INTOVF; info.si_signo = SIGFPE; - info.si_errno = 0; info.si_addr = (void __user *) regs->cp0_epc; force_sig_info(SIGFPE, &info, current); break; -- cgit v0.10.2 From f474c8c857d996f34c39f66bbed23faaa739fad6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Penttil=C3=A4?= Date: Mon, 22 Feb 2016 17:56:52 +0100 Subject: ARM: 8544/1: set_memory_xx fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow zero size updates. This makes set_memory_xx() consistent with x86, s390 and arm64 and makes apply_to_page_range() not to BUG() when loading modules. Signed-off-by: Mika Penttilä mika.penttila@nextfour.com Signed-off-by: Russell King diff --git a/arch/arm/mm/pageattr.c b/arch/arm/mm/pageattr.c index cf30daf..d19b1ad 100644 --- a/arch/arm/mm/pageattr.c +++ b/arch/arm/mm/pageattr.c @@ -49,6 +49,9 @@ static int change_memory_common(unsigned long addr, int numpages, WARN_ON_ONCE(1); } + if (!numpages) + return 0; + if (start < MODULES_VADDR || start >= MODULES_END) return -EINVAL; -- cgit v0.10.2 From 6e2452dff4441e3dc24d415c8b2cda8a3ba52116 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 3 Mar 2016 15:39:41 -0700 Subject: nfit: Continue init even if ARS commands are unimplemented If firmware doesn't implement any of the ARS commands, take that to mean that ARS is unsupported, and continue to initialize regions without bad block lists. We cannot make the assumption that ARS commands will be unconditionally supported on all NVDIMMs. 
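The shape of the fix is the usual one for optional firmware commands: -ENOTTY coming back from the command path means "not implemented", which should turn the feature off rather than abort region initialization, so the regions still come up, just without a bad-block list. A stripped-down sketch of that pattern (the helper name is hypothetical, not the driver's API):

    static int probe_optional_cap(struct device *dev)
    {
            int rc = issue_optional_cmd(dev);     /* hypothetical command call */

            if (rc == -ENOTTY) {                  /* command not implemented   */
                    dev_dbg(dev, "capability absent, continuing without it\n");
                    return 0;                     /* success, feature disabled */
            }
            return rc;                            /* 0 or a real failure       */
    }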
Reported-by: Haozhong Zhang Signed-off-by: Vishal Verma Acked-by: Xiao Guangrong Tested-by: Haozhong Zhang Signed-off-by: Dan Williams diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c index fb53db1..35947ac 100644 --- a/drivers/acpi/nfit.c +++ b/drivers/acpi/nfit.c @@ -1590,14 +1590,21 @@ static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc, start = ndr_desc->res->start; len = ndr_desc->res->end - ndr_desc->res->start + 1; + /* + * If ARS is unimplemented, unsupported, or if the 'Persistent Memory + * Scrub' flag in extended status is not set, skip this but continue + * initialization + */ rc = ars_get_cap(nd_desc, ars_cap, start, len); + if (rc == -ENOTTY) { + dev_dbg(acpi_desc->dev, + "Address Range Scrub is not implemented, won't create an error list\n"); + rc = 0; + goto out; + } if (rc) goto out; - /* - * If ARS is unsupported, or if the 'Persistent Memory Scrub' flag in - * extended status is not set, skip this but continue initialization - */ if ((ars_cap->status & 0xffff) || !(ars_cap->status >> 16 & ND_ARS_PERSISTENT)) { dev_warn(acpi_desc->dev, -- cgit v0.10.2 From c57753d4541d5104284abbdceb841e690394e55f Mon Sep 17 00:00:00 2001 From: Jorge Ramirez-Ortiz Date: Tue, 1 Mar 2016 16:04:00 -0500 Subject: mtd: nand: tests: fix regression introduced in mtd_nandectest Offending Commit: 6e94119 "mtd: nand: return consistent error codes in ecc.correct() implementations" The new error code was not being handled properly in double bit error detection. Signed-off-by: Jorge Ramirez-Ortiz Reviewed-by: Boris Brezillon Tested-by: Franklin S Cooper Jr Signed-off-by: Brian Norris diff --git a/drivers/mtd/tests/mtd_nandecctest.c b/drivers/mtd/tests/mtd_nandecctest.c index 7931615..88b6c81 100644 --- a/drivers/mtd/tests/mtd_nandecctest.c +++ b/drivers/mtd/tests/mtd_nandecctest.c @@ -187,7 +187,7 @@ static int double_bit_error_detect(void *error_data, void *error_ecc, __nand_calculate_ecc(error_data, size, calc_ecc); ret = __nand_correct_data(error_data, error_ecc, calc_ecc, size); - return (ret == -1) ? 0 : -EINVAL; + return (ret == -EBADMSG) ? 0 : -EINVAL; } static const struct nand_ecc_test nand_ecc_test[] = { -- cgit v0.10.2 From cfa52c0cfa4d727aa3e457bf29aeff296c528a08 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Thu, 3 Mar 2016 02:03:11 +0100 Subject: x86/mm/kmmio: Fix mmiotrace for hugepages Because Linux might use bigger pages than the 4K pages to handle those mmio ioremaps, the kmmio code shouldn't rely on the pade id as it currently does. Using the memory address instead of the page id lets us look up how big the page is and what its base address is, so that we won't get a page fault within the same page twice anymore. Tested-by: Pierre Moreau Signed-off-by: Karol Herbst Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Cc: linux-x86_64@vger.kernel.org Cc: nouveau@lists.freedesktop.org Cc: pq@iki.fi Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1456966991-6861-1-git-send-email-nouveau@karolherbst.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 637ab34..ddb2244 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -33,7 +33,7 @@ struct kmmio_fault_page { struct list_head list; struct kmmio_fault_page *release_next; - unsigned long page; /* location of the fault page */ + unsigned long addr; /* the requested address */ pteval_t old_presence; /* page presence prior to arming */ bool armed; @@ -70,9 +70,16 @@ unsigned int kmmio_count; static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; static LIST_HEAD(kmmio_probes); -static struct list_head *kmmio_page_list(unsigned long page) +static struct list_head *kmmio_page_list(unsigned long addr) { - return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + + if (!pte) + return NULL; + addr &= page_level_mask(l); + + return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)]; } /* Accessed per-cpu */ @@ -98,15 +105,19 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr) } /* You must be holding RCU read lock. */ -static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) +static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr) { struct list_head *head; struct kmmio_fault_page *f; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); - page &= PAGE_MASK; - head = kmmio_page_list(page); + if (!pte) + return NULL; + addr &= page_level_mask(l); + head = kmmio_page_list(addr); list_for_each_entry_rcu(f, head, list) { - if (f->page == page) + if (f->addr == addr) return f; } return NULL; @@ -137,10 +148,10 @@ static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old) static int clear_page_presence(struct kmmio_fault_page *f, bool clear) { unsigned int level; - pte_t *pte = lookup_address(f->page, &level); + pte_t *pte = lookup_address(f->addr, &level); if (!pte) { - pr_err("no pte for page 0x%08lx\n", f->page); + pr_err("no pte for addr 0x%08lx\n", f->addr); return -1; } @@ -156,7 +167,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) return -1; } - __flush_tlb_one(f->page); + __flush_tlb_one(f->addr); return 0; } @@ -176,12 +187,12 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f) int ret; WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n")); if (f->armed) { - pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n", - f->page, f->count, !!f->old_presence); + pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n", + f->addr, f->count, !!f->old_presence); } ret = clear_page_presence(f, true); - WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"), - f->page); + WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"), + f->addr); f->armed = true; return ret; } @@ -191,7 +202,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) { int ret = clear_page_presence(f, false); WARN_ONCE(ret < 0, - KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); + KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr); f->armed = false; } @@ -215,6 +226,12 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) struct kmmio_context *ctx; struct kmmio_fault_page *faultpage; int ret = 0; /* default to fault 
not handled */ + unsigned long page_base = addr; + unsigned int l; + pte_t *pte = lookup_address(addr, &l); + if (!pte) + return -EINVAL; + page_base &= page_level_mask(l); /* * Preemption is now disabled to prevent process switch during @@ -227,7 +244,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) preempt_disable(); rcu_read_lock(); - faultpage = get_kmmio_fault_page(addr); + faultpage = get_kmmio_fault_page(page_base); if (!faultpage) { /* * Either this page fault is not caused by kmmio, or @@ -239,7 +256,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { - if (addr == ctx->addr) { + if (page_base == ctx->addr) { /* * A second fault on the same page means some other * condition needs handling by do_page_fault(), the @@ -267,9 +284,9 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx->active++; ctx->fpage = faultpage; - ctx->probe = get_kmmio_probe(addr); + ctx->probe = get_kmmio_probe(page_base); ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); - ctx->addr = addr; + ctx->addr = page_base; if (ctx->probe && ctx->probe->pre_handler) ctx->probe->pre_handler(ctx->probe, regs, addr); @@ -354,12 +371,11 @@ out: } /* You must be holding kmmio_lock. */ -static int add_kmmio_fault_page(unsigned long page) +static int add_kmmio_fault_page(unsigned long addr) { struct kmmio_fault_page *f; - page &= PAGE_MASK; - f = get_kmmio_fault_page(page); + f = get_kmmio_fault_page(addr); if (f) { if (!f->count) arm_kmmio_fault_page(f); @@ -372,26 +388,25 @@ static int add_kmmio_fault_page(unsigned long page) return -1; f->count = 1; - f->page = page; + f->addr = addr; if (arm_kmmio_fault_page(f)) { kfree(f); return -1; } - list_add_rcu(&f->list, kmmio_page_list(f->page)); + list_add_rcu(&f->list, kmmio_page_list(f->addr)); return 0; } /* You must be holding kmmio_lock. 
*/ -static void release_kmmio_fault_page(unsigned long page, +static void release_kmmio_fault_page(unsigned long addr, struct kmmio_fault_page **release_list) { struct kmmio_fault_page *f; - page &= PAGE_MASK; - f = get_kmmio_fault_page(page); + f = get_kmmio_fault_page(addr); if (!f) return; @@ -420,18 +435,27 @@ int register_kmmio_probe(struct kmmio_probe *p) int ret = 0; unsigned long size = 0; const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); + unsigned int l; + pte_t *pte; spin_lock_irqsave(&kmmio_lock, flags); if (get_kmmio_probe(p->addr)) { ret = -EEXIST; goto out; } + + pte = lookup_address(p->addr, &l); + if (!pte) { + ret = -EINVAL; + goto out; + } + kmmio_count++; list_add_rcu(&p->list, &kmmio_probes); while (size < size_lim) { if (add_kmmio_fault_page(p->addr + size)) pr_err("Unable to set page fault.\n"); - size += PAGE_SIZE; + size += page_level_size(l); } out: spin_unlock_irqrestore(&kmmio_lock, flags); @@ -506,11 +530,17 @@ void unregister_kmmio_probe(struct kmmio_probe *p) const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); struct kmmio_fault_page *release_list = NULL; struct kmmio_delayed_release *drelease; + unsigned int l; + pte_t *pte; + + pte = lookup_address(p->addr, &l); + if (!pte) + return; spin_lock_irqsave(&kmmio_lock, flags); while (size < size_lim) { release_kmmio_fault_page(p->addr + size, &release_list); - size += PAGE_SIZE; + size += page_level_size(l); } list_del_rcu(&p->list); kmmio_count--; -- cgit v0.10.2 From ca031745fdfc38fed0b9da89bd07e332fce96520 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 5 Mar 2016 14:57:38 +0100 Subject: Revert "kbuild: Add option to turn incompatible pointer check into error" This reverts commit ef50c046338948df9f23fb5ef853efda0274c7b3. So adding -Werror=incompatible-pointer-types wasn't a bad idea, but it should really not be done in the scheduler tree: it exposes us to a number of pre-existing warnings (most of them harmless), now upgraded to build failures... This should be done via the kbuild tree. Cc: Daniel Wagner Cc: Thomas Gleixner Cc: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: linux-rt-users@vger.kernel.org Cc: Boqun Feng Cc: Marcelo Tosatti Cc: Steven Rostedt Cc: Paul Gortmaker Cc: Paolo Bonzini Cc: "Paul E. McKenney" Cc: linux-kernel@vger.kernel.org diff --git a/Makefile b/Makefile index db2bc43..af6e5f8 100644 --- a/Makefile +++ b/Makefile @@ -773,9 +773,6 @@ KBUILD_CFLAGS += $(call cc-option,-Werror=strict-prototypes) # Prohibit date/time macros, which would make the build non-deterministic KBUILD_CFLAGS += $(call cc-option,-Werror=date-time) -# enforce correct pointer usage -KBUILD_CFLAGS += $(call cc-option,-Werror=incompatible-pointer-types) - # use the deterministic mode of AR if available KBUILD_ARFLAGS := $(call ar-option,D) -- cgit v0.10.2 From 90e94b160c7f647ddffda707f5e3c0c66c170df8 Mon Sep 17 00:00:00 2001 From: Mario Kleiner Date: Tue, 1 Mar 2016 21:31:16 +0100 Subject: drm/amdgpu: Fix error handling in amdgpu_flip_work_func. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The patch e1d09dc0ccc6: "drm/amdgpu: Don't hang in amdgpu_flip_work_func on disabled crtc." 
from Feb 19, 2016, leads to the following static checker warning, as reported by Dan Carpenter in https://lists.freedesktop.org/archives/dri-devel/2016-February/101987.html drivers/gpu/drm/amd/amdgpu/amdgpu_display.c:127 amdgpu_flip_work_func() warn: should this be 'repcnt == -1' drivers/gpu/drm/amd/amdgpu/amdgpu_display.c:136 amdgpu_flip_work_func() error: double unlock 'spin_lock:&crtc->dev->event_lock' drivers/gpu/drm/amd/amdgpu/amdgpu_display.c:136 amdgpu_flip_work_func() error: double unlock 'irqsave:flags' This patch fixes both reported problems: Change post-decrement of repcnt to pre-decrement, so it can't underflow anymore, but still performs up to three repetitions - three is the maximum one could expect in practice. Move the spin_unlock_irqrestore to where it actually belongs. Reviewed-by: Michel Dänzer Reported-by: Dan Carpenter Signed-off-by: Mario Kleiner Cc: # 4.4+ Cc: Michel Dänzer Cc: Alex Deucher Signed-off-by: Alex Deucher diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c index 8297bc3..1846d65 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c @@ -96,7 +96,7 @@ static void amdgpu_flip_work_func(struct work_struct *__work) * In practice this won't execute very often unless on very fast * machines because the time window for this to happen is very small. */ - while (amdgpuCrtc->enabled && repcnt--) { + while (amdgpuCrtc->enabled && --repcnt) { /* GET_DISTANCE_TO_VBLANKSTART returns distance to real vblank * start in hpos, and to the "fudged earlier" vblank start in * vpos. @@ -112,13 +112,13 @@ static void amdgpu_flip_work_func(struct work_struct *__work) break; /* Sleep at least until estimated real start of hw vblank */ - spin_unlock_irqrestore(&crtc->dev->event_lock, flags); min_udelay = (-hpos + 1) * max(vblank->linedur_ns / 1000, 5); if (min_udelay > vblank->framedur_ns / 2000) { /* Don't wait ridiculously long - something is wrong */ repcnt = 0; break; } + spin_unlock_irqrestore(&crtc->dev->event_lock, flags); usleep_range(min_udelay, 2 * min_udelay); spin_lock_irqsave(&crtc->dev->event_lock, flags); }; -- cgit v0.10.2 From 1e1490a38504419e349caa1b7d55d5c141a9bccb Mon Sep 17 00:00:00 2001 From: Mario Kleiner Date: Tue, 1 Mar 2016 21:31:17 +0100 Subject: drm/radeon: Fix error handling in radeon_flip_work_func. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a port of the patch "drm/amdgpu: Fix error handling in amdgpu_flip_work_func." to fix the following problem for radeon as well which was reported against amdgpu: The patch e1d09dc0ccc6: "drm/amdgpu: Don't hang in amdgpu_flip_work_func on disabled crtc." from Feb 19, 2016, leads to the following static checker warning, as reported by Dan Carpenter in https://lists.freedesktop.org/archives/dri-devel/2016-February/101987.html drivers/gpu/drm/amd/amdgpu/amdgpu_display.c:127 amdgpu_flip_work_func() warn: should this be 'repcnt == -1' drivers/gpu/drm/amd/amdgpu/amdgpu_display.c:136 amdgpu_flip_work_func() error: double unlock 'spin_lock:&crtc->dev->event_lock' drivers/gpu/drm/amd/amdgpu/amdgpu_display.c:136 amdgpu_flip_work_func() error: double unlock 'irqsave:flags' This patch fixes both reported problems: Change post-decrement of repcnt to pre-decrement, so it can't underflow anymore, but still performs up to three repetitions - three is the maximum one could expect in practice. Move the spin_unlock_irqrestore to where it actually belongs. 
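A standalone illustration of the repcnt change in this and the preceding amdgpu patch: with a post-decrement the counter is decremented even on the final, failing test, so the loop exits with the counter at -1 (or wrapped to a huge value if it is unsigned) and a later zero check is unreliable, while the pre-decrement form exits at exactly 0. Plain userspace demo, not driver code:

/*
 * post-decrement: the test happens first, then the decrement, so the
 * counter overshoots past zero when the loop ends on the retry budget.
 * pre-decrement: the decrement happens before the test, so the loop ends
 * with the counter at exactly zero.
 */
#include <stdio.h>

int main(void)
{
        int post = 3, pre = 3;

        while (post--)          /* body runs 3 times */
                ;
        while (--pre)           /* body runs 2 times */
                ;

        printf("post-decrement leaves the counter at %d\n", post);     /* -1 */
        printf("pre-decrement leaves the counter at %d\n", pre);       /* 0 */
        return 0;
}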
Reviewed-by: Michel Dänzer Reported-by: Dan Carpenter Signed-off-by: Mario Kleiner Cc: # 4.4+ Cc: Michel Dänzer Cc: Alex Deucher Signed-off-by: Alex Deucher diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c index 2b9ba03..2d9196a 100644 --- a/drivers/gpu/drm/radeon/radeon_display.c +++ b/drivers/gpu/drm/radeon/radeon_display.c @@ -455,7 +455,7 @@ static void radeon_flip_work_func(struct work_struct *__work) * In practice this won't execute very often unless on very fast * machines because the time window for this to happen is very small. */ - while (radeon_crtc->enabled && repcnt--) { + while (radeon_crtc->enabled && --repcnt) { /* GET_DISTANCE_TO_VBLANKSTART returns distance to real vblank * start in hpos, and to the "fudged earlier" vblank start in * vpos. @@ -471,13 +471,13 @@ static void radeon_flip_work_func(struct work_struct *__work) break; /* Sleep at least until estimated real start of hw vblank */ - spin_unlock_irqrestore(&crtc->dev->event_lock, flags); min_udelay = (-hpos + 1) * max(vblank->linedur_ns / 1000, 5); if (min_udelay > vblank->framedur_ns / 2000) { /* Don't wait ridiculously long - something is wrong */ repcnt = 0; break; } + spin_unlock_irqrestore(&crtc->dev->event_lock, flags); usleep_range(min_udelay, 2 * min_udelay); spin_lock_irqsave(&crtc->dev->event_lock, flags); }; -- cgit v0.10.2 From e4f6daac20332448529b11f09388f1d55ef2084c Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Sun, 21 Feb 2016 10:53:03 +0100 Subject: ubi: Fix out of bounds write in volume update code ubi_start_leb_change() allocates too few bytes. ubi_more_leb_change_data() will write up to req->upd_bytes + ubi->min_io_size bytes. Cc: stable@vger.kernel.org Signed-off-by: Richard Weinberger Reviewed-by: Boris Brezillon diff --git a/drivers/mtd/ubi/upd.c b/drivers/mtd/ubi/upd.c index 2a1b6e0..0134ba3 100644 --- a/drivers/mtd/ubi/upd.c +++ b/drivers/mtd/ubi/upd.c @@ -193,7 +193,7 @@ int ubi_start_leb_change(struct ubi_device *ubi, struct ubi_volume *vol, vol->changing_leb = 1; vol->ch_lnum = req->lnum; - vol->upd_buf = vmalloc(req->bytes); + vol->upd_buf = vmalloc(ALIGN((int)req->bytes, ubi->min_io_size)); if (!vol->upd_buf) return -ENOMEM; -- cgit v0.10.2 From 322740efbb7ca26df845161e61cc41484b7e328e Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Mon, 25 Jan 2016 23:33:30 +0100 Subject: Revert "um: Fix get_signal() usage" Commit db2f24dc240856fb1d78005307f1523b7b3c121b was plain wrong. I did not realize the we are allowed to loop here. In fact we have to loop and must not return to userspace before all SIGSEGVs have been delivered. Other archs do this directly in their entry code, UML does it here. Reported-by: Al Viro Signed-off-by: Richard Weinberger diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c index fc8be0e..57acbd6 100644 --- a/arch/um/kernel/signal.c +++ b/arch/um/kernel/signal.c @@ -69,7 +69,7 @@ void do_signal(struct pt_regs *regs) struct ksignal ksig; int handled_sig = 0; - if (get_signal(&ksig)) { + while (get_signal(&ksig)) { handled_sig = 1; /* Whee! Actually deliver the signal. */ handle_signal(&ksig, regs); -- cgit v0.10.2 From 0834f9cc9f1c5a104e87000378b9cda290c4f406 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Mon, 25 Jan 2016 23:24:21 +0100 Subject: um: Export pm_power_off ...modules are using this symbol. Export it like all other archs to. 
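For the UBI volume-update fix further up, the buffer is now sized with vmalloc(ALIGN(req->bytes, ubi->min_io_size)), i.e. the requested size rounded up to the next multiple of the minimum I/O unit, so the extra bytes ubi_more_leb_change_data() may write always fit. A small userspace sketch of that rounding, with the kernel's ALIGN() macro re-declared locally (power-of-two alignment assumed) and a made-up 2 KiB min_io_size:

/*
 * ALIGN(x, a) rounds x up to the next multiple of a, where a is a power
 * of two; 2048 stands in for a typical NAND page size here.
 */
#include <stdio.h>

#define ALIGN(x, a)     (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned int min_io_size = 2048;
        unsigned int req[] = { 1, 2048, 2049, 6000 };
        unsigned int i;

        for (i = 0; i < sizeof(req) / sizeof(req[0]); i++)
                printf("requested %u bytes -> allocate %u bytes\n",
                       req[i], ALIGN(req[i], min_io_size));
        return 0;
}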
Signed-off-by: Richard Weinberger diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c index 9bdf67a..b60a9f8 100644 --- a/arch/um/kernel/reboot.c +++ b/arch/um/kernel/reboot.c @@ -12,6 +12,7 @@ #include void (*pm_power_off)(void); +EXPORT_SYMBOL(pm_power_off); static void kill_off_processes(void) { -- cgit v0.10.2 From ad32a1f3c36b046b2e7b0361b88c18e3af61419e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 23 Jan 2016 19:17:59 +0000 Subject: um: use %lx format specifiers for unsigned longs static analysis from cppcheck detected %x being used for unsigned longs: [arch/x86/um/os-Linux/task_size.c:112]: (warning) %x in format string (no. 1) requires 'unsigned int' but the argument type is 'unsigned long'. Use %lx instead of %x Signed-off-by: Colin Ian King Signed-off-by: Richard Weinberger diff --git a/arch/x86/um/os-Linux/task_size.c b/arch/x86/um/os-Linux/task_size.c index 8502ad3..5adb6a2 100644 --- a/arch/x86/um/os-Linux/task_size.c +++ b/arch/x86/um/os-Linux/task_size.c @@ -109,7 +109,7 @@ unsigned long os_get_top_address(void) exit(1); } - printf("0x%x\n", bottom << UM_KERN_PAGE_SHIFT); + printf("0x%lx\n", bottom << UM_KERN_PAGE_SHIFT); printf("Locating the top of the address space ... "); fflush(stdout); @@ -134,7 +134,7 @@ out: exit(1); } top <<= UM_KERN_PAGE_SHIFT; - printf("0x%x\n", top); + printf("0x%lx\n", top); return top; } -- cgit v0.10.2 From 7f54ab5ff52fb0b91569bc69c4a6bc5cac1b768d Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Sat, 5 Mar 2016 20:00:12 -0800 Subject: target: Drop incorrect ABORT_TASK put for completed commands This patch fixes a recent ABORT_TASK regression associated with commit febe562c, where a left-over target_put_sess_cmd() would still be called when __target_check_io_state() detected a command has already been completed, and explicit ABORT must be avoided. Note commit febe562c dropped the local kref_get_unless_zero() check in core_tmr_abort_task(), but did not drop this extra corresponding target_put_sess_cmd() in the failure path. So go ahead and drop this now bogus target_put_sess_cmd(), and avoid this potential use-after-free. Reported-by: Dan Lane Cc: Quinn Tran Cc: Himanshu Madhani Cc: Sagi Grimberg Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Andy Grover Cc: Mike Christie Cc: stable@vger.kernel.org # 3.14+ Signed-off-by: Nicholas Bellinger diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c index 82a663b..4f229e7 100644 --- a/drivers/target/target_core_tmr.c +++ b/drivers/target/target_core_tmr.c @@ -177,7 +177,6 @@ void core_tmr_abort_task( if (!__target_check_io_state(se_cmd, se_sess, 0)) { spin_unlock_irqrestore(&se_sess->sess_cmd_lock, flags); - target_put_sess_cmd(se_cmd); goto out; } list_del_init(&se_cmd->se_cmd_list); -- cgit v0.10.2 From 5d8eb84253333f8f63ec704276e3f0a8ec8f3189 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Mar 2016 12:24:10 +0100 Subject: cpu/hotplug: Remove redundant state check The check for the AP range in cpuhp_is_ap_state() is redundant after commit 8df3e07e7f21 "cpu/hotplug: Let upcoming cpu bring itself fully up" because all states above CPUHP_BRINGUP_CPU are invoked on the hotplugged cpu. Remove it. 
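The redundancy is easy to demonstrate: given the state ordering below (a simplified stand-in for enum cpuhp_state), every state in the removed [CPUHP_AP_OFFLINE, CPUHP_AP_ONLINE] range already satisfies the remaining "state > CPUHP_BRINGUP_CPU" test, so the old and new predicates agree for every state. Userspace sketch, not kernel code:

/*
 * is_ap_state_old() mirrors the check before this patch, is_ap_state_new()
 * the check after it; the loop verifies they return the same answer for
 * every (simplified) hotplug state.
 */
#include <assert.h>
#include <stdio.h>

enum state { OFFLINE, BRINGUP_CPU, AP_OFFLINE, AP_SCHED_STARTING, AP_ONLINE, ONLINE };

static int is_ap_state_old(enum state s)
{
        if (s >= AP_OFFLINE && s <= AP_ONLINE)
                return 1;
        return s > BRINGUP_CPU;
}

static int is_ap_state_new(enum state s)
{
        return s > BRINGUP_CPU;
}

int main(void)
{
        int s;

        for (s = OFFLINE; s <= ONLINE; s++)
                assert(is_ap_state_old(s) == is_ap_state_new(s));
        printf("old and new checks agree for all states\n");
        return 0;
}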
Reported-by: Richard Cochran Signed-off-by: Thomas Gleixner diff --git a/kernel/cpu.c b/kernel/cpu.c index 93e9d89..373e831 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1231,8 +1231,6 @@ static int cpuhp_cb_check(enum cpuhp_state state) static bool cpuhp_is_ap_state(enum cpuhp_state state) { - if (state >= CPUHP_AP_OFFLINE && state <= CPUHP_AP_ONLINE) - return true; return state > CPUHP_BRINGUP_CPU; } -- cgit v0.10.2 From 82ff6cc26e98f9bba8e2a10f727e335fa241cc47 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 7 Mar 2016 08:22:22 +1100 Subject: xfs: separate log head record discovery from verification The code that locates the log record at the head of the log is buried in the log head verification function. This is fine when torn write verification occurs unconditionally, but this behavior is problematic for filesystems that might be moved across systems with different architectures. In preparation for separating examination of the log head for unmount records from torn write detection, lift the record location logic out of the log verification function and into the caller. This patch does not change behavior. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index da37beb..c2d04ff 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1109,27 +1109,10 @@ xlog_verify_head( bool tmp_wrapped; /* - * Search backwards through the log looking for the log record header - * block. This wraps all the way back around to the head so something is - * seriously wrong if we can't find it. - */ - found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, rhead_blk, - rhead, wrapped); - if (found < 0) - return found; - if (!found) { - xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); - return -EIO; - } - - *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn)); - - /* - * Now that we have a tail block, check the head of the log for torn - * writes. Search again until we hit the tail or the maximum number of - * log record I/Os that could have been in flight at one time. Use a - * temporary buffer so we don't trash the rhead/bp pointer from the - * call above. + * Check the head of the log for torn writes. Search backwards from the + * head until we hit the tail or the maximum number of log record I/Os + * that could have been in flight at one time. Use a temporary buffer so + * we don't trash the rhead/bp pointers from the caller. */ tmp_bp = xlog_get_bp(log, 1); if (!tmp_bp) @@ -1254,6 +1237,7 @@ xlog_find_tail( */ if ((error = xlog_find_head(log, head_blk))) return error; + ASSERT(*head_blk < INT_MAX); bp = xlog_get_bp(log, 1); if (!bp) @@ -1271,12 +1255,26 @@ xlog_find_tail( } /* + * Search backwards through the log looking for the log record header + * block. This wraps all the way back around to the head so something is + * seriously wrong if we can't find it. + */ + error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, + &rhead_blk, &rhead, &wrapped); + if (error < 0) + return error; + if (!error) { + xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); + return -EIO; + } + *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); + + /* * Trim the head block back to skip over torn records. We can have * multiple log I/Os in flight at any time, so we assume CRC failures * back through the previous several records are torn writes and skip * them. 
*/ - ASSERT(*head_blk < INT_MAX); error = xlog_verify_head(log, head_blk, tail_blk, bp, &rhead_blk, &rhead, &wrapped); if (error) -- cgit v0.10.2 From 65b99a08b350876e8835fc0e7173598165f64dee Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 7 Mar 2016 08:22:22 +1100 Subject: xfs: refactor unmount record detection into helper Once the mount sequence has identified the head and tail blocks of the physical log, the record at the head of the log is located and examined for an unmount record to determine if the log is clean. This currently occurs after torn write verification of the head region of the log. This must ultimately be separated from torn write verification and may need to be called again if the log head is walked back due to a torn write (to determine whether the new head record is an unmount record). Separate this logic into a new helper function. This patch does not change behavior. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index c2d04ff..1aae756 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1199,6 +1199,84 @@ xlog_verify_head( } /* + * Check whether the head of the log points to an unmount record. In other + * words, determine whether the log is clean. If so, update the in-core state + * appropriately. + */ +static int +xlog_check_unmount_rec( + struct xlog *log, + xfs_daddr_t *head_blk, + xfs_daddr_t *tail_blk, + struct xlog_rec_header *rhead, + xfs_daddr_t rhead_blk, + struct xfs_buf *bp, + bool *clean) +{ + struct xlog_op_header *op_head; + xfs_daddr_t umount_data_blk; + xfs_daddr_t after_umount_blk; + int hblks; + int error; + char *offset; + + *clean = false; + + /* + * Look for unmount record. If we find it, then we know there was a + * clean unmount. Since 'i' could be the last block in the physical + * log, we convert to a log block before comparing to the head_blk. + * + * Save the current tail lsn to use to pass to xlog_clear_stale_blocks() + * below. We won't want to clear the unmount record if there is one, so + * we pass the lsn of the unmount record rather than the block after it. + */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + int h_size = be32_to_cpu(rhead->h_size); + int h_version = be32_to_cpu(rhead->h_version); + + if ((h_version & XLOG_VERSION_2) && + (h_size > XLOG_HEADER_CYCLE_SIZE)) { + hblks = h_size / XLOG_HEADER_CYCLE_SIZE; + if (h_size % XLOG_HEADER_CYCLE_SIZE) + hblks++; + } else { + hblks = 1; + } + } else { + hblks = 1; + } + after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)); + after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize); + if (*head_blk == after_umount_blk && + be32_to_cpu(rhead->h_num_logops) == 1) { + umount_data_blk = rhead_blk + hblks; + umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize); + error = xlog_bread(log, umount_data_blk, 1, bp, &offset); + if (error) + return error; + + op_head = (struct xlog_op_header *)offset; + if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { + /* + * Set tail and last sync so that newly written log + * records will point recovery to after the current + * unmount record. + */ + xlog_assign_atomic_lsn(&log->l_tail_lsn, + log->l_curr_cycle, after_umount_blk); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, + log->l_curr_cycle, after_umount_blk); + *tail_blk = after_umount_blk; + + *clean = true; + } + } + + return 0; +} + +/* * Find the sync block number or the tail of the log. 
* * This will be the block number of the last record to have its @@ -1221,16 +1299,13 @@ xlog_find_tail( xfs_daddr_t *tail_blk) { xlog_rec_header_t *rhead; - xlog_op_header_t *op_head; char *offset = NULL; xfs_buf_t *bp; int error; - xfs_daddr_t umount_data_blk; - xfs_daddr_t after_umount_blk; xfs_daddr_t rhead_blk; xfs_lsn_t tail_lsn; - int hblks; bool wrapped = false; + bool clean = false; /* * Find previous log record @@ -1301,66 +1376,24 @@ xlog_find_tail( BBTOB(log->l_curr_block)); xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, BBTOB(log->l_curr_block)); + tail_lsn = atomic64_read(&log->l_tail_lsn); /* - * Look for unmount record. If we find it, then we know there - * was a clean unmount. Since 'i' could be the last block in - * the physical log, we convert to a log block before comparing - * to the head_blk. - * - * Save the current tail lsn to use to pass to - * xlog_clear_stale_blocks() below. We won't want to clear the - * unmount record if there is one, so we pass the lsn of the - * unmount record rather than the block after it. + * Look for an unmount record at the head of the log. This sets the log + * state to determine whether recovery is necessary. */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - int h_size = be32_to_cpu(rhead->h_size); - int h_version = be32_to_cpu(rhead->h_version); - - if ((h_version & XLOG_VERSION_2) && - (h_size > XLOG_HEADER_CYCLE_SIZE)) { - hblks = h_size / XLOG_HEADER_CYCLE_SIZE; - if (h_size % XLOG_HEADER_CYCLE_SIZE) - hblks++; - } else { - hblks = 1; - } - } else { - hblks = 1; - } - after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)); - after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize); - tail_lsn = atomic64_read(&log->l_tail_lsn); - if (*head_blk == after_umount_blk && - be32_to_cpu(rhead->h_num_logops) == 1) { - umount_data_blk = rhead_blk + hblks; - umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize); - error = xlog_bread(log, umount_data_blk, 1, bp, &offset); - if (error) - goto done; - - op_head = (xlog_op_header_t *)offset; - if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { - /* - * Set tail and last sync so that newly written - * log records will point recovery to after the - * current unmount record. - */ - xlog_assign_atomic_lsn(&log->l_tail_lsn, - log->l_curr_cycle, after_umount_blk); - xlog_assign_atomic_lsn(&log->l_last_sync_lsn, - log->l_curr_cycle, after_umount_blk); - *tail_blk = after_umount_blk; + error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead, + rhead_blk, bp, &clean); + if (error) + goto done; - /* - * Note that the unmount was clean. If the unmount - * was not clean, we need to know this to rebuild the - * superblock counters from the perag headers if we - * have a filesystem using non-persistent counters. - */ - log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; - } - } + /* + * Note that the unmount was clean. If the unmount was not clean, we + * need to know this to rebuild the superblock counters from the perag + * headers if we have a filesystem using non-persistent counters. + */ + if (clean) + log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; /* * Make sure that there are no blocks in front of the head -- cgit v0.10.2 From 717bc0ebca0bce9cb3edfc31b49b384a1d55db1c Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 7 Mar 2016 08:22:22 +1100 Subject: xfs: refactor in-core log state update to helper Once the record at the head of the log is identified and verified, the in-core log state is updated based on the record. 
This includes information such as the current head block and cycle, the start block of the last record written to the log, the tail lsn, etc. Once torn write detection is conditional, this logic will need to be reused. Factor the code to update the in-core log data structures into a new helper function. This patch does not change behavior. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1aae756..9ac8aa8 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1276,6 +1276,37 @@ xlog_check_unmount_rec( return 0; } +static void +xlog_set_state( + struct xlog *log, + xfs_daddr_t head_blk, + struct xlog_rec_header *rhead, + xfs_daddr_t rhead_blk, + bool bump_cycle) +{ + /* + * Reset log values according to the state of the log when we + * crashed. In the case where head_blk == 0, we bump curr_cycle + * one because the next write starts a new cycle rather than + * continuing the cycle of the last good log record. At this + * point we have guaranteed that all partial log records have been + * accounted for. Therefore, we know that the last good log record + * written was complete and ended exactly on the end boundary + * of the physical log. + */ + log->l_prev_block = rhead_blk; + log->l_curr_block = (int)head_blk; + log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); + if (bump_cycle) + log->l_curr_cycle++; + atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); + atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); + xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, + BBTOB(log->l_curr_block)); + xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, + BBTOB(log->l_curr_block)); +} + /* * Find the sync block number or the tail of the log. * @@ -1356,26 +1387,9 @@ xlog_find_tail( goto done; /* - * Reset log values according to the state of the log when we - * crashed. In the case where head_blk == 0, we bump curr_cycle - * one because the next write starts a new cycle rather than - * continuing the cycle of the last good log record. At this - * point we have guaranteed that all partial log records have been - * accounted for. Therefore, we know that the last good log record - * written was complete and ended exactly on the end boundary - * of the physical log. + * Set the log state based on the current head record. */ - log->l_prev_block = rhead_blk; - log->l_curr_block = (int)*head_blk; - log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); - if (wrapped) - log->l_curr_cycle++; - atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); - atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); - xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, - BBTOB(log->l_curr_block)); - xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, - BBTOB(log->l_curr_block)); + xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped); tail_lsn = atomic64_read(&log->l_tail_lsn); /* -- cgit v0.10.2 From 7f6aff3a29b08fc4234c8136eb1ac31b4897522c Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 7 Mar 2016 08:22:22 +1100 Subject: xfs: only run torn log write detection on dirty logs XFS uses CRC verification over a sub-range of the head of the log to detect and handle torn writes. This torn log write detection currently runs unconditionally at mount time, regardless of whether the log is dirty or clean. 
This is problematic in cases where a filesystem might end up being moved across different, incompatible (i.e., opposite byte-endianness) architectures. The problem lies in the fact that log data is not necessarily written in an architecture independent format. For example, certain bits of data are written in native endian format. Further, the size of certain log data structures differs (i.e., struct xlog_rec_header) depending on the word size of the cpu. This leads to false positive crc verification errors and ultimately failed mounts when a cleanly unmounted filesystem is mounted on a system with an incompatible architecture from data that was written near the head of the log. Update the log head/tail discovery code to run torn write detection only when the log is not clean. This means something other than an unmount record resides at the head of the log and log recovery is imminent. It is a requirement to run log recovery on the same type of host that had written the content of the dirty log and therefore CRC failures are legitimate corruptions in that scenario. Reported-by: Jan Beulich Tested-by: Jan Beulich Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9ac8aa8..e7aa82f 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1376,17 +1376,6 @@ xlog_find_tail( *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); /* - * Trim the head block back to skip over torn records. We can have - * multiple log I/Os in flight at any time, so we assume CRC failures - * back through the previous several records are torn writes and skip - * them. - */ - error = xlog_verify_head(log, head_blk, tail_blk, bp, &rhead_blk, - &rhead, &wrapped); - if (error) - goto done; - - /* * Set the log state based on the current head record. */ xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped); @@ -1402,6 +1391,37 @@ xlog_find_tail( goto done; /* + * Verify the log head if the log is not clean (e.g., we have anything + * but an unmount record at the head). This uses CRC verification to + * detect and trim torn writes. If discovered, CRC failures are + * considered torn writes and the log head is trimmed accordingly. + * + * Note that we can only run CRC verification when the log is dirty + * because there's no guarantee that the log data behind an unmount + * record is compatible with the current architecture. + */ + if (!clean) { + xfs_daddr_t orig_head = *head_blk; + + error = xlog_verify_head(log, head_blk, tail_blk, bp, + &rhead_blk, &rhead, &wrapped); + if (error) + goto done; + + /* update in-core state again if the head changed */ + if (*head_blk != orig_head) { + xlog_set_state(log, *head_blk, rhead, rhead_blk, + wrapped); + tail_lsn = atomic64_read(&log->l_tail_lsn); + error = xlog_check_unmount_rec(log, head_blk, tail_blk, + rhead, rhead_blk, bp, + &clean); + if (error) + goto done; + } + } + + /* * Note that the unmount was clean. If the unmount was not clean, we * need to know this to rebuild the superblock counters from the perag * headers if we have a filesystem using non-persistent counters. 
-- cgit v0.10.2 From f6cede5b49e822ebc41a099fe41ab4989f64e2cb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 6 Mar 2016 14:48:03 -0800 Subject: Linux 4.5-rc7 diff --git a/Makefile b/Makefile index af6e5f8..2d519d2 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 5 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Blurry Fish Butt # *DOCUMENTATION* -- cgit v0.10.2 From 7bfc60822d1cd44069f4fdb16f81a66f716bf787 Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Thu, 3 Mar 2016 09:07:51 +0900 Subject: gianfar: Enable eTSEC-106 erratum w/a for MPC8548E Rev2 Enable workaround for MPC8548E erratum eTSEC 106, "Excess delays when transmitting TOE=1 large frames". (see commit 53fad77375ce "gianfar: Enable eTSEC-20 erratum w/a for P2020 Rev1") This erratum was fixed in Rev 3.1.x. Signed-off-by: Atsushi Nemoto Acked-by: Claudiu Manoil Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 2aa7b40..b9ecf19 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -1111,8 +1111,10 @@ static void __gfar_detect_errata_85xx(struct gfar_private *priv) if ((SVR_SOC_VER(svr) == SVR_8548) && (SVR_REV(svr) == 0x20)) priv->errata |= GFAR_ERRATA_12; + /* P2020/P1010 Rev 1; MPC8548 Rev 2 */ if (((SVR_SOC_VER(svr) == SVR_P2020) && (SVR_REV(svr) < 0x20)) || - ((SVR_SOC_VER(svr) == SVR_P2010) && (SVR_REV(svr) < 0x20))) + ((SVR_SOC_VER(svr) == SVR_P2010) && (SVR_REV(svr) < 0x20)) || + ((SVR_SOC_VER(svr) == SVR_8548) && (SVR_REV(svr) < 0x31))) priv->errata |= GFAR_ERRATA_76; /* aka eTSEC 20 */ } #endif -- cgit v0.10.2 From eb1f420214adf81fd98d9e8df6b60318ef883102 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 3 Mar 2016 13:27:56 +0000 Subject: asix: do not free array priv->mdio->irq Used to be allocated and required freeing, but now priv->mdio->irq is now a fixed sized array and should no longer be free'd. Issue detected using static analysis with CoverityScan Fixes: e7f4dc3536a400 ("mdio: Move allocation of interrupts into core") Signed-off-by: Colin Ian King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller diff --git a/drivers/net/usb/ax88172a.c b/drivers/net/usb/ax88172a.c index 224e7d8..cf77f2d 100644 --- a/drivers/net/usb/ax88172a.c +++ b/drivers/net/usb/ax88172a.c @@ -134,7 +134,6 @@ static void ax88172a_remove_mdio(struct usbnet *dev) netdev_info(dev->net, "deregistering mdio bus %s\n", priv->mdio->id); mdiobus_unregister(priv->mdio); - kfree(priv->mdio->irq); mdiobus_free(priv->mdio); } -- cgit v0.10.2 From 44ef548f4f8bbc739070583f9844ff0cd8fcc6b4 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 3 Mar 2016 14:34:14 +0100 Subject: net: sched: fix act_ipt for LOG target Before calling the destroy() or target() callbacks, the family parameter field has to be initialized. Otherwise at least the LOG target will refuse to work and upon removal oops the kernel. Cc: Jamal Hadi Salim Signed-off-by: Phil Sutter Acked-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index d058696..6b70399 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -62,6 +62,7 @@ static void ipt_destroy_target(struct xt_entry_target *t) struct xt_tgdtor_param par = { .target = t->u.kernel.target, .targinfo = t->data, + .family = NFPROTO_IPV4, }; if (par.target->destroy != NULL) par.target->destroy(&par); @@ -195,6 +196,7 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, par.hooknum = ipt->tcfi_hook; par.target = ipt->tcfi_t->u.kernel.target; par.targinfo = ipt->tcfi_t->data; + par.family = NFPROTO_IPV4; ret = par.target->target(skb, &par); switch (ret) { -- cgit v0.10.2 From 57a0f3675489181c31636d204fa5c5daf5fa3c83 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 3 Mar 2016 13:43:34 +0000 Subject: net/ethoc: do not free array priv->mdio->irq priv->mdio->irq used to be allocated and required freeing, but it is now a fixed sized array and should no longer be free'd. Issue detected using static analysis with CoverityScan Fixes: e7f4dc3536a400 ("mdio: Move allocation of interrupts into core") Signed-off-by: Colin Ian King Reviewed-by: Tobias Klauser Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c index 62fa136..41b0106 100644 --- a/drivers/net/ethernet/ethoc.c +++ b/drivers/net/ethernet/ethoc.c @@ -1265,7 +1265,6 @@ static int ethoc_remove(struct platform_device *pdev) if (priv->mdio) { mdiobus_unregister(priv->mdio); - kfree(priv->mdio->irq); mdiobus_free(priv->mdio); } if (priv->clk) -- cgit v0.10.2 From 11f7f79b40ebfbac6a1bcdbc61d5d13d365e744b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 3 Mar 2016 13:47:18 +0000 Subject: net: eth: altera: do not free array priv->mdio->irq priv->mdio->irq used to be allocated and required freeing, but it is now a fixed sized array and should no longer be free'd. Issue detected using static analysis with CoverityScan Fixes: e7f4dc3536a400 ("mdio: Move allocation of interrupts into core") Signed-off-by: Colin Ian King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/altera/altera_tse_main.c b/drivers/net/ethernet/altera/altera_tse_main.c index 1747285..f749e4d 100644 --- a/drivers/net/ethernet/altera/altera_tse_main.c +++ b/drivers/net/ethernet/altera/altera_tse_main.c @@ -193,7 +193,6 @@ static void altera_tse_mdio_destroy(struct net_device *dev) priv->mdio->id); mdiobus_unregister(priv->mdio); - kfree(priv->mdio->irq); mdiobus_free(priv->mdio); priv->mdio = NULL; } -- cgit v0.10.2 From 4de13d7ed6ffdcbb34317acaa9236f121176f5f8 Mon Sep 17 00:00:00 2001 From: Parthasarathy Bhuvaragan Date: Thu, 3 Mar 2016 17:54:54 +0100 Subject: tipc: fix nullptr crash during subscription cancel commit 4d5cfcba2f6e ('tipc: fix connection abort during subscription cancel'), removes the check for a valid subscription before calling tipc_nametbl_subscribe(). This will lead to a nullptr exception when we process a subscription cancel request. For a cancel request, a null subscription is passed to tipc_nametbl_subscribe() resulting in exception. In this commit, we call tipc_nametbl_subscribe() only for a valid subscription. Fixes: 4d5cfcba2f6e ('tipc: fix connection abort during subscription cancel') Reported-by: Anders Widell Signed-off-by: Parthasarathy Bhuvaragan Acked-by: Jon Maloy Signed-off-by: David S. 
Miller diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 69ee2ee..f9ff73a 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -296,7 +296,8 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid, if (tipc_subscrp_create(net, (struct tipc_subscr *)buf, subscrb, &sub)) return tipc_conn_terminate(tn->topsrv, subscrb->conid); - tipc_nametbl_subscribe(sub); + if (sub) + tipc_nametbl_subscribe(sub); } /* Handle one request to establish a new subscriber */ -- cgit v0.10.2 From 2e18f5a1bc18e8af7031b3b26efde25307014837 Mon Sep 17 00:00:00 2001 From: Lokesh Vutla Date: Mon, 7 Mar 2016 01:41:21 -0700 Subject: ARM: OMAP2+: hwmod: Introduce ti,no-idle dt property Introduce a dt property, ti,no-idle, that prevents an IP to idle at any point. This is to handle Errata i877, which tells that GMAC clocks cannot be disabled. Acked-by: Roger Quadros Tested-by: Mugunthan V N Signed-off-by: Lokesh Vutla Signed-off-by: Sekhar Nori Signed-off-by: Dave Gerlach Acked-by: Rob Herring Cc: Signed-off-by: Paul Walmsley diff --git a/Documentation/devicetree/bindings/arm/omap/omap.txt b/Documentation/devicetree/bindings/arm/omap/omap.txt index a2bd593..66422d6 100644 --- a/Documentation/devicetree/bindings/arm/omap/omap.txt +++ b/Documentation/devicetree/bindings/arm/omap/omap.txt @@ -23,6 +23,7 @@ Optional properties: during suspend. - ti,no-reset-on-init: When present, the module should not be reset at init - ti,no-idle-on-init: When present, the module should not be idled at init +- ti,no-idle: When present, the module is never allowed to idle. Example: diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c index e9f65fe..b6d62e4 100644 --- a/arch/arm/mach-omap2/omap_hwmod.c +++ b/arch/arm/mach-omap2/omap_hwmod.c @@ -2200,6 +2200,11 @@ static int _enable(struct omap_hwmod *oh) */ static int _idle(struct omap_hwmod *oh) { + if (oh->flags & HWMOD_NO_IDLE) { + oh->_int_flags |= _HWMOD_SKIP_ENABLE; + return 0; + } + pr_debug("omap_hwmod: %s: idling\n", oh->name); if (oh->_state != _HWMOD_STATE_ENABLED) { @@ -2504,6 +2509,8 @@ static int __init _init(struct omap_hwmod *oh, void *data) oh->flags |= HWMOD_INIT_NO_RESET; if (of_find_property(np, "ti,no-idle-on-init", NULL)) oh->flags |= HWMOD_INIT_NO_IDLE; + if (of_find_property(np, "ti,no-idle", NULL)) + oh->flags |= HWMOD_NO_IDLE; } oh->_state = _HWMOD_STATE_INITIALIZED; @@ -2630,7 +2637,7 @@ static void __init _setup_postsetup(struct omap_hwmod *oh) * XXX HWMOD_INIT_NO_IDLE does not belong in hwmod data - * it should be set by the core code as a runtime flag during startup */ - if ((oh->flags & HWMOD_INIT_NO_IDLE) && + if ((oh->flags & (HWMOD_INIT_NO_IDLE | HWMOD_NO_IDLE)) && (postsetup_state == _HWMOD_STATE_IDLE)) { oh->_int_flags |= _HWMOD_SKIP_ENABLE; postsetup_state = _HWMOD_STATE_ENABLED; diff --git a/arch/arm/mach-omap2/omap_hwmod.h b/arch/arm/mach-omap2/omap_hwmod.h index 76bce11..7c7a311 100644 --- a/arch/arm/mach-omap2/omap_hwmod.h +++ b/arch/arm/mach-omap2/omap_hwmod.h @@ -525,6 +525,8 @@ struct omap_hwmod_omap4_prcm { * or idled. * HWMOD_OPT_CLKS_NEEDED: The optional clocks are needed for the module to * operate and they need to be handled at the same time as the main_clk. + * HWMOD_NO_IDLE: Do not idle the hwmod at all. Useful to handle certain + * IPs like CPSW on DRA7, where clocks to this module cannot be disabled. 
*/ #define HWMOD_SWSUP_SIDLE (1 << 0) #define HWMOD_SWSUP_MSTANDBY (1 << 1) @@ -541,6 +543,7 @@ struct omap_hwmod_omap4_prcm { #define HWMOD_SWSUP_SIDLE_ACT (1 << 12) #define HWMOD_RECONFIG_IO_CHAIN (1 << 13) #define HWMOD_OPT_CLKS_NEEDED (1 << 14) +#define HWMOD_NO_IDLE (1 << 15) /* * omap_hwmod._int_flags definitions -- cgit v0.10.2 From 0f514e690740e54815441a87708c3326f8aa8709 Mon Sep 17 00:00:00 2001 From: Mugunthan V N Date: Mon, 7 Mar 2016 01:41:22 -0700 Subject: ARM: dts: dra7: do not gate cpsw clock due to errata i877 Errata id: i877 Description: ------------ The RGMII 1000 Mbps Transmit timing is based on the output clock (rgmiin_txc) being driven relative to the rising edge of an internal clock and the output control/data (rgmiin_txctl/txd) being driven relative to the falling edge of an internal clock source. If the internal clock source is allowed to be static low (i.e., disabled) for an extended period of time then when the clock is actually enabled the timing delta between the rising edge and falling edge can change over the lifetime of the device. This can result in the device switching characteristics degrading over time, and eventually failing to meet the Data Manual Delay Time/Skew specs. To maintain RGMII 1000 Mbps IO Timings, SW should minimize the duration that the Ethernet internal clock source is disabled. Note that the device reset state for the Ethernet clock is "disabled". Other RGMII modes (10 Mbps, 100Mbps) are not affected Workaround: ----------- If the SoC Ethernet interface(s) are used in RGMII mode at 1000 Mbps, SW should minimize the time the Ethernet internal clock source is disabled to a maximum of 200 hours in a device life cycle. This is done by enabling the clock as early as possible in IPL (QNX) or SPL/u-boot (Linux/Android) by setting the register CM_GMAC_CLKSTCTRL[1:0]CLKTRCTRL = 0x2:SW_WKUP. So, do not allow to gate the cpsw clocks using ti,no-idle property in cpsw node assuming 1000 Mbps is being used all the time. If someone does not need 1000 Mbps and wants to gate clocks to cpsw, this property needs to be deleted in their respective board files. Signed-off-by: Mugunthan V N Signed-off-by: Grygorii Strashko Signed-off-by: Lokesh Vutla Cc: Signed-off-by: Paul Walmsley diff --git a/arch/arm/boot/dts/dra7.dtsi b/arch/arm/boot/dts/dra7.dtsi index c4d9175..f82aa44 100644 --- a/arch/arm/boot/dts/dra7.dtsi +++ b/arch/arm/boot/dts/dra7.dtsi @@ -1500,6 +1500,16 @@ 0x48485200 0x2E00>; #address-cells = <1>; #size-cells = <1>; + + /* + * Do not allow gating of cpsw clock as workaround + * for errata i877. Keeping internal clock disabled + * causes the device switching characteristics + * to degrade over time and eventually fail to meet + * the data manual delay time/skew specs. + */ + ti,no-idle; + /* * rx_thresh_pend * rx_pend -- cgit v0.10.2 From 912b1c89c5043e163f8cfa568512a131976121b2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 7 Mar 2016 15:15:29 +0100 Subject: mlxsw: spectrum: Always decrement bridge's ref count Since we only support one VLAN filtering bridge we need to associate a reference count with it, so that when the last port netdev leaves it, we would know that a different bridge can be offloaded to hardware. When a LAG device is memeber in a bridge and port netdevs are leaving the LAG, we should always decrement the bridge's reference count, as it's incremented for any port in the LAG. 
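A minimal sketch of that reference-counting rule, with made-up names rather than the mlxsw structures: the bridge count goes up once for every port that joins (directly or through a LAG) and must come down once for every port that leaves, independent of the LAG's own reference count. Decrementing only when the LAG itself is released, as before this fix, leaks references:

/*
 * Two LAG member ports join the bridge and later leave; the count must
 * return to zero because each join was counted individually.
 */
#include <assert.h>
#include <stdio.h>

struct bridge {
        int ref_count;
};

static void port_join_bridge(struct bridge *br)
{
        br->ref_count++;
}

static void port_leave_bridge(struct bridge *br)
{
        assert(br->ref_count > 0);
        br->ref_count--;        /* always, not only for the last LAG member */
}

int main(void)
{
        struct bridge br = { 0 };

        port_join_bridge(&br);
        port_join_bridge(&br);
        port_leave_bridge(&br);
        port_leave_bridge(&br);

        printf("ref_count after all ports left: %d\n", br.ref_count);
        return 0;
}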
Fixes: 4dc236c31733 ("mlxsw: spectrum: Handle port leaving LAG while bridged") Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 09ce451..a94daa8 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -2358,9 +2358,7 @@ static int mlxsw_sp_port_lag_leave(struct mlxsw_sp_port *mlxsw_sp_port, if (mlxsw_sp_port->bridged) { mlxsw_sp_port_active_vlans_del(mlxsw_sp_port); mlxsw_sp_port_bridge_leave(mlxsw_sp_port, false); - - if (lag->ref_count == 1) - mlxsw_sp_master_bridge_dec(mlxsw_sp, NULL); + mlxsw_sp_master_bridge_dec(mlxsw_sp, NULL); } if (lag->ref_count == 1) { -- cgit v0.10.2 From 5091730d7795ccb21eb880699b5194730641c70b Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 7 Mar 2016 15:15:30 +0100 Subject: mlxsw: pci: Correctly determine if descriptor queue is full The descriptor queues for sending (SDQs) and receiving (RDQs) packets are managed by two counters - producer and consumer - which are both 16-bit in size. A queue is considered full when the difference between the two equals the queue's maximum number of descriptors. However, if the producer counter overflows, then it's possible for the full queue check to fail, as it doesn't take the overflow into account. In such a case, descriptors already passed to the device - but for which a completion has yet to be posted - will be overwritten, thereby causing undefined behavior. The above can be achieved under heavy load (~30 netperf instances). Fix that by casting the subtraction result to u16, preventing it from being treated as a signed integer. Fixes: eda6500a987a ("mlxsw: Add PCI bus implementation") Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index c071077..7992c55 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -215,7 +215,7 @@ mlxsw_pci_queue_elem_info_producer_get(struct mlxsw_pci_queue *q) { int index = q->producer_counter & (q->count - 1); - if ((q->producer_counter - q->consumer_counter) == q->count) + if ((u16) (q->producer_counter - q->consumer_counter) == q->count) return NULL; return mlxsw_pci_queue_elem_info_get(q, index); } -- cgit v0.10.2 From 1666984c8625b3db19a9abc298931d35ab7bc64b Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Mon, 7 Mar 2016 11:31:10 +0100 Subject: usbnet: cleanup after bind() in probe() In case bind() works, but a later error forces bailing in probe() in error cases work and a timer may be scheduled. They must be killed. This fixes an error case related to the double free reported in http://www.spinics.net/lists/netdev/msg367669.html and needs to go on top of Linus' fix to cdc-ncm. Signed-off-by: Oliver Neukum Signed-off-by: David S. Miller diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 0b0ba7e..1079812 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1769,6 +1769,13 @@ out3: if (info->unbind) info->unbind (dev, udev); out1: + /* subdrivers must undo all they did in bind() if they + * fail it, but we may fail later and a deferred kevent + * may trigger an error resubmitting itself and, worse, + * schedule a timer. So we kill it all just in case. 
+ */ + cancel_work_sync(&dev->kevent); + del_timer_sync(&dev->delay); free_netdev(net); out: return status; -- cgit v0.10.2 From 2f791908a70e95768596f5bb9e6de4f441d7bf13 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 24 Feb 2016 15:35:22 +0100 Subject: drm/i915: Fix bogus dig_port_map[] assignment for pre-HSW MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recent commit [0bdf5a05647a: drm/i915: Add reverse mapping between port and intel_encoder] introduced a reverse mapping to retrieve intel_dig_port object from the port number. The code assumed that the port vs intel_dig_port are 1:1 mapping. But in reality, this was a too naive assumption. As Martin reported about the missing HDMI audio on his SNB machine, pre-HSW chips may have multiple intel_dig_port objects corresponding to the same port. Since we assign the mapping statically at the init time and the multiple objects override the map, it may not match with the actually enabled output. This patch tries to address the regression above. The reverse mapping is provided basically only for the audio callbacks, so now we set / clear the mapping dynamically at enabling and disabling HDMI/DP audio, so that we can always track the latest and correct object corresponding to the given port. Fixes: 0bdf5a05647a ('drm/i915: Add reverse mapping between port and intel_encoder') Reported-and-tested-by: Martin Kepplinger Cc: drm-intel-fixes@lists.freedesktop.org Signed-off-by: Takashi Iwai Reviewed-by: Ville Syrjälä Tested-by: Martin Kepplinger Signed-off-by: Jani Nikula Link: http://patchwork.freedesktop.org/patch/msgid/1456324522-21591-1-git-send-email-tiwai@suse.de (cherry picked from commit 9dfbffcf4ac0707097af9e6c1372192b9d03a357) Signed-off-by: Jani Nikula diff --git a/drivers/gpu/drm/i915/intel_audio.c b/drivers/gpu/drm/i915/intel_audio.c index 31f6d21..30f9214 100644 --- a/drivers/gpu/drm/i915/intel_audio.c +++ b/drivers/gpu/drm/i915/intel_audio.c @@ -527,6 +527,8 @@ void intel_audio_codec_enable(struct intel_encoder *intel_encoder) mutex_lock(&dev_priv->av_mutex); intel_dig_port->audio_connector = connector; + /* referred in audio callbacks */ + dev_priv->dig_port_map[port] = intel_encoder; mutex_unlock(&dev_priv->av_mutex); if (acomp && acomp->audio_ops && acomp->audio_ops->pin_eld_notify) @@ -554,6 +556,7 @@ void intel_audio_codec_disable(struct intel_encoder *intel_encoder) mutex_lock(&dev_priv->av_mutex); intel_dig_port->audio_connector = NULL; + dev_priv->dig_port_map[port] = NULL; mutex_unlock(&dev_priv->av_mutex); if (acomp && acomp->audio_ops && acomp->audio_ops->pin_eld_notify) diff --git a/drivers/gpu/drm/i915/intel_ddi.c b/drivers/gpu/drm/i915/intel_ddi.c index 0f3df2c..084d558 100644 --- a/drivers/gpu/drm/i915/intel_ddi.c +++ b/drivers/gpu/drm/i915/intel_ddi.c @@ -3358,7 +3358,6 @@ void intel_ddi_init(struct drm_device *dev, enum port port) intel_encoder->get_config = intel_ddi_get_config; intel_dig_port->port = port; - dev_priv->dig_port_map[port] = intel_encoder; intel_dig_port->saved_port_bits = I915_READ(DDI_BUF_CTL(port)) & (DDI_BUF_PORT_REVERSAL | DDI_A_4_LANES); diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index 1d8de43..cdc2c15 100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -6045,7 +6045,6 @@ intel_dp_init(struct drm_device *dev, } intel_dig_port->port = port; - dev_priv->dig_port_map[port] = intel_encoder; intel_dig_port->dp.output_reg = output_reg; intel_encoder->type = INTEL_OUTPUT_DISPLAYPORT; 
diff --git a/drivers/gpu/drm/i915/intel_hdmi.c b/drivers/gpu/drm/i915/intel_hdmi.c index cb5d1b1..616108c 100644 --- a/drivers/gpu/drm/i915/intel_hdmi.c +++ b/drivers/gpu/drm/i915/intel_hdmi.c @@ -2154,7 +2154,6 @@ void intel_hdmi_init_connector(struct intel_digital_port *intel_dig_port, void intel_hdmi_init(struct drm_device *dev, i915_reg_t hdmi_reg, enum port port) { - struct drm_i915_private *dev_priv = dev->dev_private; struct intel_digital_port *intel_dig_port; struct intel_encoder *intel_encoder; struct intel_connector *intel_connector; @@ -2223,7 +2222,6 @@ void intel_hdmi_init(struct drm_device *dev, intel_encoder->cloneable |= 1 << INTEL_OUTPUT_HDMI; intel_dig_port->port = port; - dev_priv->dig_port_map[port] = intel_encoder; intel_dig_port->hdmi.hdmi_reg = hdmi_reg; intel_dig_port->dp.output_reg = INVALID_MMIO_REG; -- cgit v0.10.2 From 83bdaad4d919722744ef1b726a9913ec36c6a430 Mon Sep 17 00:00:00 2001 From: Hubert Chrzaniuk Date: Mon, 7 Mar 2016 15:30:45 +0100 Subject: EDAC, sb_edac: Fix logic when computing DIMM sizes on Xeon Phi Correct a typo introduced by d0cdf9003140 ("EDAC, sb_edac: Add Knights Landing (Xeon Phi gen 2) support") As a result under some configurations DIMMs were not correctly recognized. Problem affects only Xeon Phi architecture. Signed-off-by: Hubert Chrzaniuk Acked-by: Aristeu Rozanski Cc: Mauro Carvalho Chehab Cc: linux-edac Cc: lukasz.anaczkowski@intel.com Link: http://lkml.kernel.org/r/1457361045-26221-1-git-send-email-hubert.chrzaniuk@intel.com Signed-off-by: Borislav Petkov diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index e438ee5..f5c6b97 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -1574,7 +1574,7 @@ static int knl_get_dimm_capacity(struct sbridge_pvt *pvt, u64 *mc_sizes) for (cha = 0; cha < KNL_MAX_CHAS; cha++) { if (knl_get_mc_route(target, mc_route_reg[cha]) == channel - && participants[channel]) { + && !participants[channel]) { participant_count++; participants[channel] = 1; break; -- cgit v0.10.2 From 48906f62c96cc2cd35753e59310cb70eb08cc6a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Thu, 3 Mar 2016 22:20:53 +0100 Subject: cdc_ncm: toggle altsetting to force reset before setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some devices will silently fail setup unless they are reset first. This is necessary even if the data interface is already in altsetting 0, which it will be when the device is probed for the first time. Briefly toggling the altsetting forces a function reset regardless of the initial state. This fixes a setup problem observed on a number of Huawei devices, appearing to operate in NTB-32 mode even if we explicitly set them to NTB-16 mode. Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index dc0212c..be92796 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -837,7 +837,11 @@ int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_ iface_no = ctx->data->cur_altsetting->desc.bInterfaceNumber; - /* reset data interface */ + /* Reset data interface. Some devices will not reset properly + * unless they are configured first. 
Toggle the altsetting to + * force a reset + */ + usb_set_interface(dev->udev, iface_no, data_altsetting); temp = usb_set_interface(dev->udev, iface_no, 0); if (temp) { dev_dbg(&intf->dev, "set interface failed\n"); -- cgit v0.10.2 From 2c42bf4b43170ee83354bb68a12f8fc84c2e27a9 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Thu, 3 Mar 2016 15:22:36 -0600 Subject: ibmveth: check return of skb_linearize in ibmveth_start_xmit If skb_linearize fails, the driver should drop the packet instead of trying to copy it into the bounce buffer. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c index 335417b..ebe6071 100644 --- a/drivers/net/ethernet/ibm/ibmveth.c +++ b/drivers/net/ethernet/ibm/ibmveth.c @@ -1166,7 +1166,10 @@ map_failed: if (!firmware_has_feature(FW_FEATURE_CMO)) netdev_err(netdev, "tx: unable to map xmit buffer\n"); adapter->tx_map_failed++; - skb_linearize(skb); + if (skb_linearize(skb)) { + netdev->stats.tx_dropped++; + goto out; + } force_bounce = 1; goto retry_bounce; } -- cgit v0.10.2 From 979d804e5bc926f8f3133715736f26456c910a97 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 4 Mar 2016 10:04:52 +0900 Subject: net: ethernet: Add missing MFD_SYSCON dependency on HAS_IOMEM The MFD_SYSCON depends on HAS_IOMEM so when selecting it avoid unmet direct dependencies. Signed-off-by: Krzysztof Kozlowski Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/hisilicon/Kconfig b/drivers/net/ethernet/hisilicon/Kconfig index 74beb18..4ccc032 100644 --- a/drivers/net/ethernet/hisilicon/Kconfig +++ b/drivers/net/ethernet/hisilicon/Kconfig @@ -25,6 +25,7 @@ config HIX5HD2_GMAC config HIP04_ETH tristate "HISILICON P04 Ethernet support" + depends on HAS_IOMEM # For MFD_SYSCON select MARVELL_PHY select MFD_SYSCON select HNS_MDIO -- cgit v0.10.2 From cec05562fb1dcdf09ef7b46e1e3eade98e488893 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 4 Mar 2016 13:40:48 -0500 Subject: vmxnet3: avoid calling pskb_may_pull with interrupts disabled vmxnet3 has a function vmxnet3_parse_and_copy_hdr which, among other operations, uses pskb_may_pull to linearize the header portion of an skb. That operation eventually uses local_bh_disable/enable to ensure that it doesn't race with the drivers bottom half handler. Unfortunately, vmxnet3 preforms this parse_and_copy operation with a spinlock held and interrupts disabled. This causes us to run afoul of the WARN_ON_ONCE(irqs_disabled()) warning in local_bh_enable, resulting in this: WARNING: at kernel/softirq.c:159 local_bh_enable+0x59/0x90() (Not tainted) Hardware name: VMware Virtual Platform Modules linked in: ipv6 ppdev parport_pc parport microcode e1000 vmware_balloon vmxnet3 i2c_piix4 sg ext4 jbd2 mbcache sd_mod crc_t10dif sr_mod cdrom mptspi mptscsih mptbase scsi_transport_spi pata_acpi ata_generic ata_piix vmwgfx ttm drm_kms_helper drm i2c_core dm_mirror dm_region_hash dm_log dm_mod [last unloaded: mperf] Pid: 6229, comm: sshd Not tainted 2.6.32-616.el6.i686 #1 Call Trace: [] ? warn_slowpath_common+0x89/0xe0 [] ? local_bh_enable+0x59/0x90 [] ? warn_slowpath_null+0x1b/0x20 [] ? local_bh_enable+0x59/0x90 [] ? skb_copy_bits+0x126/0x210 [] ? ext4_ext_find_extent+0x24e/0x2d0 [ext4] [] ? __pskb_pull_tail+0x6e/0x2b0 [] ? vmxnet3_xmit_frame+0xba4/0xef0 [vmxnet3] [] ? selinux_ip_postroute+0x56/0x320 [] ? cfq_add_rq_rb+0x98/0x110 [] ? packet_rcv+0x48/0x350 [] ? dev_queue_xmit_nit+0xc9/0x140 ... 
Fix it by splitting vmxnet3_parse_and_copy_hdr into two functions: vmxnet3_parse_hdr, which sets up the internal/on stack ctx datastructure, and pulls the skb (both of which can be done without holding the spinlock with irqs disabled and vmxnet3_copy_header, which just copies the skb to the tx ring under the lock safely. tested and shown to correct the described problem. Applies cleanly to the head of the net tree Signed-off-by: Neil Horman CC: Shrikrishna Khare CC: "VMware, Inc." CC: "David S. Miller" Acked-by: Shrikrishna Khare Signed-off-by: David S. Miller diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 0cbf520..fc895d0 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -814,7 +814,7 @@ vmxnet3_tq_init_all(struct vmxnet3_adapter *adapter) /* - * parse and copy relevant protocol headers: + * parse relevant protocol headers: * For a tso pkt, relevant headers are L2/3/4 including options * For a pkt requesting csum offloading, they are L2/3 and may include L4 * if it's a TCP/UDP pkt @@ -827,15 +827,14 @@ vmxnet3_tq_init_all(struct vmxnet3_adapter *adapter) * Other effects: * 1. related *ctx fields are updated. * 2. ctx->copy_size is # of bytes copied - * 3. the portion copied is guaranteed to be in the linear part + * 3. the portion to be copied is guaranteed to be in the linear part * */ static int -vmxnet3_parse_and_copy_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq, - struct vmxnet3_tx_ctx *ctx, - struct vmxnet3_adapter *adapter) +vmxnet3_parse_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq, + struct vmxnet3_tx_ctx *ctx, + struct vmxnet3_adapter *adapter) { - struct Vmxnet3_TxDataDesc *tdd; u8 protocol = 0; if (ctx->mss) { /* TSO */ @@ -892,16 +891,34 @@ vmxnet3_parse_and_copy_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq, return 0; } + return 1; +err: + return -1; +} + +/* + * copy relevant protocol headers to the transmit ring: + * For a tso pkt, relevant headers are L2/3/4 including options + * For a pkt requesting csum offloading, they are L2/3 and may include L4 + * if it's a TCP/UDP pkt + * + * + * Note that this requires that vmxnet3_parse_hdr be called first to set the + * appropriate bits in ctx first + */ +static void +vmxnet3_copy_hdr(struct sk_buff *skb, struct vmxnet3_tx_queue *tq, + struct vmxnet3_tx_ctx *ctx, + struct vmxnet3_adapter *adapter) +{ + struct Vmxnet3_TxDataDesc *tdd; + tdd = tq->data_ring.base + tq->tx_ring.next2fill; memcpy(tdd->data, skb->data, ctx->copy_size); netdev_dbg(adapter->netdev, "copy %u bytes to dataRing[%u]\n", ctx->copy_size, tq->tx_ring.next2fill); - return 1; - -err: - return -1; } @@ -998,22 +1015,7 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq, } } - spin_lock_irqsave(&tq->tx_lock, flags); - - if (count > vmxnet3_cmd_ring_desc_avail(&tq->tx_ring)) { - tq->stats.tx_ring_full++; - netdev_dbg(adapter->netdev, - "tx queue stopped on %s, next2comp %u" - " next2fill %u\n", adapter->netdev->name, - tq->tx_ring.next2comp, tq->tx_ring.next2fill); - - vmxnet3_tq_stop(tq, adapter); - spin_unlock_irqrestore(&tq->tx_lock, flags); - return NETDEV_TX_BUSY; - } - - - ret = vmxnet3_parse_and_copy_hdr(skb, tq, &ctx, adapter); + ret = vmxnet3_parse_hdr(skb, tq, &ctx, adapter); if (ret >= 0) { BUG_ON(ret <= 0 && ctx.copy_size != 0); /* hdrs parsed, check against other limits */ @@ -1033,9 +1035,26 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq, } } else { tq->stats.drop_hdr_inspect_err++; - goto unlock_drop_pkt; + goto 
drop_pkt; } + spin_lock_irqsave(&tq->tx_lock, flags); + + if (count > vmxnet3_cmd_ring_desc_avail(&tq->tx_ring)) { + tq->stats.tx_ring_full++; + netdev_dbg(adapter->netdev, + "tx queue stopped on %s, next2comp %u" + " next2fill %u\n", adapter->netdev->name, + tq->tx_ring.next2comp, tq->tx_ring.next2fill); + + vmxnet3_tq_stop(tq, adapter); + spin_unlock_irqrestore(&tq->tx_lock, flags); + return NETDEV_TX_BUSY; + } + + + vmxnet3_copy_hdr(skb, tq, &ctx, adapter); + /* fill tx descs related to addr & len */ if (vmxnet3_map_pkt(skb, &ctx, tq, adapter->pdev, adapter)) goto unlock_drop_pkt; -- cgit v0.10.2 From a69bf3c5b49ef488970c74e26ba0ec12f08491c2 Mon Sep 17 00:00:00 2001 From: Douglas Miller Date: Fri, 4 Mar 2016 15:36:56 -0600 Subject: be2net: Don't leak iomapped memory on removal. The adapter->pcicfg resource is either mapped via pci_iomap() or derived from adapter->db. During be_remove() this resource was ignored and so could remain mapped after remove. Add a flag to track whether adapter->pcicfg was mapped or not, then use that flag in be_unmap_pci_bars() to unmap if required. Fixes: 25848c901 ("use PCI MMIO read instead of config read for errors") Signed-off-by: Douglas Miller Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h index cf83783..f975129 100644 --- a/drivers/net/ethernet/emulex/benet/be.h +++ b/drivers/net/ethernet/emulex/benet/be.h @@ -531,6 +531,7 @@ struct be_adapter { struct delayed_work be_err_detection_work; u8 err_flags; + bool pcicfg_mapped; /* pcicfg obtained via pci_iomap() */ u32 flags; u32 cmd_privileges; /* Ethtool knobs and info */ diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index db81e3d..d1cf127 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -4970,6 +4970,8 @@ static void be_unmap_pci_bars(struct be_adapter *adapter) pci_iounmap(adapter->pdev, adapter->csr); if (adapter->db) pci_iounmap(adapter->pdev, adapter->db); + if (adapter->pcicfg && adapter->pcicfg_mapped) + pci_iounmap(adapter->pdev, adapter->pcicfg); } static int db_bar(struct be_adapter *adapter) @@ -5021,8 +5023,10 @@ static int be_map_pci_bars(struct be_adapter *adapter) if (!addr) goto pci_map_err; adapter->pcicfg = addr; + adapter->pcicfg_mapped = true; } else { adapter->pcicfg = adapter->db + SRIOV_VF_PCICFG_OFFSET; + adapter->pcicfg_mapped = false; } } -- cgit v0.10.2 From 59dca1d8a6725a121dae6c452de0b2611d5865dc Mon Sep 17 00:00:00 2001 From: Bill Sommerfeld Date: Fri, 4 Mar 2016 14:47:21 -0800 Subject: udp6: fix UDP/IPv6 encap resubmit path IPv4 interprets a negative return value from a protocol handler as a request to redispatch to a new protocol. In contrast, IPv6 interprets a negative value as an error, and interprets a positive value as a request for redispatch. UDP for IPv6 was unaware of this difference. Change __udp6_lib_rcv() to return a positive value for redispatch. Note that the socket's encap_rcv hook still needs to return a negative value to request dispatch, and in the case of IPv6 packets, adjust IP6CB(skb)->nhoff to identify the byte containing the next protocol. Signed-off-by: Bill Sommerfeld Signed-off-by: David S. 
Miller diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 22e28a4..422dd01 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -962,11 +962,9 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, ret = udpv6_queue_rcv_skb(sk, skb); sock_put(sk); - /* a return value > 0 means to resubmit the input, but - * it wants the return to be -protocol, or 0 - */ + /* a return value > 0 means to resubmit the input */ if (ret > 0) - return -ret; + return ret; return 0; } -- cgit v0.10.2 From 54c6e2dd00c313d0add58e5befe62fe6f286d03b Mon Sep 17 00:00:00 2001 From: Krzysztof Hałasa Date: Tue, 1 Mar 2016 07:07:18 +0100 Subject: PCI: Allow a NULL "parent" pointer in pci_bus_assign_domain_nr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pci_create_root_bus() passes a "parent" pointer to pci_bus_assign_domain_nr(). When CONFIG_PCI_DOMAINS_GENERIC is defined, pci_bus_assign_domain_nr() dereferences that pointer. Many callers of pci_create_root_bus() supply a NULL "parent" pointer, which leads to a NULL pointer dereference error. 7c674700098c ("PCI: Move domain assignment from arm64 to generic code") moved the "parent" dereference from arm64 to generic code. Only arm64 used that code (because only arm64 defined CONFIG_PCI_DOMAINS_GENERIC), and it always supplied a valid "parent" pointer. Other arches supplied NULL "parent" pointers but didn't define CONFIG_PCI_DOMAINS_GENERIC, so they used a no-op version of pci_bus_assign_domain_nr(). 8c7d14746abc ("ARM/PCI: Move to generic PCI domains") defined CONFIG_PCI_DOMAINS_GENERIC on ARM, and many ARM platforms use pci_common_init(), which supplies a NULL "parent" pointer. These platforms (cns3xxx, dove, footbridge, iop13xx, etc.) crash with a NULL pointer dereference like this while probing PCI: Unable to handle kernel NULL pointer dereference at virtual address 000000a4 PC is at pci_bus_assign_domain_nr+0x10/0x84 LR is at pci_create_root_bus+0x48/0x2e4 Kernel panic - not syncing: Attempted to kill init! [bhelgaas: changelog, add "Reported:" and "Fixes:" tags] Reported: http://forum.doozan.com/read.php?2,17868,22070,quote=1 Fixes: 8c7d14746abc ("ARM/PCI: Move to generic PCI domains") Fixes: 7c674700098c ("PCI: Move domain assignment from arm64 to generic code") Signed-off-by: Krzysztof Hałasa Signed-off-by: Bjorn Helgaas Acked-by: Lorenzo Pieralisi CC: stable@vger.kernel.org # v4.0+ diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 602eb42..f89db3a 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4772,8 +4772,10 @@ int pci_get_new_domain_nr(void) void pci_bus_assign_domain_nr(struct pci_bus *bus, struct device *parent) { static int use_dt_domains = -1; - int domain = of_get_pci_domain_nr(parent->of_node); + int domain = -1; + if (parent) + domain = of_get_pci_domain_nr(parent->of_node); /* * Check DT domain and use_dt_domains values. * -- cgit v0.10.2 From 0772a99b818079e628a1da122ac7ee023faed83e Mon Sep 17 00:00:00 2001 From: Guo-Fu Tseng Date: Sat, 5 Mar 2016 08:11:55 +0800 Subject: jme: Do not enable NIC WoL functions on S0 Otherwise, on some hardware, the system might resume right after going to suspend. Reported-by: Diego Viola Signed-off-by: Guo-Fu Tseng Signed-off-by: David S. 
Miller diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c index 8adbe8f..74b9c9d 100644 --- a/drivers/net/ethernet/jme.c +++ b/drivers/net/ethernet/jme.c @@ -270,11 +270,17 @@ jme_reset_mac_processor(struct jme_adapter *jme) } static inline void -jme_clear_pm(struct jme_adapter *jme) +jme_clear_pm_enable_wol(struct jme_adapter *jme) { jwrite32(jme, JME_PMCS, PMCS_STMASK | jme->reg_pmcs); } +static inline void +jme_clear_pm_disable_wol(struct jme_adapter *jme) +{ + jwrite32(jme, JME_PMCS, PMCS_STMASK); +} + static int jme_reload_eeprom(struct jme_adapter *jme) { @@ -1853,7 +1859,7 @@ jme_open(struct net_device *netdev) struct jme_adapter *jme = netdev_priv(netdev); int rc; - jme_clear_pm(jme); + jme_clear_pm_disable_wol(jme); JME_NAPI_ENABLE(jme); tasklet_init(&jme->linkch_task, jme_link_change_tasklet, @@ -1929,7 +1935,7 @@ jme_powersave_phy(struct jme_adapter *jme) jme_set_100m_half(jme); if (jme->reg_pmcs & (PMCS_LFEN | PMCS_LREN)) jme_wait_link(jme); - jme_clear_pm(jme); + jme_clear_pm_enable_wol(jme); } else { jme_phy_off(jme); } @@ -2646,7 +2652,6 @@ jme_set_wol(struct net_device *netdev, if (wol->wolopts & WAKE_MAGIC) jme->reg_pmcs |= PMCS_MFEN; - jwrite32(jme, JME_PMCS, jme->reg_pmcs); device_set_wakeup_enable(&jme->pdev->dev, !!(jme->reg_pmcs)); return 0; @@ -3172,7 +3177,7 @@ jme_init_one(struct pci_dev *pdev, jme->mii_if.mdio_read = jme_mdio_read; jme->mii_if.mdio_write = jme_mdio_write; - jme_clear_pm(jme); + jme_clear_pm_disable_wol(jme); device_set_wakeup_enable(&pdev->dev, true); jme_set_phyfifo_5level(jme); @@ -3304,7 +3309,7 @@ jme_resume(struct device *dev) if (!netif_running(netdev)) return 0; - jme_clear_pm(jme); + jme_clear_pm_disable_wol(jme); jme_phy_on(jme); if (test_bit(JME_FLAG_SSET, &jme->flags)) jme_set_settings(netdev, &jme->old_ecmd); -- cgit v0.10.2 From 81422e672f8181d7ad1ee6c60c723aac649f538f Mon Sep 17 00:00:00 2001 From: Guo-Fu Tseng Date: Sat, 5 Mar 2016 08:11:56 +0800 Subject: jme: Fix device PM wakeup API usage According to Documentation/power/devices.txt The driver should not use device_set_wakeup_enable() which is the policy for user to decide. Using device_init_wakeup() to initialize dev->power.should_wakeup and dev->power.can_wakeup on driver initialization. And use device_may_wakeup() on suspend to decide if WoL function should be enabled on NIC. Reported-by: Diego Viola Signed-off-by: Guo-Fu Tseng Signed-off-by: David S. 
Miller diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c index 74b9c9d..3ddf657 100644 --- a/drivers/net/ethernet/jme.c +++ b/drivers/net/ethernet/jme.c @@ -1931,7 +1931,7 @@ jme_wait_link(struct jme_adapter *jme) static void jme_powersave_phy(struct jme_adapter *jme) { - if (jme->reg_pmcs) { + if (jme->reg_pmcs && device_may_wakeup(&jme->pdev->dev)) { jme_set_100m_half(jme); if (jme->reg_pmcs & (PMCS_LFEN | PMCS_LREN)) jme_wait_link(jme); @@ -2652,8 +2652,6 @@ jme_set_wol(struct net_device *netdev, if (wol->wolopts & WAKE_MAGIC) jme->reg_pmcs |= PMCS_MFEN; - device_set_wakeup_enable(&jme->pdev->dev, !!(jme->reg_pmcs)); - return 0; } @@ -3178,7 +3176,7 @@ jme_init_one(struct pci_dev *pdev, jme->mii_if.mdio_write = jme_mdio_write; jme_clear_pm_disable_wol(jme); - device_set_wakeup_enable(&pdev->dev, true); + device_init_wakeup(&pdev->dev, true); jme_set_phyfifo_5level(jme); jme->pcirev = pdev->revision; -- cgit v0.10.2 From 68c222a6bc90e4bf03df7ae36a9b196fa869928e Mon Sep 17 00:00:00 2001 From: yankejian Date: Sat, 5 Mar 2016 14:10:42 +0800 Subject: net: hns: fix the bug about loopback It will always be passed if the soc is tested the loopback cases. This patch will fix this bug. Signed-off-by: Kejian Yan Signed-off-by: David S. Miller diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c b/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c index a0070d0..d4f92ed 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_ae_adapt.c @@ -675,8 +675,12 @@ static int hns_ae_config_loopback(struct hnae_handle *handle, { int ret; struct hnae_vf_cb *vf_cb = hns_ae_get_vf_cb(handle); + struct hns_mac_cb *mac_cb = hns_get_mac_cb(handle); switch (loop) { + case MAC_INTERNALLOOP_PHY: + ret = 0; + break; case MAC_INTERNALLOOP_SERDES: ret = hns_mac_config_sds_loopback(vf_cb->mac_cb, en); break; @@ -686,6 +690,10 @@ static int hns_ae_config_loopback(struct hnae_handle *handle, default: ret = -EINVAL; } + + if (!ret) + hns_dsaf_set_inner_lb(mac_cb->dsaf_dev, mac_cb->mac_id, en); + return ret; } diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c index 9439f04..38fc5be 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c @@ -230,6 +230,30 @@ static void hns_dsaf_mix_def_qid_cfg(struct dsaf_device *dsaf_dev) } } +static void hns_dsaf_inner_qid_cfg(struct dsaf_device *dsaf_dev) +{ + u16 max_q_per_vf, max_vfn; + u32 q_id, q_num_per_port; + u32 mac_id; + + if (AE_IS_VER1(dsaf_dev->dsaf_ver)) + return; + + hns_rcb_get_queue_mode(dsaf_dev->dsaf_mode, + HNS_DSAF_COMM_SERVICE_NW_IDX, + &max_vfn, &max_q_per_vf); + q_num_per_port = max_vfn * max_q_per_vf; + + for (mac_id = 0, q_id = 0; mac_id < DSAF_SERVICE_NW_NUM; mac_id++) { + dsaf_set_dev_field(dsaf_dev, + DSAFV2_SERDES_LBK_0_REG + 4 * mac_id, + DSAFV2_SERDES_LBK_QID_M, + DSAFV2_SERDES_LBK_QID_S, + q_id); + q_id += q_num_per_port; + } +} + /** * hns_dsaf_sw_port_type_cfg - cfg sw type * @dsaf_id: dsa fabric id @@ -691,6 +715,16 @@ void hns_dsaf_set_promisc_mode(struct dsaf_device *dsaf_dev, u32 en) dsaf_set_dev_bit(dsaf_dev, DSAF_CFG_0_REG, DSAF_CFG_MIX_MODE_S, !!en); } +void hns_dsaf_set_inner_lb(struct dsaf_device *dsaf_dev, u32 mac_id, u32 en) +{ + if (AE_IS_VER1(dsaf_dev->dsaf_ver) || + dsaf_dev->mac_cb[mac_id].mac_type == HNAE_PORT_DEBUG) + return; + + dsaf_set_dev_bit(dsaf_dev, DSAFV2_SERDES_LBK_0_REG + 4 * mac_id, + DSAFV2_SERDES_LBK_EN_B, !!en); +} + 
/** * hns_dsaf_tbl_stat_en - tbl * @dsaf_id: dsa fabric id @@ -1022,6 +1056,9 @@ static void hns_dsaf_comm_init(struct dsaf_device *dsaf_dev) /* set promisc def queue id */ hns_dsaf_mix_def_qid_cfg(dsaf_dev); + /* set inner loopback queue id */ + hns_dsaf_inner_qid_cfg(dsaf_dev); + /* in non switch mode, set all port to access mode */ hns_dsaf_sw_port_type_cfg(dsaf_dev, DSAF_SW_PORT_TYPE_NON_VLAN); diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h index 40205b9..5fea226 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h @@ -417,5 +417,6 @@ void hns_dsaf_get_strings(int stringset, u8 *data, int port); void hns_dsaf_get_regs(struct dsaf_device *ddev, u32 port, void *data); int hns_dsaf_get_regs_count(void); void hns_dsaf_set_promisc_mode(struct dsaf_device *dsaf_dev, u32 en); +void hns_dsaf_set_inner_lb(struct dsaf_device *dsaf_dev, u32 mac_id, u32 en); #endif /* __HNS_DSAF_MAIN_H__ */ diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_reg.h b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_reg.h index f0c4f9b..60d695d 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_reg.h +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_reg.h @@ -134,6 +134,7 @@ #define DSAF_XGE_INT_STS_0_REG 0x1C0 #define DSAF_PPE_INT_STS_0_REG 0x1E0 #define DSAF_ROCEE_INT_STS_0_REG 0x200 +#define DSAFV2_SERDES_LBK_0_REG 0x220 #define DSAF_PPE_QID_CFG_0_REG 0x300 #define DSAF_SW_PORT_TYPE_0_REG 0x320 #define DSAF_STP_PORT_TYPE_0_REG 0x340 @@ -857,6 +858,10 @@ #define PPEV2_CFG_RSS_TBL_4N3_S 24 #define PPEV2_CFG_RSS_TBL_4N3_M (((1UL << 5) - 1) << PPEV2_CFG_RSS_TBL_4N3_S) +#define DSAFV2_SERDES_LBK_EN_B 8 +#define DSAFV2_SERDES_LBK_QID_S 0 +#define DSAFV2_SERDES_LBK_QID_M (((1UL << 8) - 1) << DSAFV2_SERDES_LBK_QID_S) + #define PPE_CNT_CLR_CE_B 0 #define PPE_CNT_CLR_SNAP_EN_B 1 diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c index 3df2284..3c4a3bc 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c @@ -295,8 +295,10 @@ static int __lb_setup(struct net_device *ndev, switch (loop) { case MAC_INTERNALLOOP_PHY: - if ((phy_dev) && (!phy_dev->is_c45)) + if ((phy_dev) && (!phy_dev->is_c45)) { ret = hns_nic_config_phy_loopback(phy_dev, 0x1); + ret |= h->dev->ops->set_loopback(h, loop, 0x1); + } break; case MAC_INTERNALLOOP_MAC: if ((h->dev->ops->set_loopback) && @@ -376,6 +378,7 @@ static void __lb_other_process(struct hns_nic_ring_data *ring_data, struct sk_buff *skb) { struct net_device *ndev; + struct hns_nic_priv *priv; struct hnae_ring *ring; struct netdev_queue *dev_queue; struct sk_buff *new_skb; @@ -385,8 +388,17 @@ static void __lb_other_process(struct hns_nic_ring_data *ring_data, char buff[33]; /* 32B data and the last character '\0' */ if (!ring_data) { /* Just for doing create frame*/ + ndev = skb->dev; + priv = netdev_priv(ndev); + frame_size = skb->len; memset(skb->data, 0xFF, frame_size); + if ((!AE_IS_VER1(priv->enet_ver)) && + (priv->ae_handle->port_type == HNAE_PORT_SERVICE)) { + memcpy(skb->data, ndev->dev_addr, 6); + skb->data[5] += 0x1f; + } + frame_size &= ~1ul; memset(&skb->data[frame_size / 2], 0xAA, frame_size / 2 - 1); memset(&skb->data[frame_size / 2 + 10], 0xBE, @@ -486,6 +498,7 @@ static int __lb_run_test(struct net_device *ndev, /* place data into test skb */ (void)skb_put(skb, size); + skb->dev = ndev; __lb_other_process(NULL, 
skb); skb->queue_mapping = NIC_LB_TEST_RING_ID; -- cgit v0.10.2 From a9d99ce28ed359d68cf6f3c1a69038aefedf6d6a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 6 Mar 2016 09:29:21 -0800 Subject: tcp: fix tcpi_segs_in after connection establishment If final packet (ACK) of 3WHS is lost, it appears we do not properly account the following incoming segment into tcpi_segs_in While we are at it, starts segs_in with one, to count the SYN packet. We do not yet count number of SYN we received for a request sock, we might add this someday. packetdrill script showing proper behavior after fix : // Tests tcpi_segs_in when 3rd packet (ACK) of 3WHS is lost 0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 +0 < S 0:0(0) win 32792 +0 > S. 0:0(0) ack 1 +.020 < P. 1:1001(1000) ack 1 win 32792 +0 accept(3, ..., ...) = 4 +.000 %{ assert tcpi_segs_in == 2, 'tcpi_segs_in=%d' % tcpi_segs_in }% Fixes: 2efd055c53c06 ("tcp: add tcpi_segs_in and tcpi_segs_out to tcp_info") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 75632a9..9b02af2 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -455,7 +455,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; - newtp->segs_in = 0; + newtp->segs_in = 1; newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; @@ -815,6 +815,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, int ret = 0; int state = child->sk_state; + tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); if (!sock_owned_by_user(child)) { ret = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ -- cgit v0.10.2 From 4d06dd537f95683aba3651098ae288b7cbff8274 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Mon, 7 Mar 2016 21:15:36 +0100 Subject: cdc_ncm: do not call usbnet_link_change from cdc_ncm_bind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit usbnet_link_change will call schedule_work and should be avoided if bind is failing. Otherwise we will end up with scheduled work referring to a netdev which has gone away. Instead of making the call conditional, we can just defer it to usbnet_probe, using the driver_info flag made for this purpose. Fixes: 8a34b0ae8778 ("usbnet: cdc_ncm: apply usbnet_link_change") Reported-by: Andrey Konovalov Suggested-by: Linus Torvalds Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index be92796..86ba30b 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -988,8 +988,6 @@ EXPORT_SYMBOL_GPL(cdc_ncm_select_altsetting); static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) { - int ret; - /* MBIM backwards compatible function? */ if (cdc_ncm_select_altsetting(intf) != CDC_NCM_COMM_ALTSETTING_NCM) return -ENODEV; @@ -998,16 +996,7 @@ static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf) * Additionally, generic NCM devices are assumed to accept arbitrarily * placed NDP. */ - ret = cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0); - - /* - * We should get an event when network connection is "connected" or - * "disconnected". 
Set network connection in "disconnected" state - * (carrier is OFF) during attach, so the IP network stack does not - * start IPv6 negotiation and more. - */ - usbnet_link_change(dev, 0, 0); - return ret; + return cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0); } static void cdc_ncm_align_tail(struct sk_buff *skb, size_t modulus, size_t remainder, size_t max) @@ -1590,7 +1579,8 @@ static void cdc_ncm_status(struct usbnet *dev, struct urb *urb) static const struct driver_info cdc_ncm_info = { .description = "CDC NCM", - .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET, + .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET + | FLAG_LINK_INTR, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, @@ -1603,7 +1593,7 @@ static const struct driver_info cdc_ncm_info = { static const struct driver_info wwan_info = { .description = "Mobile Broadband Network Device", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET - | FLAG_WWAN, + | FLAG_LINK_INTR | FLAG_WWAN, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, @@ -1616,7 +1606,7 @@ static const struct driver_info wwan_info = { static const struct driver_info wwan_noarp_info = { .description = "Mobile Broadband Network Device (NO ARP)", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT | FLAG_MULTI_PACKET - | FLAG_WWAN | FLAG_NOARP, + | FLAG_LINK_INTR | FLAG_WWAN | FLAG_NOARP, .bind = cdc_ncm_bind, .unbind = cdc_ncm_unbind, .manage_power = usbnet_manage_power, -- cgit v0.10.2 From 6faac63a6986f29ef39827f460edd3a5ba64ad5c Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Mon, 7 Mar 2016 19:36:44 +0100 Subject: ppp: release rtnl mutex when interface creation fails Add missing rtnl_unlock() in the error path of ppp_create_interface(). Fixes: 58a89ecaca53 ("ppp: fix lockdep splat in ppp_dev_uninit()") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index e8a5936..d61da9ec 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -2816,6 +2816,7 @@ static struct ppp *ppp_create_interface(struct net *net, int unit, out2: mutex_unlock(&pn->all_ppp_mutex); + rtnl_unlock(); free_netdev(dev); out1: *retp = ret; -- cgit v0.10.2 From 256faedcfd646161477d47a1a78c32a562d2e845 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 7 Mar 2016 13:15:09 -0800 Subject: Revert "drm/radeon: call hpd_irq_event on resume" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit dbb17a21c131eca94eb31136eee9a7fe5aff00d9. It turns out that commit can cause problems for systems with multiple GPUs, and causes X to hang on at least a HP Pavilion dv7 with hybrid graphics. This got noticed originally in 4.4.4, where this patch had already gotten back-ported, but 4.5-rc7 was verified to have the same problem. Alexander Deucher says: "It looks like you have a muxed system so I suspect what's happening is that one of the display is being reported as connected for both the IGP and the dGPU and then the desktop environment gets confused or there some sort problem in the detect functions since the mux is not switched to the dGPU. I don't see an easy fix unless Dave has any ideas. 
I'd say just revert for now" Reported-by: Jörg-Volker Peetz Acked-by: Alexander Deucher Cc: Dave Airlie Cc: stable@kernel.org # wherever dbb17a21c131 got back-ported Signed-off-by: Linus Torvalds diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index 902b59c..4197ca1 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -1744,7 +1744,6 @@ int radeon_resume_kms(struct drm_device *dev, bool resume, bool fbcon) } drm_kms_helper_poll_enable(dev); - drm_helper_hpd_irq_event(dev); /* set the power state here in case we are a PX system or headless */ if ((rdev->pm.pm_method == PM_METHOD_DPM) && rdev->pm.dpm_enabled) -- cgit v0.10.2 From d17e67de6e81394624685f16f174453bd561d48b Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 7 Mar 2016 18:06:01 +0100 Subject: drm/vmwgfx: Add back ->detect() and ->fill_modes() This partially reverts commit d56f57ac969c ("drm/gma500: Move to private save/restore hooks") which removed these lines by mistake. Reported-by: Sebastian Herbszt Acked-by: Daniel Vetter Signed-off-by: Thierry Reding Tested-by: Thomas Hellstrom Tested-by: Sebastian Herbszt Signed-off-by: Dave Airlie diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c b/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c index db082be..c5a1a08 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_scrn.c @@ -563,6 +563,8 @@ static void vmw_sou_connector_destroy(struct drm_connector *connector) static const struct drm_connector_funcs vmw_sou_connector_funcs = { .dpms = vmw_du_connector_dpms, + .detect = vmw_du_connector_detect, + .fill_modes = vmw_du_connector_fill_modes, .set_property = vmw_du_connector_set_property, .destroy = vmw_sou_connector_destroy, }; -- cgit v0.10.2 From dad82ea3ef4b9dc97ca5ec718abc6b0986f1aaaf Mon Sep 17 00:00:00 2001 From: Jyri Sarha Date: Sat, 16 Jan 2016 22:17:54 +0200 Subject: drm/i2c: tda998x: Choose between atomic or non atomic dpms helper Choose between atomic or non atomic connector dpms helper. If tda998x is connected to a drm driver that does not support atomic modeset calling drm_atomic_helper_connector_dpms() causes a crash when the connectors atomic state is not initialized. The patch implements a driver specific connector dpms helper that calls drm_atomic_helper_connector_dpms() if driver supports DRIVER_ATOMIC and otherwise it calls the legacy drm_helper_connector_dpms(). Fixes commit 9736e988d328 ("drm/i2c: tda998x: Add support for atomic modesetting"). 
Signed-off-by: Jyri Sarha Signed-off-by: Dave Airlie diff --git a/drivers/gpu/drm/i2c/tda998x_drv.c b/drivers/gpu/drm/i2c/tda998x_drv.c index 34e3874..f8ee740 100644 --- a/drivers/gpu/drm/i2c/tda998x_drv.c +++ b/drivers/gpu/drm/i2c/tda998x_drv.c @@ -1382,8 +1382,16 @@ static void tda998x_connector_destroy(struct drm_connector *connector) drm_connector_cleanup(connector); } +static int tda998x_connector_dpms(struct drm_connector *connector, int mode) +{ + if (drm_core_check_feature(connector->dev, DRIVER_ATOMIC)) + return drm_atomic_helper_connector_dpms(connector, mode); + else + return drm_helper_connector_dpms(connector, mode); +} + static const struct drm_connector_funcs tda998x_connector_funcs = { - .dpms = drm_atomic_helper_connector_dpms, + .dpms = tda998x_connector_dpms, .reset = drm_atomic_helper_connector_reset, .fill_modes = drm_helper_probe_single_connector_modes, .detect = tda998x_connector_detect, -- cgit v0.10.2 From ccec44563b18a0ce90e2d4f332784b3cb25c8e9c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 5 Mar 2016 19:34:39 +1100 Subject: KVM: PPC: Book3S HV: Sanitize special-purpose register values on guest exit Thomas Huth discovered that a guest could cause a hard hang of a host CPU by setting the Instruction Authority Mask Register (IAMR) to a suitable value. It turns out that this is because when the code was added to context-switch the new special-purpose registers (SPRs) that were added in POWER8, we forgot to add code to ensure that they were restored to a sane value on guest exit. This adds code to set those registers where a bad value could compromise the execution of the host kernel to a suitable neutral value on guest exit. Cc: stable@vger.kernel.org # v3.14+ Fixes: b005255e12a3 Reported-by: Thomas Huth Reviewed-by: David Gibson Signed-off-by: Paul Mackerras diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 6ee26de..25ae2c9 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1370,6 +1370,20 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) std r6, VCPU_ACOP(r9) stw r7, VCPU_GUEST_PID(r9) std r8, VCPU_WORT(r9) + /* + * Restore various registers to 0, where non-zero values + * set by the guest could disrupt the host. + */ + li r0, 0 + mtspr SPRN_IAMR, r0 + mtspr SPRN_CIABR, r0 + mtspr SPRN_DAWRX, r0 + mtspr SPRN_TCSCR, r0 + mtspr SPRN_WORT, r0 + /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */ + li r0, 1 + sldi r0, r0, 31 + mtspr SPRN_MMCRS, r0 8: /* Save and reset AMR and UAMOR before turning on the MMU */ -- cgit v0.10.2 From 803c00123a8012b3a283c0530910653973ef6d8f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 7 Mar 2016 22:17:07 -0500 Subject: ncpfs: fix a braino in OOM handling in ncp_fill_cache() Failing to allocate an inode for child means that cache for *parent* is incompletely populated. So it's parent directory inode ('dir') that needs NCPI_DIR_CACHE flag removed, *not* the child inode ('inode', which is what we'd failed to allocate in the first place). 
Fucked-up-in: commit 5e993e25 ("ncpfs: get rid of d_validate() nonsense") Fucked-up-by: Al Viro Cc: stable@vger.kernel.org # v3.19 Signed-off-by: Al Viro diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 26c2de2..b7f8eae 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -633,7 +633,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, d_rehash(newdent); } else { spin_lock(&dentry->d_lock); - NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE; + NCP_FINFO(dir)->flags &= ~NCPI_DIR_CACHE; spin_unlock(&dentry->d_lock); } } else { -- cgit v0.10.2 From f93812846f31381d35c04c6c577d724254355e7f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 7 Mar 2016 23:07:10 -0500 Subject: jffs2: reduce the breakage on recovery from halfway failed rename() d_instantiate(new_dentry, old_inode) is absolutely wrong thing to do - it will oops if new_dentry used to be positive, for starters. What we need is d_invalidate() the target and be done with that. Cc: stable@vger.kernel.org # v3.18+ Signed-off-by: Al Viro diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index d211b8e..30c4c9e 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -843,9 +843,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n", __func__, ret); - /* Might as well let the VFS know */ - d_instantiate(new_dentry, d_inode(old_dentry)); - ihold(d_inode(old_dentry)); + /* + * We can't keep the target in dcache after that. + * For one thing, we can't afford dentry aliases for directories. + * For another, if there was a victim, we _can't_ set new inode + * for that sucker and we have to trigger mount eviction - the + * caller won't do it on its own since we are returning an error. + */ + d_invalidate(new_dentry); new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); return ret; } -- cgit v0.10.2 From bc864af13f34d19c911f5691d87bdacc9ce109f5 Mon Sep 17 00:00:00 2001 From: Chris Bainbridge Date: Mon, 7 Mar 2016 11:10:00 +0100 Subject: x86/microcode/intel: Change checksum variables to u32 Microcode checksum verification should be done using unsigned 32-bit values otherwise the calculation overflow results in undefined behaviour. This is also nicely documented in the SDM, section "Microcode Update Checksum": "To check for a corrupt microcode update, software must perform a unsigned DWORD (32-bit) checksum of the microcode update. Even though some fields are signed, the checksum procedure treats all DWORDs as unsigned. Microcode updates with a header version equal to 00000001H must sum all DWORDs that comprise the microcode update. A valid checksum check will yield a value of 00000000H." but for some reason the code has been using ints from the very beginning. In practice, this bug possibly manifested itself only when doing the microcode data checksum - apparently, currently shipped Intel microcode doesn't have an extended signature table for which we do checksum verification too. UBSAN: Undefined behaviour in arch/x86/kernel/cpu/microcode/intel_lib.c:105:12 signed integer overflow: -1500151068 + -2125470173 cannot be represented in type 'int' CPU: 0 PID: 0 Comm: swapper Not tainted 4.5.0-rc5+ #495 ... Call Trace: dump_stack ? inotify_ioctl ubsan_epilogue handle_overflow __ubsan_handle_add_overflow microcode_sanity_check get_matching_model_microcode.isra.2.constprop.8 ? early_idt_handler_common ? strlcpy ? find_cpio_data load_ucode_intel_bsp load_ucode_bsp ? load_ucode_bsp x86_64_start_kernel [ Expand and massage commit message. 
] Signed-off-by: Chris Bainbridge Signed-off-by: Borislav Petkov Cc: hmh@hmh.eng.br Link: http://lkml.kernel.org/r/1456834359-5132-1-git-send-email-chris.bainbridge@gmail.com Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index b96896b..99ca2c9 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -49,7 +49,7 @@ int microcode_sanity_check(void *mc, int print_err) unsigned long total_size, data_size, ext_table_size; struct microcode_header_intel *mc_header = mc; struct extended_sigtable *ext_header = NULL; - int sum, orig_sum, ext_sigcount = 0, i; + u32 sum, orig_sum, ext_sigcount = 0, i; struct extended_signature *ext_sig; total_size = get_totalsize(mc_header); @@ -85,8 +85,8 @@ int microcode_sanity_check(void *mc, int print_err) /* check extended table checksum */ if (ext_table_size) { - int ext_table_sum = 0; - int *ext_tablep = (int *)ext_header; + u32 ext_table_sum = 0; + u32 *ext_tablep = (u32 *)ext_header; i = ext_table_size / DWSIZE; while (i--) @@ -102,7 +102,7 @@ int microcode_sanity_check(void *mc, int print_err) orig_sum = 0; i = (MC_HEADER_SIZE + data_size) / DWSIZE; while (i--) - orig_sum += ((int *)mc)[i]; + orig_sum += ((u32 *)mc)[i]; if (orig_sum) { if (print_err) pr_err("aborting, bad checksum\n"); -- cgit v0.10.2 From c0414622177ae4739a49ca7dad4306a681e2878b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Mar 2016 11:10:01 +0100 Subject: x86/microcode/intel: Get rid of DWSIZE sizeof(u32) is perfectly clear as it is. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1457345404-28884-3-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 8559b01..603417f 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -40,7 +40,6 @@ struct extended_sigtable { #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) #define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) #define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) -#define DWSIZE (sizeof(u32)) #define get_totalsize(mc) \ (((struct microcode_intel *)mc)->hdr.datasize ? \ diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index 99ca2c9..2b2d135 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -88,7 +88,7 @@ int microcode_sanity_check(void *mc, int print_err) u32 ext_table_sum = 0; u32 *ext_tablep = (u32 *)ext_header; - i = ext_table_size / DWSIZE; + i = ext_table_size / sizeof(u32); while (i--) ext_table_sum += ext_tablep[i]; if (ext_table_sum) { @@ -100,7 +100,7 @@ int microcode_sanity_check(void *mc, int print_err) /* calculate the checksum */ orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / DWSIZE; + i = (MC_HEADER_SIZE + data_size) / sizeof(u32); while (i--) orig_sum += ((u32 *)mc)[i]; if (orig_sum) { -- cgit v0.10.2 From 7d0161569a3b66aaa01520002c3e5fd7126d071f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Mar 2016 11:10:02 +0100 Subject: x86/microcode/intel: Merge two consecutive if-statements Merge the two consecutive "if (ext_table_size)". No functional change. 
Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1457345404-28884-4-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index 2b2d135..ffb1bbf 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -68,6 +68,9 @@ int microcode_sanity_check(void *mc, int print_err) } ext_table_size = total_size - (MC_HEADER_SIZE + data_size); if (ext_table_size) { + u32 ext_table_sum = 0; + u32 *ext_tablep; + if ((ext_table_size < EXT_HEADER_SIZE) || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { if (print_err) @@ -81,12 +84,9 @@ int microcode_sanity_check(void *mc, int print_err) return -EFAULT; } ext_sigcount = ext_header->count; - } - /* check extended table checksum */ - if (ext_table_size) { - u32 ext_table_sum = 0; - u32 *ext_tablep = (u32 *)ext_header; + /* check extended table checksum */ + ext_tablep = (u32 *)ext_header; i = ext_table_size / sizeof(u32); while (i--) -- cgit v0.10.2 From 5b46b5e003724547f0c83041cada15f9f496590d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Mar 2016 11:10:03 +0100 Subject: x86/microcode/intel: Improve microcode sanity-checking error messages Turn them into proper sentences. Add comments to microcode_sanity_check() to explain what it does. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1457345404-28884-5-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index ffb1bbf..23b1d92 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -57,15 +57,16 @@ int microcode_sanity_check(void *mc, int print_err) if (data_size + MC_HEADER_SIZE > total_size) { if (print_err) - pr_err("error! Bad data size in microcode data file\n"); + pr_err("Error: bad microcode data file size.\n"); return -EINVAL; } if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { if (print_err) - pr_err("error! Unknown microcode update format\n"); + pr_err("Error: invalid/unknown microcode update format.\n"); return -EINVAL; } + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); if (ext_table_size) { u32 ext_table_sum = 0; @@ -74,43 +75,58 @@ int microcode_sanity_check(void *mc, int print_err) if ((ext_table_size < EXT_HEADER_SIZE) || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { if (print_err) - pr_err("error! Small exttable size in microcode data file\n"); + pr_err("Error: truncated extended signature table.\n"); return -EINVAL; } + ext_header = mc + MC_HEADER_SIZE + data_size; if (ext_table_size != exttable_size(ext_header)) { if (print_err) - pr_err("error! Bad exttable size in microcode data file\n"); + pr_err("Error: extended signature table size mismatch.\n"); return -EFAULT; } + ext_sigcount = ext_header->count; - /* check extended table checksum */ + /* + * Check extended table checksum: the sum of all dwords that + * comprise a valid table must be 0. + */ ext_tablep = (u32 *)ext_header; i = ext_table_size / sizeof(u32); while (i--) ext_table_sum += ext_tablep[i]; + if (ext_table_sum) { if (print_err) - pr_warn("aborting, bad extended signature table checksum\n"); + pr_warn("Bad extended signature table checksum, aborting.\n"); return -EINVAL; } } - /* calculate the checksum */ + /* + * Calculate the checksum of update data and header. 
The checksum of + * valid update data and header including the extended signature table + * must be 0. + */ orig_sum = 0; i = (MC_HEADER_SIZE + data_size) / sizeof(u32); while (i--) orig_sum += ((u32 *)mc)[i]; + if (orig_sum) { if (print_err) - pr_err("aborting, bad checksum\n"); + pr_err("Bad microcode data checksum, aborting.\n"); return -EINVAL; } + if (!ext_table_size) return 0; - /* check extended signature checksum */ + + /* + * Check extended signature checksum: 0 => valid. + */ for (i = 0; i < ext_sigcount; i++) { ext_sig = (void *)ext_header + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i; @@ -119,7 +135,7 @@ int microcode_sanity_check(void *mc, int print_err) + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); if (sum) { if (print_err) - pr_err("aborting, bad checksum\n"); + pr_err("Bad extended signature checksum, aborting.\n"); return -EINVAL; } } -- cgit v0.10.2 From 4ace2e7a48ab426eaa9745ace4c50c6a7adb3992 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Mar 2016 11:10:04 +0100 Subject: x86/microcode/intel: Drop orig_sum from ext signature checksum It is 0 because for !0 values we would have exited already. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1457345404-28884-6-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index 23b1d92..2ce1a7d 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -130,9 +130,9 @@ int microcode_sanity_check(void *mc, int print_err) for (i = 0; i < ext_sigcount; i++) { ext_sig = (void *)ext_header + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i; - sum = orig_sum - - (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + + sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); if (sum) { if (print_err) pr_err("Bad extended signature checksum, aborting.\n"); -- cgit v0.10.2 From 07ef7574458369cb0345facc748e964af68a75f4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 7 Mar 2016 16:44:37 -0300 Subject: perf tools: Explicitly declare inc_group_count as a void function The return type is not defined, so it defaults to int, however, the function is not returning anything, so this is clearly not correct. Make it a void function. Signed-off-by: Colin Ian King Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: David Ahern Cc: He Kuang Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Wang Nan Link: http://lkml.kernel.org/r/1457008214-14393-1-git-send-email-colin.king@canonical.com Signed-off-by: Ingo Molnar diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 85c44ba..5be4a5f 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -28,7 +28,7 @@ do { \ INIT_LIST_HEAD(list); \ } while (0) -static inc_group_count(struct list_head *list, +static void inc_group_count(struct list_head *list, struct parse_events_evlist *data) { /* Count groups only have more than 1 members */ -- cgit v0.10.2 From 640dad47988ec4b734d71934be103bb6e931279f Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 7 Mar 2016 16:44:38 -0300 Subject: perf inject: Hit all DSOs for AUX data in JIT and other cases Currently, when injecting build ids, if there is AUX data then 'perf inject' hits all DSOs because it is not known which DSOs the trace data would hit. 
That needs to be done for JIT injection also, and in fact there is no reason to distinguish what kind of injection is being done. That is, any time there is AUX data and the HEADER_BUID_ID feature flag is set, and the AUX data is not being processed, then hit all DSOs. This patch does that. Signed-off-by: Adrian Hunter Signed-off-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1457005856-6143-2-git-send-email-adrian.hunter@intel.com Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index b38445f..c6a4f2f 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -679,12 +679,16 @@ static int __cmd_inject(struct perf_inject *inject) ret = perf_session__process_events(session); if (!file_out->is_pipe) { - if (inject->build_ids) { + if (inject->build_ids) perf_header__set_feat(&session->header, HEADER_BUILD_ID); - if (inject->have_auxtrace) - dsos__hit_all(session); - } + /* + * Keep all buildids when there is unprocessed AUX data because + * it is not known which ones the AUX trace hits. + */ + if (perf_header__has_feat(&session->header, HEADER_BUILD_ID) && + inject->have_auxtrace && !inject->itrace_synth_opts.set) + dsos__hit_all(session); /* * The AUX areas have been removed and replaced with * synthesized hardware events, so clear the feature flag and -- cgit v0.10.2 From 5fb0ac16c5091f48eecf1a77e461f6957a463d61 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 7 Mar 2016 16:44:39 -0300 Subject: perf session: Simplify tool stubs Some of the stubs are identical so just have one function for them. Signed-off-by: Adrian Hunter Signed-off-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1457005856-6143-3-git-send-email-adrian.hunter@intel.com Signed-off-by: Ingo Molnar diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 40b7a0d..60b3593 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -240,14 +240,6 @@ static int process_event_stub(struct perf_tool *tool __maybe_unused, return 0; } -static int process_build_id_stub(struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct perf_session *session __maybe_unused) -{ - dump_printf(": unhandled!\n"); - return 0; -} - static int process_finished_round_stub(struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct ordered_events *oe __maybe_unused) @@ -260,23 +252,6 @@ static int process_finished_round(struct perf_tool *tool, union perf_event *event, struct ordered_events *oe); -static int process_id_index_stub(struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct perf_session *perf_session - __maybe_unused) -{ - dump_printf(": unhandled!\n"); - return 0; -} - -static int process_event_auxtrace_info_stub(struct perf_tool *tool __maybe_unused, - union perf_event *event __maybe_unused, - struct perf_session *session __maybe_unused) -{ - dump_printf(": unhandled!\n"); - return 0; -} - static int skipn(int fd, off_t n) { char buf[4096]; @@ -303,10 +278,9 @@ static s64 process_event_auxtrace_stub(struct perf_tool *tool __maybe_unused, return event->auxtrace.size; } -static -int process_event_auxtrace_error_stub(struct perf_tool *tool __maybe_unused, - union 
perf_event *event __maybe_unused, - struct perf_session *session __maybe_unused) +static int process_event_op2_stub(struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_session *session __maybe_unused) { dump_printf(": unhandled!\n"); return 0; @@ -410,7 +384,7 @@ void perf_tool__fill_defaults(struct perf_tool *tool) if (tool->tracing_data == NULL) tool->tracing_data = process_event_synth_tracing_data_stub; if (tool->build_id == NULL) - tool->build_id = process_build_id_stub; + tool->build_id = process_event_op2_stub; if (tool->finished_round == NULL) { if (tool->ordered_events) tool->finished_round = process_finished_round; @@ -418,13 +392,13 @@ void perf_tool__fill_defaults(struct perf_tool *tool) tool->finished_round = process_finished_round_stub; } if (tool->id_index == NULL) - tool->id_index = process_id_index_stub; + tool->id_index = process_event_op2_stub; if (tool->auxtrace_info == NULL) - tool->auxtrace_info = process_event_auxtrace_info_stub; + tool->auxtrace_info = process_event_op2_stub; if (tool->auxtrace == NULL) tool->auxtrace = process_event_auxtrace_stub; if (tool->auxtrace_error == NULL) - tool->auxtrace_error = process_event_auxtrace_error_stub; + tool->auxtrace_error = process_event_op2_stub; if (tool->thread_map == NULL) tool->thread_map = process_event_thread_map_stub; if (tool->cpu_map == NULL) -- cgit v0.10.2 From 570735b33d122bcb259ef67c6aa63e5609af5752 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 7 Mar 2016 16:44:40 -0300 Subject: perf jit: Let jit_process() return errors In preparation for moving clockid validation into jit_process(). Previously a return value of zero meant the processing had been done and non-zero meant either the processing was not done (i.e. not the jitdump file mmap event) or an error occurred. Change it so that zero means the processing was not done, one means the processing was done and successful, and negative values are an error. 
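To make the new convention concrete, here is a minimal standalone sketch of a caller and callee following it. This is illustrative only: process_marker() and its "jit-" filename check are invented for the example and are not part of the perf sources.

#include <stdio.h>
#include <string.h>

/*
 * Same tri-state convention the patch gives jit_process():
 *   < 0  -> error, the caller propagates it
 *     0  -> input not handled here (not a jitdump marker)
 *   > 0  -> handled; *nbytes reports how much was written
 */
static int process_marker(const char *filename, unsigned long *nbytes)
{
	if (!strstr(filename, "jit-"))
		return 0;		/* not ours, caller falls through */
	if (!nbytes)
		return -1;		/* error */
	*nbytes = 42;			/* pretend we injected 42 bytes */
	return 1;			/* handled */
}

int main(void)
{
	unsigned long n = 0;
	int ret = process_marker("jit-1234.dump", &n);

	if (ret < 0)
		return 1;		/* propagate the error */
	if (ret > 0)
		printf("handled, %lu bytes written\n", n);
	else
		printf("not a jitdump mmap, fall through\n");
	return 0;
}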
Signed-off-by: Adrian Hunter Signed-off-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1457005856-6143-5-git-send-email-adrian.hunter@intel.com Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index c6a4f2f..2512d71 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -253,12 +253,16 @@ static int perf_event__jit_repipe_mmap(struct perf_tool *tool, { struct perf_inject *inject = container_of(tool, struct perf_inject, tool); u64 n = 0; + int ret; /* * if jit marker, then inject jit mmaps and generate ELF images */ - if (!jit_process(inject->session, &inject->output, machine, - event->mmap.filename, sample->pid, &n)) { + ret = jit_process(inject->session, &inject->output, machine, + event->mmap.filename, sample->pid, &n); + if (ret < 0) + return ret; + if (ret) { inject->bytes_written += n; return 0; } @@ -287,12 +291,16 @@ static int perf_event__jit_repipe_mmap2(struct perf_tool *tool, { struct perf_inject *inject = container_of(tool, struct perf_inject, tool); u64 n = 0; + int ret; /* * if jit marker, then inject jit mmaps and generate ELF images */ - if (!jit_process(inject->session, &inject->output, machine, - event->mmap2.filename, sample->pid, &n)) { + ret = jit_process(inject->session, &inject->output, machine, + event->mmap2.filename, sample->pid, &n); + if (ret < 0) + return ret; + if (ret) { inject->bytes_written += n; return 0; } diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index 99fa5ee..bd9e44f 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -647,7 +647,7 @@ jit_process(struct perf_session *session, * first, detect marker mmap (i.e., the jitdump mmap) */ if (jit_detect(filename, pid)) - return -1; + return 0; memset(&jd, 0, sizeof(jd)); @@ -665,8 +665,10 @@ jit_process(struct perf_session *session, *nbytes = 0; ret = jit_inject(&jd, filename); - if (!ret) + if (!ret) { *nbytes = jd.bytes_written; + ret = 1; + } return ret; } -- cgit v0.10.2 From 4a018cc47932ef1e68a0600ce3ac100df70fab2a Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 7 Mar 2016 16:44:41 -0300 Subject: perf jit: Move clockid validation Move clockid validation into jit_process() so it can later be made conditional. 
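In practical terms the validation means jitted code has to be recorded with CLOCK_MONOTONIC timestamps before it can be injected. A plausible workflow looks like the following; only the '-k 1' option comes from the error message in this patch, while the file names, the '--jit' spelling and the workload placeholder are assumptions for illustration:

	perf record -k 1 -o perf.data -- <your jitted workload>
	perf inject --jit -i perf.data -o perf.data.jited
	perf report -i perf.data.jited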
Signed-off-by: Adrian Hunter Signed-off-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1457005856-6143-6-git-send-email-adrian.hunter@intel.com Signed-off-by: Ingo Molnar diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 2512d71..b288577 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -729,23 +729,6 @@ static int __cmd_inject(struct perf_inject *inject) return ret; } -#ifdef HAVE_LIBELF_SUPPORT -static int -jit_validate_events(struct perf_session *session) -{ - struct perf_evsel *evsel; - - /* - * check that all events use CLOCK_MONOTONIC - */ - evlist__for_each(session->evlist, evsel) { - if (evsel->attr.use_clockid == 0 || evsel->attr.clockid != CLOCK_MONOTONIC) - return -1; - } - return 0; -} -#endif - int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused) { struct perf_inject inject = { @@ -852,13 +835,6 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused) } #ifdef HAVE_LIBELF_SUPPORT if (inject.jit_mode) { - /* - * validate event is using the correct clockid - */ - if (jit_validate_events(inject.session)) { - fprintf(stderr, "error, jitted code must be sampled with perf record -k 1\n"); - return -1; - } inject.tool.mmap2 = perf_event__jit_repipe_mmap2; inject.tool.mmap = perf_event__jit_repipe_mmap; inject.tool.ordered_events = true; diff --git a/tools/perf/util/jitdump.c b/tools/perf/util/jitdump.c index bd9e44f..cd272cc 100644 --- a/tools/perf/util/jitdump.c +++ b/tools/perf/util/jitdump.c @@ -99,6 +99,21 @@ jit_close(struct jit_buf_desc *jd) } static int +jit_validate_events(struct perf_session *session) +{ + struct perf_evsel *evsel; + + /* + * check that all events use CLOCK_MONOTONIC + */ + evlist__for_each(session->evlist, evsel) { + if (evsel->attr.use_clockid == 0 || evsel->attr.clockid != CLOCK_MONOTONIC) + return -1; + } + return 0; +} + +static int jit_open(struct jit_buf_desc *jd, const char *name) { struct jitheader header; @@ -157,6 +172,14 @@ jit_open(struct jit_buf_desc *jd, const char *name) goto error; } + /* + * validate event is using the correct clockid + */ + if (jit_validate_events(jd->session)) { + pr_err("error, jitted code must be sampled with perf record -k 1\n"); + goto error; + } + bs = header.total_size - sizeof(header); if (bs > bsz) { -- cgit v0.10.2 From a23f96ee4d51ebd50b83ce0dbb5d04898fb8e3cb Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 7 Mar 2016 16:44:42 -0300 Subject: perf tools: Use 64-bit shifts with (TSC) time conversion Commit b9511cd761fa ("perf/x86: Fix time_shift in perf_event_mmap_page") altered the time conversion algorithms documented in the perf_event.h header file, to use 64-bit shifts. That was done to make the code more future-proof (i.e. some time in the future a 32-bit shift could be allowed). Reflect those changes in perf tools. 
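A self-contained sketch of the conversion being adjusted, with the remainder mask built from a 64-bit constant so that a future time_shift of 32 or more cannot overflow the shift expression; the struct and field names here are illustrative stand-ins for perf_tsc_conversion:

#include <stdint.h>

struct tsc_conv {
	uint16_t time_shift;
	uint32_t time_mult;
	uint64_t time_zero;
};

static uint64_t tsc_to_time(uint64_t cyc, const struct tsc_conv *tc)
{
	uint64_t quot = cyc >> tc->time_shift;
	/* (uint64_t)1 keeps the mask 64-bit even for time_shift >= 32 */
	uint64_t rem  = cyc & (((uint64_t)1 << tc->time_shift) - 1);

	return tc->time_zero + quot * tc->time_mult +
	       ((rem * tc->time_mult) >> tc->time_shift);
}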
Signed-off-by: Adrian Hunter Signed-off-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1457005856-6143-9-git-send-email-adrian.hunter@intel.com Signed-off-by: Ingo Molnar diff --git a/tools/perf/arch/x86/tests/rdpmc.c b/tools/perf/arch/x86/tests/rdpmc.c index 7945462..72193f19 100644 --- a/tools/perf/arch/x86/tests/rdpmc.c +++ b/tools/perf/arch/x86/tests/rdpmc.c @@ -59,7 +59,7 @@ static u64 mmap_read_self(void *addr) u64 quot, rem; quot = (cyc >> time_shift); - rem = cyc & ((1 << time_shift) - 1); + rem = cyc & (((u64)1 << time_shift) - 1); delta = time_offset + quot * time_mult + ((rem * time_mult) >> time_shift); diff --git a/tools/perf/util/tsc.c b/tools/perf/util/tsc.c index 4d4210d..1b74164 100644 --- a/tools/perf/util/tsc.c +++ b/tools/perf/util/tsc.c @@ -19,7 +19,7 @@ u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc) u64 quot, rem; quot = cyc >> tc->time_shift; - rem = cyc & ((1 << tc->time_shift) - 1); + rem = cyc & (((u64)1 << tc->time_shift) - 1); return tc->time_zero + quot * tc->time_mult + ((rem * tc->time_mult) >> tc->time_shift); } -- cgit v0.10.2 From 4b633eba14627bcb1ef5c7a498e7dc308cd6a5d6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 7 Mar 2016 16:44:43 -0300 Subject: perf hists: Add level field to struct perf_hpp_fmt The level field is to distinguish levels in the hierarchy mode. Currently each column (perf_hpp_fmt) has a different level. Signed-off-by: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Wang Nan Link: http://lkml.kernel.org/r/1457103582-28396-2-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index da5e505..f4ef513 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -233,6 +233,7 @@ struct perf_hpp_fmt { int len; int user_len; int idx; + int level; }; struct perf_hpp_list { diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 4380a28..ab6eb7c 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1544,7 +1544,7 @@ static void hse_free(struct perf_hpp_fmt *fmt) } static struct hpp_sort_entry * -__sort_dimension__alloc_hpp(struct sort_dimension *sd) +__sort_dimension__alloc_hpp(struct sort_dimension *sd, int level) { struct hpp_sort_entry *hse; @@ -1572,6 +1572,7 @@ __sort_dimension__alloc_hpp(struct sort_dimension *sd) hse->hpp.elide = false; hse->hpp.len = 0; hse->hpp.user_len = 0; + hse->hpp.level = level; return hse; } @@ -1581,7 +1582,8 @@ static void hpp_free(struct perf_hpp_fmt *fmt) free(fmt); } -static struct perf_hpp_fmt *__hpp_dimension__alloc_hpp(struct hpp_dimension *hd) +static struct perf_hpp_fmt *__hpp_dimension__alloc_hpp(struct hpp_dimension *hd, + int level) { struct perf_hpp_fmt *fmt; @@ -1590,6 +1592,7 @@ static struct perf_hpp_fmt *__hpp_dimension__alloc_hpp(struct hpp_dimension *hd) INIT_LIST_HEAD(&fmt->list); INIT_LIST_HEAD(&fmt->sort_list); fmt->free = hpp_free; + fmt->level = level; } return fmt; @@ -1611,9 +1614,9 @@ int hist_entry__filter(struct hist_entry *he, int type, const void *arg) return hse->se->se_filter(he, type, arg); } -static int __sort_dimension__add_hpp_sort(struct sort_dimension *sd) +static int __sort_dimension__add_hpp_sort(struct sort_dimension *sd, int level) { - struct hpp_sort_entry *hse = 
__sort_dimension__alloc_hpp(sd); + struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd, level); if (hse == NULL) return -1; @@ -1625,7 +1628,7 @@ static int __sort_dimension__add_hpp_sort(struct sort_dimension *sd) static int __sort_dimension__add_hpp_output(struct perf_hpp_list *list, struct sort_dimension *sd) { - struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd); + struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd, 0); if (hse == NULL) return -1; @@ -1868,7 +1871,8 @@ static void hde_free(struct perf_hpp_fmt *fmt) } static struct hpp_dynamic_entry * -__alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field) +__alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field, + int level) { struct hpp_dynamic_entry *hde; @@ -1899,6 +1903,7 @@ __alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field) hde->hpp.elide = false; hde->hpp.len = 0; hde->hpp.user_len = 0; + hde->hpp.level = level; return hde; } @@ -1974,11 +1979,11 @@ static struct perf_evsel *find_evsel(struct perf_evlist *evlist, char *event_nam static int __dynamic_dimension__add(struct perf_evsel *evsel, struct format_field *field, - bool raw_trace) + bool raw_trace, int level) { struct hpp_dynamic_entry *hde; - hde = __alloc_dynamic_entry(evsel, field); + hde = __alloc_dynamic_entry(evsel, field, level); if (hde == NULL) return -ENOMEM; @@ -1988,14 +1993,14 @@ static int __dynamic_dimension__add(struct perf_evsel *evsel, return 0; } -static int add_evsel_fields(struct perf_evsel *evsel, bool raw_trace) +static int add_evsel_fields(struct perf_evsel *evsel, bool raw_trace, int level) { int ret; struct format_field *field; field = evsel->tp_format->format.fields; while (field) { - ret = __dynamic_dimension__add(evsel, field, raw_trace); + ret = __dynamic_dimension__add(evsel, field, raw_trace, level); if (ret < 0) return ret; @@ -2004,7 +2009,8 @@ static int add_evsel_fields(struct perf_evsel *evsel, bool raw_trace) return 0; } -static int add_all_dynamic_fields(struct perf_evlist *evlist, bool raw_trace) +static int add_all_dynamic_fields(struct perf_evlist *evlist, bool raw_trace, + int level) { int ret; struct perf_evsel *evsel; @@ -2013,7 +2019,7 @@ static int add_all_dynamic_fields(struct perf_evlist *evlist, bool raw_trace) if (evsel->attr.type != PERF_TYPE_TRACEPOINT) continue; - ret = add_evsel_fields(evsel, raw_trace); + ret = add_evsel_fields(evsel, raw_trace, level); if (ret < 0) return ret; } @@ -2021,7 +2027,7 @@ static int add_all_dynamic_fields(struct perf_evlist *evlist, bool raw_trace) } static int add_all_matching_fields(struct perf_evlist *evlist, - char *field_name, bool raw_trace) + char *field_name, bool raw_trace, int level) { int ret = -ESRCH; struct perf_evsel *evsel; @@ -2035,14 +2041,15 @@ static int add_all_matching_fields(struct perf_evlist *evlist, if (field == NULL) continue; - ret = __dynamic_dimension__add(evsel, field, raw_trace); + ret = __dynamic_dimension__add(evsel, field, raw_trace, level); if (ret < 0) break; } return ret; } -static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok) +static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok, + int level) { char *str, *event_name, *field_name, *opt_name; struct perf_evsel *evsel; @@ -2072,12 +2079,12 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok) } if (!strcmp(field_name, "trace_fields")) { - ret = add_all_dynamic_fields(evlist, raw_trace); + ret = add_all_dynamic_fields(evlist, raw_trace, level); goto out; 
} if (event_name == NULL) { - ret = add_all_matching_fields(evlist, field_name, raw_trace); + ret = add_all_matching_fields(evlist, field_name, raw_trace, level); goto out; } @@ -2095,7 +2102,7 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok) } if (!strcmp(field_name, "*")) { - ret = add_evsel_fields(evsel, raw_trace); + ret = add_evsel_fields(evsel, raw_trace, level); } else { field = pevent_find_any_field(evsel->tp_format, field_name); if (field == NULL) { @@ -2104,7 +2111,7 @@ static int add_dynamic_entry(struct perf_evlist *evlist, const char *tok) return -ENOENT; } - ret = __dynamic_dimension__add(evsel, field, raw_trace); + ret = __dynamic_dimension__add(evsel, field, raw_trace, level); } out: @@ -2112,12 +2119,12 @@ out: return ret; } -static int __sort_dimension__add(struct sort_dimension *sd) +static int __sort_dimension__add(struct sort_dimension *sd, int level) { if (sd->taken) return 0; - if (__sort_dimension__add_hpp_sort(sd) < 0) + if (__sort_dimension__add_hpp_sort(sd, level) < 0) return -1; if (sd->entry->se_collapse) @@ -2128,14 +2135,14 @@ static int __sort_dimension__add(struct sort_dimension *sd) return 0; } -static int __hpp_dimension__add(struct hpp_dimension *hd) +static int __hpp_dimension__add(struct hpp_dimension *hd, int level) { struct perf_hpp_fmt *fmt; if (hd->taken) return 0; - fmt = __hpp_dimension__alloc_hpp(hd); + fmt = __hpp_dimension__alloc_hpp(hd, level); if (!fmt) return -1; @@ -2165,7 +2172,7 @@ static int __hpp_dimension__add_output(struct perf_hpp_list *list, if (hd->taken) return 0; - fmt = __hpp_dimension__alloc_hpp(hd); + fmt = __hpp_dimension__alloc_hpp(hd, 0); if (!fmt) return -1; @@ -2180,8 +2187,8 @@ int hpp_dimension__add_output(unsigned col) return __hpp_dimension__add_output(&perf_hpp_list, &hpp_sort_dimensions[col]); } -static int sort_dimension__add(const char *tok, - struct perf_evlist *evlist __maybe_unused) +static int sort_dimension__add(const char *tok, struct perf_evlist *evlist, + int level) { unsigned int i; @@ -2220,7 +2227,7 @@ static int sort_dimension__add(const char *tok, sort__has_thread = 1; } - return __sort_dimension__add(sd); + return __sort_dimension__add(sd, level); } for (i = 0; i < ARRAY_SIZE(hpp_sort_dimensions); i++) { @@ -2229,7 +2236,7 @@ static int sort_dimension__add(const char *tok, if (strncasecmp(tok, hd->name, strlen(tok))) continue; - return __hpp_dimension__add(hd); + return __hpp_dimension__add(hd, level); } for (i = 0; i < ARRAY_SIZE(bstack_sort_dimensions); i++) { @@ -2244,7 +2251,7 @@ static int sort_dimension__add(const char *tok, if (sd->entry == &sort_sym_from || sd->entry == &sort_sym_to) sort__has_sym = 1; - __sort_dimension__add(sd); + __sort_dimension__add(sd, level); return 0; } @@ -2260,11 +2267,11 @@ static int sort_dimension__add(const char *tok, if (sd->entry == &sort_mem_daddr_sym) sort__has_sym = 1; - __sort_dimension__add(sd); + __sort_dimension__add(sd, level); return 0; } - if (!add_dynamic_entry(evlist, tok)) + if (!add_dynamic_entry(evlist, tok, level)) return 0; return -ESRCH; @@ -2274,10 +2281,11 @@ static int setup_sort_list(char *str, struct perf_evlist *evlist) { char *tmp, *tok; int ret = 0; + int level = 0; for (tok = strtok_r(str, ", ", &tmp); tok; tok = strtok_r(NULL, ", ", &tmp)) { - ret = sort_dimension__add(tok, evlist); + ret = sort_dimension__add(tok, evlist, level++); if (ret == -EINVAL) { error("Invalid --sort key: `%s'", tok); break; @@ -2667,7 +2675,7 @@ int setup_sorting(struct perf_evlist *evlist) return err; if (parent_pattern != 
default_parent_pattern) { - err = sort_dimension__add("parent", evlist); + err = sort_dimension__add("parent", evlist, -1); if (err < 0) return err; } -- cgit v0.10.2 From f594bae08183fb6b57db55387794ece3e1edf6f6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Mar 2016 16:44:44 -0300 Subject: perf stat: Document --detailed option I'm surprised this remained undocumented since at least 2011. And it is actually a very useful switch, as Steve and I came to realize recently. Add the text from 2cba3ffb9a9d ("perf stat: Add -d -d and -d -d -d options to show more CPU events") which added the incrementing aspect to -d. Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Borislav Petkov Signed-off-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: David Ahern Cc: Davidlohr Bueso Cc: Jiri Olsa Cc: Mel Gorman Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Fixes: 2cba3ffb9a9d ("perf stat: Add -d -d and -d -d -d options to show more CPU events") Link: http://lkml.kernel.org/r/1457347294-32546-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 52ef7a9..14d9e8f 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -69,6 +69,14 @@ report:: --scale:: scale/normalize counter values +-d:: +--detailed:: + print more detailed statistics, can be specified up to 3 times + + -d: detailed events, L1 and LLC data cache + -d -d: more detailed events, dTLB and iTLB events + -d -d -d: very detailed events, adding prefetch events + -r:: --repeat=:: repeat command and print average + stddev (max: 100). 0 means forever. -- cgit v0.10.2 From c3bc0c436899d01c3a09fddb308d487cc032fbd2 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 7 Mar 2016 16:44:45 -0300 Subject: perf hists: Introduce perf_hpp__setup_hists_formats() The perf_hpp__setup_hists_formats() is to build hists-specific output formats (and sort keys). Currently it's only used in order to build the output format in a hierarchy with same sort keys, but it could be used with different sort keys in non-hierarchy mode later. 
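A sketch of the per-hists layout that perf_hpp__setup_hists_formats() builds (identifiers taken from the hunks below; loop body elided): each hierarchy level ends up as one perf_hpp_list_node on the hists, carrying duplicated formats for just that level.

	struct perf_hpp_list_node *node;
	struct perf_hpp_fmt *fmt;

	list_for_each_entry(node, &hists->hpp_formats, list) {
		/* every fmt copied into this node shares node->level */
		perf_hpp_list__for_each_format(&node->hpp, fmt) {
			/* ... render one column of this hierarchy level ... */
		}
	}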
Signed-off-by: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Wang Nan Link: http://lkml.kernel.org/r/1457361308-514-2-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 7c0585c..3a15e84 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -5,6 +5,7 @@ #include "../util/util.h" #include "../util/sort.h" #include "../util/evsel.h" +#include "../util/evlist.h" /* hist period print (hpp) functions */ @@ -715,3 +716,65 @@ void perf_hpp__set_user_width(const char *width_list_str) break; } } + +static int add_hierarchy_fmt(struct hists *hists, struct perf_hpp_fmt *fmt) +{ + struct perf_hpp_list_node *node = NULL; + struct perf_hpp_fmt *fmt_copy; + bool found = false; + + list_for_each_entry(node, &hists->hpp_formats, list) { + if (node->level == fmt->level) { + found = true; + break; + } + } + + if (!found) { + node = malloc(sizeof(*node)); + if (node == NULL) + return -1; + + node->level = fmt->level; + perf_hpp_list__init(&node->hpp); + + list_add_tail(&node->list, &hists->hpp_formats); + } + + fmt_copy = perf_hpp_fmt__dup(fmt); + if (fmt_copy == NULL) + return -1; + + list_add_tail(&fmt_copy->list, &node->hpp.fields); + list_add_tail(&fmt_copy->sort_list, &node->hpp.sorts); + + return 0; +} + +int perf_hpp__setup_hists_formats(struct perf_hpp_list *list, + struct perf_evlist *evlist) +{ + struct perf_evsel *evsel; + struct perf_hpp_fmt *fmt; + struct hists *hists; + int ret; + + if (!symbol_conf.report_hierarchy) + return 0; + + evlist__for_each(evlist, evsel) { + hists = evsel__hists(evsel); + + perf_hpp_list__for_each_sort_list(list, fmt) { + if (perf_hpp__is_dynamic_entry(fmt) && + !perf_hpp__defined_dynamic_entry(fmt, hists)) + continue; + + ret = add_hierarchy_fmt(hists, fmt); + if (ret < 0) + return ret; + } + } + + return 0; +} diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 4b8b67b..fea92fc 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -2105,6 +2105,7 @@ int __hists__init(struct hists *hists, struct perf_hpp_list *hpp_list) pthread_mutex_init(&hists->lock, NULL); hists->socket_filter = -1; hists->hpp_list = hpp_list; + INIT_LIST_HEAD(&hists->hpp_formats); return 0; } @@ -2133,8 +2134,19 @@ static void hists__delete_all_entries(struct hists *hists) static void hists_evsel__exit(struct perf_evsel *evsel) { struct hists *hists = evsel__hists(evsel); + struct perf_hpp_fmt *fmt, *pos; + struct perf_hpp_list_node *node, *tmp; hists__delete_all_entries(hists); + + list_for_each_entry_safe(node, tmp, &hists->hpp_formats, list) { + perf_hpp_list__for_each_format_safe(&node->hpp, fmt, pos) { + list_del(&fmt->list); + free(fmt); + } + list_del(&node->list); + free(node); + } } static int hists_evsel__init(struct perf_evsel *evsel) diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index f4ef513..3cab9dc 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -78,6 +78,7 @@ struct hists { u16 col_len[HISTC_NR_COLS]; int socket_filter; struct perf_hpp_list *hpp_list; + struct list_head hpp_formats; int nr_sort_keys; }; @@ -244,6 +245,12 @@ struct perf_hpp_list { extern struct perf_hpp_list perf_hpp_list; +struct perf_hpp_list_node { + struct list_head list; + struct perf_hpp_list hpp; + int level; +}; + void perf_hpp_list__column_register(struct perf_hpp_list *list, struct perf_hpp_fmt *format); void 
perf_hpp_list__register_sort_field(struct perf_hpp_list *list, @@ -299,6 +306,8 @@ void perf_hpp__cancel_cumulate(void); void perf_hpp__setup_output_field(struct perf_hpp_list *list); void perf_hpp__reset_output_field(struct perf_hpp_list *list); void perf_hpp__append_sort_keys(struct perf_hpp_list *list); +int perf_hpp__setup_hists_formats(struct perf_hpp_list *list, + struct perf_evlist *evlist); bool perf_hpp__is_sort_entry(struct perf_hpp_fmt *format); @@ -308,6 +317,8 @@ bool perf_hpp__is_trace_entry(struct perf_hpp_fmt *fmt); bool perf_hpp__is_srcline_entry(struct perf_hpp_fmt *fmt); bool perf_hpp__is_srcfile_entry(struct perf_hpp_fmt *fmt); +struct perf_hpp_fmt *perf_hpp_fmt__dup(struct perf_hpp_fmt *fmt); + int hist_entry__filter(struct hist_entry *he, int type, const void *arg); static inline bool perf_hpp__should_skip(struct perf_hpp_fmt *format, diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index ab6eb7c..71d45d1 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1908,6 +1908,34 @@ __alloc_dynamic_entry(struct perf_evsel *evsel, struct format_field *field, return hde; } +struct perf_hpp_fmt *perf_hpp_fmt__dup(struct perf_hpp_fmt *fmt) +{ + struct perf_hpp_fmt *new_fmt = NULL; + + if (perf_hpp__is_sort_entry(fmt)) { + struct hpp_sort_entry *hse, *new_hse; + + hse = container_of(fmt, struct hpp_sort_entry, hpp); + new_hse = memdup(hse, sizeof(*hse)); + if (new_hse) + new_fmt = &new_hse->hpp; + } else if (perf_hpp__is_dynamic_entry(fmt)) { + struct hpp_dynamic_entry *hde, *new_hde; + + hde = container_of(fmt, struct hpp_dynamic_entry, hpp); + new_hde = memdup(hde, sizeof(*hde)); + if (new_hde) + new_fmt = &new_hde->hpp; + } else { + new_fmt = memdup(fmt, sizeof(*fmt)); + } + + INIT_LIST_HEAD(&new_fmt->list); + INIT_LIST_HEAD(&new_fmt->sort_list); + + return new_fmt; +} + static int parse_field_name(char *str, char **event, char **field, char **opt) { char *event_name, *field_name, *opt_name; @@ -2700,6 +2728,10 @@ int setup_sorting(struct perf_evlist *evlist) /* and then copy output fields to sort keys */ perf_hpp__append_sort_keys(&perf_hpp_list); + /* setup hists-specific output fields */ + if (perf_hpp__setup_hists_formats(&perf_hpp_list, evlist) < 0) + return -1; + return 0; } -- cgit v0.10.2 From 1b2dbbf41a0f4cf7a5662bccb9a18128d16e5ffb Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 7 Mar 2016 16:44:46 -0300 Subject: perf hists: Use own hpp_list for hierarchy mode Now each hists has its own hpp lists in hierarchy. So instead of having a pointer to a single perf_hpp_fmt in a hist entry, make it point the hpp_list for its level. This will be used to support multiple sort keys in a single hierarchy level. 
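Concretely, the renderers in the hunks that follow all converge on the same pattern (sketch assembled from those hunks): instead of invoking the single entry->fmt callback, they walk every format attached to the entry's level.

	struct perf_hpp_fmt *fmt;

	perf_hpp_list__for_each_format(he->hpp_list, fmt) {
		if (fmt->color)
			fmt->color(fmt, hpp, he);
		else
			fmt->entry(fmt, hpp, he);
	}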
Signed-off-by: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Wang Nan Link: http://lkml.kernel.org/r/1457361308-514-3-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 5ffffcb..928b482 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1388,25 +1388,26 @@ static int hist_browser__show_hierarchy_entry(struct hist_browser *browser, HE_COLORSET_NORMAL); } - ui_browser__write_nstring(&browser->b, "", 2); - width -= 2; + perf_hpp_list__for_each_format(entry->hpp_list, fmt) { + ui_browser__write_nstring(&browser->b, "", 2); + width -= 2; - /* - * No need to call hist_entry__snprintf_alignment() - * since this fmt is always the last column in the - * hierarchy mode. - */ - fmt = entry->fmt; - if (fmt->color) { - width -= fmt->color(fmt, &hpp, entry); - } else { - int i = 0; + /* + * No need to call hist_entry__snprintf_alignment() + * since this fmt is always the last column in the + * hierarchy mode. + */ + if (fmt->color) { + width -= fmt->color(fmt, &hpp, entry); + } else { + int i = 0; - width -= fmt->entry(fmt, &hpp, entry); - ui_browser__printf(&browser->b, "%s", ltrim(s)); + width -= fmt->entry(fmt, &hpp, entry); + ui_browser__printf(&browser->b, "%s", ltrim(s)); - while (isspace(s[i++])) - width++; + while (isspace(s[i++])) + width++; + } } } @@ -1934,7 +1935,7 @@ static int hist_browser__fprintf_hierarchy_entry(struct hist_browser *browser, struct perf_hpp_fmt *fmt; bool first = true; int ret; - int hierarchy_indent = (nr_sort_keys + 1) * HIERARCHY_INDENT; + int hierarchy_indent = nr_sort_keys * HIERARCHY_INDENT; printed = fprintf(fp, "%*s", level * HIERARCHY_INDENT, ""); @@ -1962,9 +1963,13 @@ static int hist_browser__fprintf_hierarchy_entry(struct hist_browser *browser, ret = scnprintf(hpp.buf, hpp.size, "%*s", hierarchy_indent, ""); advance_hpp(&hpp, ret); - fmt = he->fmt; - ret = fmt->entry(fmt, &hpp, he); - advance_hpp(&hpp, ret); + perf_hpp_list__for_each_format(he->hpp_list, fmt) { + ret = scnprintf(hpp.buf, hpp.size, " "); + advance_hpp(&hpp, ret); + + ret = fmt->entry(fmt, &hpp, he); + advance_hpp(&hpp, ret); + } printed += fprintf(fp, "%s\n", rtrim(s)); diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c index a5758fd..4534e2d 100644 --- a/tools/perf/ui/gtk/hists.c +++ b/tools/perf/ui/gtk/hists.c @@ -412,6 +412,7 @@ static void perf_gtk__add_hierarchy_entries(struct hists *hists, for (node = rb_first(root); node; node = rb_next(node)) { GtkTreeIter iter; float percent; + char *bf; he = rb_entry(node, struct hist_entry, rb_node); if (he->filtered) @@ -437,13 +438,20 @@ static void perf_gtk__add_hierarchy_entries(struct hists *hists, gtk_tree_store_set(store, &iter, col_idx++, hpp->buf, -1); } - fmt = he->fmt; - if (fmt->color) - fmt->color(fmt, hpp, he); - else - fmt->entry(fmt, hpp, he); + bf = hpp->buf; + perf_hpp_list__for_each_format(he->hpp_list, fmt) { + int ret; + + if (fmt->color) + ret = fmt->color(fmt, hpp, he); + else + ret = fmt->entry(fmt, hpp, he); + + snprintf(hpp->buf + ret, hpp->size - ret, " "); + advance_hpp(hpp, ret + 2); + } - gtk_tree_store_set(store, &iter, col_idx, rtrim(hpp->buf), -1); + gtk_tree_store_set(store, &iter, col_idx, rtrim(bf), -1); if (!he->leaf) { perf_gtk__add_hierarchy_entries(hists, &he->hroot_out, diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 
3a15e84..95795ef 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -722,6 +722,7 @@ static int add_hierarchy_fmt(struct hists *hists, struct perf_hpp_fmt *fmt) struct perf_hpp_list_node *node = NULL; struct perf_hpp_fmt *fmt_copy; bool found = false; + bool skip = perf_hpp__should_skip(fmt, hists); list_for_each_entry(node, &hists->hpp_formats, list) { if (node->level == fmt->level) { @@ -735,6 +736,7 @@ static int add_hierarchy_fmt(struct hists *hists, struct perf_hpp_fmt *fmt) if (node == NULL) return -1; + node->skip = skip; node->level = fmt->level; perf_hpp_list__init(&node->hpp); @@ -745,6 +747,9 @@ static int add_hierarchy_fmt(struct hists *hists, struct perf_hpp_fmt *fmt) if (fmt_copy == NULL) return -1; + if (!skip) + node->skip = false; + list_add_tail(&fmt_copy->list, &node->hpp.fields); list_add_tail(&fmt_copy->sort_list, &node->hpp.sorts); diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 6d06fbb..073642a 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -451,33 +451,33 @@ static int hist_entry__hierarchy_fprintf(struct hist_entry *he, advance_hpp(hpp, ret); } - if (sep) - ret = scnprintf(hpp->buf, hpp->size, "%s", sep); - else + if (!sep) ret = scnprintf(hpp->buf, hpp->size, "%*s", - (nr_sort_key - 1) * HIERARCHY_INDENT + 2, ""); + (nr_sort_key - 1) * HIERARCHY_INDENT, ""); advance_hpp(hpp, ret); printed += fprintf(fp, "%s", buf); - hpp->buf = buf; - hpp->size = size; - - /* - * No need to call hist_entry__snprintf_alignment() since this - * fmt is always the last column in the hierarchy mode. - */ - fmt = he->fmt; - if (perf_hpp__use_color() && fmt->color) - fmt->color(fmt, hpp, he); - else - fmt->entry(fmt, hpp, he); - - /* - * dynamic entries are right-aligned but we want left-aligned - * in the hierarchy mode - */ - printed += fprintf(fp, "%s\n", ltrim(buf)); + perf_hpp_list__for_each_format(he->hpp_list, fmt) { + hpp->buf = buf; + hpp->size = size; + + /* + * No need to call hist_entry__snprintf_alignment() since this + * fmt is always the last column in the hierarchy mode. 
+ */ + if (perf_hpp__use_color() && fmt->color) + fmt->color(fmt, hpp, he); + else + fmt->entry(fmt, hpp, he); + + /* + * dynamic entries are right-aligned but we want left-aligned + * in the hierarchy mode + */ + printed += fprintf(fp, "%s%s", sep ?: " ", ltrim(buf)); + } + printed += putc('\n', fp); if (symbol_conf.use_callchain && he->leaf) { u64 total = hists__total_period(hists); diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index fea92fc..29da9e0 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1091,18 +1091,25 @@ static void hists__apply_filters(struct hists *hists, struct hist_entry *he); static struct hist_entry *hierarchy_insert_entry(struct hists *hists, struct rb_root *root, struct hist_entry *he, - struct perf_hpp_fmt *fmt) + struct perf_hpp_list *hpp_list) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct hist_entry *iter, *new; + struct perf_hpp_fmt *fmt; int64_t cmp; while (*p != NULL) { parent = *p; iter = rb_entry(parent, struct hist_entry, rb_node_in); - cmp = fmt->collapse(fmt, iter, he); + cmp = 0; + perf_hpp_list__for_each_sort_list(hpp_list, fmt) { + cmp = fmt->collapse(fmt, iter, he); + if (cmp) + break; + } + if (!cmp) { he_stat__add_stat(&iter->stat, &he->stat); return iter; @@ -1121,24 +1128,26 @@ static struct hist_entry *hierarchy_insert_entry(struct hists *hists, hists__apply_filters(hists, new); hists->nr_entries++; - /* save related format for output */ - new->fmt = fmt; + /* save related format list for output */ + new->hpp_list = hpp_list; /* some fields are now passed to 'new' */ - if (perf_hpp__is_trace_entry(fmt) || perf_hpp__is_dynamic_entry(fmt)) - he->trace_output = NULL; - else - new->trace_output = NULL; + perf_hpp_list__for_each_sort_list(hpp_list, fmt) { + if (perf_hpp__is_trace_entry(fmt) || perf_hpp__is_dynamic_entry(fmt)) + he->trace_output = NULL; + else + new->trace_output = NULL; - if (perf_hpp__is_srcline_entry(fmt)) - he->srcline = NULL; - else - new->srcline = NULL; + if (perf_hpp__is_srcline_entry(fmt)) + he->srcline = NULL; + else + new->srcline = NULL; - if (perf_hpp__is_srcfile_entry(fmt)) - he->srcfile = NULL; - else - new->srcfile = NULL; + if (perf_hpp__is_srcfile_entry(fmt)) + he->srcfile = NULL; + else + new->srcfile = NULL; + } rb_link_node(&new->rb_node_in, parent, p); rb_insert_color(&new->rb_node_in, root); @@ -1149,21 +1158,19 @@ static int hists__hierarchy_insert_entry(struct hists *hists, struct rb_root *root, struct hist_entry *he) { - struct perf_hpp_fmt *fmt; + struct perf_hpp_list_node *node; struct hist_entry *new_he = NULL; struct hist_entry *parent = NULL; int depth = 0; int ret = 0; - hists__for_each_sort_list(hists, fmt) { - if (!perf_hpp__is_sort_entry(fmt) && - !perf_hpp__is_dynamic_entry(fmt)) - continue; - if (perf_hpp__should_skip(fmt, hists)) + list_for_each_entry(node, &hists->hpp_formats, list) { + /* skip period (overhead) and elided columns */ + if (node->level == 0 || node->skip) continue; /* insert copy of 'he' for each fmt into the hierarchy */ - new_he = hierarchy_insert_entry(hists, root, he, fmt); + new_he = hierarchy_insert_entry(hists, root, he, &node->hpp); if (new_he == NULL) { ret = -1; break; @@ -1358,6 +1365,7 @@ static void hierarchy_insert_output_entry(struct rb_root *root, struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct hist_entry *iter; + struct perf_hpp_fmt *fmt; while (*p != NULL) { parent = *p; @@ -1373,8 +1381,10 @@ static void hierarchy_insert_output_entry(struct rb_root *root, 
rb_insert_color(&he->rb_node, root); /* update column width of dynamic entry */ - if (perf_hpp__is_dynamic_entry(he->fmt)) - he->fmt->sort(he->fmt, he, NULL); + perf_hpp_list__for_each_sort_list(he->hpp_list, fmt) { + if (perf_hpp__is_dynamic_entry(fmt)) + fmt->sort(fmt, he, NULL); + } } static void hists__hierarchy_output_resort(struct hists *hists, diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 3cab9dc..2209188 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -249,6 +249,7 @@ struct perf_hpp_list_node { struct list_head list; struct perf_hpp_list hpp; int level; + bool skip; }; void perf_hpp_list__column_register(struct perf_hpp_list *list, diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 25a5529..ea1f722 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -130,6 +130,7 @@ struct hist_entry { u32 raw_size; void *trace_output; struct perf_hpp_fmt *fmt; + struct perf_hpp_list *hpp_list; struct hist_entry *parent_he; union { /* this is for hierarchical entry structure */ -- cgit v0.10.2 From a23f37e864609f0887c1cb77c4d5b62586484a61 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 7 Mar 2016 16:44:47 -0300 Subject: perf hists: Support multiple sort keys in a hierarchy level This implements having multiple sort keys in a single hierarchy level. Originally only single sort key is supported for each level, but now using the group syntax with '{ }', it can set more than one sort key in one level. Note that now it needs to quote in order to prevent shell interpretation. For example: $ perf report --hierarchy -s '{comm,dso},sym' ... # Overhead Command / Shared Object / Symbol # .............. .......................................... # 48.67% swapper [kernel.vmlinux] 34.42% [k] intel_idle 1.30% [k] __tick_nohz_idle_enter 1.03% [k] cpuidle_reflect 8.87% firefox libpthread-2.22.so 6.60% [.] __GI___libc_recvmsg 1.18% [.] pthread_cond_signal@@GLIBC_2.3.2 1.09% [.] 0x000000000000ff4b 6.11% Xorg libc-2.22.so 5.27% [.] __memcpy_sse2_unaligned In the above example, the command name and the shared object name are shown on the same line but the symbol name is on the different line. Since the first two are grouped by '{}', they are in the same level. 
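To make the grouping concrete, here is how the brace-aware tokenizer added below assigns levels for the example spec (a worked sketch of the parser state; only the relative levels matter):

	input: {comm,dso},sym

	token   inside '{ }' ?   level
	comm    yes              N      \  same hierarchy level
	dso     yes              N      /
	sym     no               N + 1     next hierarchy level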
Suggested-and-Tested=by: Arnaldo Carvalho de Melo Signed-off-by: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Wang Nan Link: http://lkml.kernel.org/r/1457361308-514-4-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 71d45d1..041f236 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2310,18 +2310,40 @@ static int setup_sort_list(char *str, struct perf_evlist *evlist) char *tmp, *tok; int ret = 0; int level = 0; + int next_level = 1; + bool in_group = false; + + do { + tok = str; + tmp = strpbrk(str, "{}, "); + if (tmp) { + if (in_group) + next_level = level; + else + next_level = level + 1; + + if (*tmp == '{') + in_group = true; + else if (*tmp == '}') + in_group = false; + + *tmp = '\0'; + str = tmp + 1; + } - for (tok = strtok_r(str, ", ", &tmp); - tok; tok = strtok_r(NULL, ", ", &tmp)) { - ret = sort_dimension__add(tok, evlist, level++); - if (ret == -EINVAL) { - error("Invalid --sort key: `%s'", tok); - break; - } else if (ret == -ESRCH) { - error("Unknown --sort key: `%s'", tok); - break; + if (*tok) { + ret = sort_dimension__add(tok, evlist, level); + if (ret == -EINVAL) { + error("Invalid --sort key: `%s'", tok); + break; + } else if (ret == -ESRCH) { + error("Unknown --sort key: `%s'", tok); + break; + } } - } + + level = next_level; + } while (tmp); return ret; } -- cgit v0.10.2 From 2dbbe9f26c082be5aa0e8ba5480e7bac43b2c4f0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 7 Mar 2016 16:44:48 -0300 Subject: perf hists: Fix indent for multiple hierarchy sort key When multiple sort keys are used in a single hierarchy, it should indent using number of hierarchy levels instead of number of sort keys. 
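For example, the spec used in the next patch has four sort keys but only two hierarchy levels, so the level count is the unit the indent must follow:

	sort spec: '{prev_pid,prev_comm},{next_pid,next_comm}'
	sort keys:        4
	hierarchy levels: 2   <- indent nested rows by this, not by the key count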
Signed-off-by: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Wang Nan Link: http://lkml.kernel.org/r/1457361308-514-5-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 928b482..2f02ce7 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1280,7 +1280,7 @@ static int hist_browser__show_entry(struct hist_browser *browser, static int hist_browser__show_hierarchy_entry(struct hist_browser *browser, struct hist_entry *entry, unsigned short row, - int level, int nr_sort_keys) + int level) { int printed = 0; int width = browser->b.width; @@ -1294,7 +1294,7 @@ static int hist_browser__show_hierarchy_entry(struct hist_browser *browser, .current_entry = current_entry, }; int column = 0; - int hierarchy_indent = (nr_sort_keys - 1) * HIERARCHY_INDENT; + int hierarchy_indent = (entry->hists->nr_hpp_node - 2) * HIERARCHY_INDENT; if (current_entry) { browser->he_selection = entry; @@ -1436,8 +1436,7 @@ show_callchain: } static int hist_browser__show_no_entry(struct hist_browser *browser, - unsigned short row, - int level, int nr_sort_keys) + unsigned short row, int level) { int width = browser->b.width; bool current_entry = ui_browser__is_current_entry(&browser->b, row); @@ -1445,6 +1444,7 @@ static int hist_browser__show_no_entry(struct hist_browser *browser, int column = 0; int ret; struct perf_hpp_fmt *fmt; + int indent = browser->hists->nr_hpp_node - 2; if (current_entry) { browser->he_selection = NULL; @@ -1485,8 +1485,8 @@ static int hist_browser__show_no_entry(struct hist_browser *browser, width -= ret; } - ui_browser__write_nstring(&browser->b, "", nr_sort_keys * HIERARCHY_INDENT); - width -= nr_sort_keys * HIERARCHY_INDENT; + ui_browser__write_nstring(&browser->b, "", indent * HIERARCHY_INDENT); + width -= indent * HIERARCHY_INDENT; if (column >= browser->b.horiz_scroll) { char buf[32]; @@ -1553,7 +1553,7 @@ static int hists_browser__scnprintf_hierarchy_headers(struct hist_browser *brows struct perf_hpp_fmt *fmt; size_t ret = 0; int column = 0; - int nr_sort_keys = hists->nr_sort_keys; + int indent = hists->nr_hpp_node - 2; bool first = true; ret = scnprintf(buf, size, " "); @@ -1577,7 +1577,7 @@ static int hists_browser__scnprintf_hierarchy_headers(struct hist_browser *brows } ret = scnprintf(dummy_hpp.buf, dummy_hpp.size, "%*s", - (nr_sort_keys - 1) * HIERARCHY_INDENT, ""); + indent * HIERARCHY_INDENT, ""); if (advance_hpp_check(&dummy_hpp, ret)) return ret; @@ -1645,7 +1645,6 @@ static unsigned int hist_browser__refresh(struct ui_browser *browser) u16 header_offset = 0; struct rb_node *nd; struct hist_browser *hb = container_of(browser, struct hist_browser, b); - int nr_sort = hb->hists->nr_sort_keys; if (hb->show_headers) { hist_browser__show_headers(hb); @@ -1672,14 +1671,12 @@ static unsigned int hist_browser__refresh(struct ui_browser *browser) if (symbol_conf.report_hierarchy) { row += hist_browser__show_hierarchy_entry(hb, h, row, - h->depth, - nr_sort); + h->depth); if (row == browser->rows) break; if (h->has_no_entry) { - hist_browser__show_no_entry(hb, row, h->depth, - nr_sort); + hist_browser__show_no_entry(hb, row, h->depth); row++; } } else { diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index 95795ef..f03c4f7 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -740,6 +740,7 @@ static int 
add_hierarchy_fmt(struct hists *hists, struct perf_hpp_fmt *fmt) node->level = fmt->level; perf_hpp_list__init(&node->hpp); + hists->nr_hpp_node++; list_add_tail(&node->list, &hists->hpp_formats); } diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 073642a..543d713 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -412,7 +412,7 @@ static int hist_entry__snprintf(struct hist_entry *he, struct perf_hpp *hpp) static int hist_entry__hierarchy_fprintf(struct hist_entry *he, struct perf_hpp *hpp, - int nr_sort_key, struct hists *hists, + struct hists *hists, FILE *fp) { const char *sep = symbol_conf.field_sep; @@ -453,7 +453,7 @@ static int hist_entry__hierarchy_fprintf(struct hist_entry *he, if (!sep) ret = scnprintf(hpp->buf, hpp->size, "%*s", - (nr_sort_key - 1) * HIERARCHY_INDENT, ""); + (hists->nr_hpp_node - 2) * HIERARCHY_INDENT, ""); advance_hpp(hpp, ret); printed += fprintf(fp, "%s", buf); @@ -504,12 +504,8 @@ static int hist_entry__fprintf(struct hist_entry *he, size_t size, if (size == 0 || size > bfsz) size = hpp.size = bfsz; - if (symbol_conf.report_hierarchy) { - int nr_sort = hists->nr_sort_keys; - - return hist_entry__hierarchy_fprintf(he, &hpp, nr_sort, - hists, fp); - } + if (symbol_conf.report_hierarchy) + return hist_entry__hierarchy_fprintf(he, &hpp, hists, fp); hist_entry__snprintf(he, &hpp); @@ -521,29 +517,29 @@ static int hist_entry__fprintf(struct hist_entry *he, size_t size, return ret; } -static int print_hierarchy_indent(const char *sep, int nr_sort, +static int print_hierarchy_indent(const char *sep, int indent, const char *line, FILE *fp) { - if (sep != NULL || nr_sort < 1) + if (sep != NULL || indent < 2) return 0; - return fprintf(fp, "%-.*s", (nr_sort - 1) * HIERARCHY_INDENT, line); + return fprintf(fp, "%-.*s", (indent - 2) * HIERARCHY_INDENT, line); } static int print_hierarchy_header(struct hists *hists, struct perf_hpp *hpp, const char *sep, FILE *fp) { bool first = true; - int nr_sort; + int indent; int depth; unsigned width = 0; unsigned header_width = 0; struct perf_hpp_fmt *fmt; - nr_sort = hists->nr_sort_keys; + indent = hists->nr_hpp_node; /* preserve max indent depth for column headers */ - print_hierarchy_indent(sep, nr_sort, spaces, fp); + print_hierarchy_indent(sep, indent, spaces, fp); hists__for_each_format(hists, fmt) { if (perf_hpp__is_sort_entry(fmt) || perf_hpp__is_dynamic_entry(fmt)) @@ -582,7 +578,7 @@ static int print_hierarchy_header(struct hists *hists, struct perf_hpp *hpp, fprintf(fp, "\n# "); /* preserve max indent depth for initial dots */ - print_hierarchy_indent(sep, nr_sort, dots, fp); + print_hierarchy_indent(sep, indent, dots, fp); first = true; hists__for_each_format(hists, fmt) { diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 2209188..2cb017f 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -80,6 +80,7 @@ struct hists { struct perf_hpp_list *hpp_list; struct list_head hpp_formats; int nr_sort_keys; + int nr_hpp_node; }; struct hist_entry_iter; -- cgit v0.10.2 From f58c95e344c26223c6503e6ecb0c1e11806d91e0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 7 Mar 2016 16:44:49 -0300 Subject: perf report: Use hierarchy hpp list on stdio Now hpp formats are linked using perf_hpp_list_node when hierarchy is enabled. Use this info to print entries with multiple sort keys in a single hierarchy properly. For example, the below example shows using 4 sort keys with 2 levels. 
$ perf report --hierarchy -s '{prev_pid,prev_comm},{next_pid,next_comm}' \ --percent-limit 1 -i perf.data.sched ... # Overhead prev_pid+prev_comm / next_pid+next_comm # ........... ....................................... # 22.36% 0 swapper/0 9.48% 17773 transmission-gt 5.25% 109 kworker/0:1H 1.53% 6524 Xephyr 21.39% 17773 transmission-gt 9.52% 0 swapper/0 9.04% 0 swapper/2 1.78% 0 swapper/3 Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Wang Nan Link: http://lkml.kernel.org/r/1457361308-514-6-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 543d713..7aff5ac 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -417,6 +417,7 @@ static int hist_entry__hierarchy_fprintf(struct hist_entry *he, { const char *sep = symbol_conf.field_sep; struct perf_hpp_fmt *fmt; + struct perf_hpp_list_node *fmt_node; char *buf = hpp->buf; size_t size = hpp->size; int ret, printed = 0; @@ -428,10 +429,10 @@ static int hist_entry__hierarchy_fprintf(struct hist_entry *he, ret = scnprintf(hpp->buf, hpp->size, "%*s", he->depth * HIERARCHY_INDENT, ""); advance_hpp(hpp, ret); - hists__for_each_format(he->hists, fmt) { - if (perf_hpp__is_sort_entry(fmt) || perf_hpp__is_dynamic_entry(fmt)) - break; - + /* the first hpp_list_node is for overhead columns */ + fmt_node = list_first_entry(&hists->hpp_formats, + struct perf_hpp_list_node, list); + perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) { /* * If there's no field_sep, we still need * to display initial ' '. @@ -529,50 +530,49 @@ static int print_hierarchy_indent(const char *sep, int indent, static int print_hierarchy_header(struct hists *hists, struct perf_hpp *hpp, const char *sep, FILE *fp) { - bool first = true; + bool first_node, first_col; int indent; int depth; unsigned width = 0; unsigned header_width = 0; struct perf_hpp_fmt *fmt; + struct perf_hpp_list_node *fmt_node; indent = hists->nr_hpp_node; /* preserve max indent depth for column headers */ print_hierarchy_indent(sep, indent, spaces, fp); - hists__for_each_format(hists, fmt) { - if (perf_hpp__is_sort_entry(fmt) || perf_hpp__is_dynamic_entry(fmt)) - break; - - if (!first) - fprintf(fp, "%s", sep ?: " "); - else - first = false; + /* the first hpp_list_node is for overhead columns */ + fmt_node = list_first_entry(&hists->hpp_formats, + struct perf_hpp_list_node, list); + perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) { fmt->header(fmt, hpp, hists_to_evsel(hists)); - fprintf(fp, "%s", hpp->buf); + fprintf(fp, "%s%s", hpp->buf, sep ?: " "); } /* combine sort headers with ' / ' */ - first = true; - hists__for_each_format(hists, fmt) { - if (!perf_hpp__is_sort_entry(fmt) && !perf_hpp__is_dynamic_entry(fmt)) - continue; - if (perf_hpp__should_skip(fmt, hists)) - continue; - - if (!first) + first_node = true; + list_for_each_entry_continue(fmt_node, &hists->hpp_formats, list) { + if (!first_node) header_width += fprintf(fp, " / "); - else { - fprintf(fp, "%s", sep ?: " "); - first = false; - } + first_node = false; - fmt->header(fmt, hpp, hists_to_evsel(hists)); - rtrim(hpp->buf); + first_col = true; + perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) { + if (perf_hpp__should_skip(fmt, hists)) + continue; + + if (!first_col) + header_width += fprintf(fp, "+"); + first_col = false; + + fmt->header(fmt, hpp, 
hists_to_evsel(hists)); + rtrim(hpp->buf); - header_width += fprintf(fp, "%s", ltrim(hpp->buf)); + header_width += fprintf(fp, "%s", ltrim(hpp->buf)); + } } fprintf(fp, "\n# "); @@ -580,29 +580,35 @@ static int print_hierarchy_header(struct hists *hists, struct perf_hpp *hpp, /* preserve max indent depth for initial dots */ print_hierarchy_indent(sep, indent, dots, fp); - first = true; - hists__for_each_format(hists, fmt) { - if (perf_hpp__is_sort_entry(fmt) || perf_hpp__is_dynamic_entry(fmt)) - break; + /* the first hpp_list_node is for overhead columns */ + fmt_node = list_first_entry(&hists->hpp_formats, + struct perf_hpp_list_node, list); - if (!first) - fprintf(fp, "%s", sep ?: " "); - else - first = false; + first_col = true; + perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) { + if (!first_col) + fprintf(fp, "%s", sep ?: ".."); + first_col = false; width = fmt->width(fmt, hpp, hists_to_evsel(hists)); fprintf(fp, "%.*s", width, dots); } depth = 0; - hists__for_each_format(hists, fmt) { - if (!perf_hpp__is_sort_entry(fmt) && !perf_hpp__is_dynamic_entry(fmt)) - continue; - if (perf_hpp__should_skip(fmt, hists)) - continue; + list_for_each_entry_continue(fmt_node, &hists->hpp_formats, list) { + first_col = true; + width = depth * HIERARCHY_INDENT; - width = fmt->width(fmt, hpp, hists_to_evsel(hists)); - width += depth * HIERARCHY_INDENT; + perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) { + if (perf_hpp__should_skip(fmt, hists)) + continue; + + if (!first_col) + width++; /* for '+' sign between column header */ + first_col = false; + + width += fmt->width(fmt, hpp, hists_to_evsel(hists)); + } if (width > header_width) header_width = width; @@ -621,6 +627,7 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, int max_cols, float min_pcnt, FILE *fp) { struct perf_hpp_fmt *fmt; + struct perf_hpp_list_node *fmt_node; struct rb_node *nd; size_t ret = 0; unsigned int width; @@ -650,6 +657,10 @@ size_t hists__fprintf(struct hists *hists, bool show_header, int max_rows, fprintf(fp, "# "); if (symbol_conf.report_hierarchy) { + list_for_each_entry(fmt_node, &hists->hpp_formats, list) { + perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) + perf_hpp__reset_width(fmt, hists); + } nr_rows += print_hierarchy_header(hists, &dummy_hpp, sep, fp); goto print_entries; } @@ -734,9 +745,9 @@ print_entries: * display "no entry >= x.xx%" message. */ if (!h->leaf && !hist_entry__has_hierarchy_children(h, min_pcnt)) { - int nr_sort = hists->nr_sort_keys; + int depth = hists->nr_hpp_node + h->depth + 1; - print_hierarchy_indent(sep, nr_sort + h->depth + 1, spaces, fp); + print_hierarchy_indent(sep, depth, spaces, fp); fprintf(fp, "%*sno entry >= %.2f%%\n", indent, "", min_pcnt); if (max_rows && ++nr_rows >= max_rows) -- cgit v0.10.2 From a61a22f6845f9e86e0ca60d1d256a35ca12312ef Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 7 Mar 2016 16:44:50 -0300 Subject: perf hists browser: Use hierarchy hpp list Now hpp formats are linked using perf_hpp_list_node when hierarchy is enabled. Like in stdio, use this info to print entries with multiple sort keys in a single hierarchy properly. 
Tested-by: Arnaldo Carvalho de Melo Signed-off-by: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Wang Nan Link: http://lkml.kernel.org/r/1457361308-514-7-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 2f02ce7..e0e217e 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1289,6 +1289,7 @@ static int hist_browser__show_hierarchy_entry(struct hist_browser *browser, off_t row_offset = entry->row_offset; bool first = true; struct perf_hpp_fmt *fmt; + struct perf_hpp_list_node *fmt_node; struct hpp_arg arg = { .b = &browser->b, .current_entry = current_entry, @@ -1320,7 +1321,10 @@ static int hist_browser__show_hierarchy_entry(struct hist_browser *browser, ui_browser__write_nstring(&browser->b, "", level * HIERARCHY_INDENT); width -= level * HIERARCHY_INDENT; - hists__for_each_format(entry->hists, fmt) { + /* the first hpp_list_node is for overhead columns */ + fmt_node = list_first_entry(&entry->hists->hpp_formats, + struct perf_hpp_list_node, list); + perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) { char s[2048]; struct perf_hpp hpp = { .buf = s, @@ -1332,10 +1336,6 @@ static int hist_browser__show_hierarchy_entry(struct hist_browser *browser, column++ < browser->b.horiz_scroll) continue; - if (perf_hpp__is_sort_entry(fmt) || - perf_hpp__is_dynamic_entry(fmt)) - break; - if (current_entry && browser->b.navkeypressed) { ui_browser__set_color(&browser->b, HE_COLORSET_SELECTED); @@ -1444,6 +1444,7 @@ static int hist_browser__show_no_entry(struct hist_browser *browser, int column = 0; int ret; struct perf_hpp_fmt *fmt; + struct perf_hpp_list_node *fmt_node; int indent = browser->hists->nr_hpp_node - 2; if (current_entry) { @@ -1461,15 +1462,14 @@ static int hist_browser__show_no_entry(struct hist_browser *browser, ui_browser__write_nstring(&browser->b, "", level * HIERARCHY_INDENT); width -= level * HIERARCHY_INDENT; - hists__for_each_format(browser->hists, fmt) { + /* the first hpp_list_node is for overhead columns */ + fmt_node = list_first_entry(&browser->hists->hpp_formats, + struct perf_hpp_list_node, list); + perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) { if (perf_hpp__should_skip(fmt, browser->hists) || column++ < browser->b.horiz_scroll) continue; - if (perf_hpp__is_sort_entry(fmt) || - perf_hpp__is_dynamic_entry(fmt)) - break; - ret = fmt->width(fmt, NULL, hists_to_evsel(browser->hists)); if (first) { @@ -1551,22 +1551,23 @@ static int hists_browser__scnprintf_hierarchy_headers(struct hist_browser *brows .size = size, }; struct perf_hpp_fmt *fmt; + struct perf_hpp_list_node *fmt_node; size_t ret = 0; int column = 0; int indent = hists->nr_hpp_node - 2; - bool first = true; + bool first_node, first_col; ret = scnprintf(buf, size, " "); if (advance_hpp_check(&dummy_hpp, ret)) return ret; - hists__for_each_format(hists, fmt) { + /* the first hpp_list_node is for overhead columns */ + fmt_node = list_first_entry(&hists->hpp_formats, + struct perf_hpp_list_node, list); + perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) { if (column++ < browser->b.horiz_scroll) continue; - if (perf_hpp__is_sort_entry(fmt) || perf_hpp__is_dynamic_entry(fmt)) - break; - ret = fmt->header(fmt, &dummy_hpp, hists_to_evsel(hists)); if (advance_hpp_check(&dummy_hpp, ret)) break; @@ -1581,34 +1582,42 @@ static int 
hists_browser__scnprintf_hierarchy_headers(struct hist_browser *brows if (advance_hpp_check(&dummy_hpp, ret)) return ret; - hists__for_each_format(hists, fmt) { - char *start; - - if (!perf_hpp__is_sort_entry(fmt) && !perf_hpp__is_dynamic_entry(fmt)) - continue; - if (perf_hpp__should_skip(fmt, hists)) - continue; - - if (first) { - first = false; - } else { + first_node = true; + list_for_each_entry_continue(fmt_node, &hists->hpp_formats, list) { + if (!first_node) { ret = scnprintf(dummy_hpp.buf, dummy_hpp.size, " / "); if (advance_hpp_check(&dummy_hpp, ret)) break; } + first_node = false; - ret = fmt->header(fmt, &dummy_hpp, hists_to_evsel(hists)); - dummy_hpp.buf[ret] = '\0'; - rtrim(dummy_hpp.buf); + first_col = true; + perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) { + char *start; - start = ltrim(dummy_hpp.buf); - ret = strlen(start); + if (perf_hpp__should_skip(fmt, hists)) + continue; - if (start != dummy_hpp.buf) - memmove(dummy_hpp.buf, start, ret + 1); + if (!first_col) { + ret = scnprintf(dummy_hpp.buf, dummy_hpp.size, "+"); + if (advance_hpp_check(&dummy_hpp, ret)) + break; + } + first_col = false; - if (advance_hpp_check(&dummy_hpp, ret)) - break; + ret = fmt->header(fmt, &dummy_hpp, hists_to_evsel(hists)); + dummy_hpp.buf[ret] = '\0'; + rtrim(dummy_hpp.buf); + + start = ltrim(dummy_hpp.buf); + ret = strlen(start); + + if (start != dummy_hpp.buf) + memmove(dummy_hpp.buf, start, ret + 1); + + if (advance_hpp_check(&dummy_hpp, ret)) + break; + } } return ret; @@ -1676,7 +1685,7 @@ static unsigned int hist_browser__refresh(struct ui_browser *browser) break; if (h->has_no_entry) { - hist_browser__show_no_entry(hb, row, h->depth); + hist_browser__show_no_entry(hb, row, h->depth + 1); row++; } } else { -- cgit v0.10.2 From 58ecd33be90647724a78ce5e0b42f5bc482771fd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 7 Mar 2016 16:44:51 -0300 Subject: perf report: Use hierarchy hpp list on gtk Now hpp formats are linked using perf_hpp_list_node when hierarchy is enabled. Like in stdio, use this info to print entries with multiple sort keys in a single hierarchy properly. 
Signed-off-by: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: David Ahern Cc: Jiri Olsa Cc: Jiri Olsa Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Wang Nan Link: http://lkml.kernel.org/r/1457361308-514-8-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c index 4534e2d..bd9bf7e 100644 --- a/tools/perf/ui/gtk/hists.c +++ b/tools/perf/ui/gtk/hists.c @@ -407,7 +407,9 @@ static void perf_gtk__add_hierarchy_entries(struct hists *hists, struct rb_node *node; struct hist_entry *he; struct perf_hpp_fmt *fmt; + struct perf_hpp_list_node *fmt_node; u64 total = hists__total_period(hists); + int size; for (node = rb_first(root); node; node = rb_next(node)) { GtkTreeIter iter; @@ -425,11 +427,11 @@ static void perf_gtk__add_hierarchy_entries(struct hists *hists, gtk_tree_store_append(store, &iter, parent); col_idx = 0; - hists__for_each_format(hists, fmt) { - if (perf_hpp__is_sort_entry(fmt) || - perf_hpp__is_dynamic_entry(fmt)) - break; + /* the first hpp_list_node is for overhead columns */ + fmt_node = list_first_entry(&hists->hpp_formats, + struct perf_hpp_list_node, list); + perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) { if (fmt->color) fmt->color(fmt, hpp, he); else @@ -439,6 +441,7 @@ static void perf_gtk__add_hierarchy_entries(struct hists *hists, } bf = hpp->buf; + size = hpp->size; perf_hpp_list__for_each_format(he->hpp_list, fmt) { int ret; @@ -451,9 +454,12 @@ static void perf_gtk__add_hierarchy_entries(struct hists *hists, advance_hpp(hpp, ret + 2); } - gtk_tree_store_set(store, &iter, col_idx, rtrim(bf), -1); + gtk_tree_store_set(store, &iter, col_idx, ltrim(rtrim(bf)), -1); if (!he->leaf) { + hpp->buf = bf; + hpp->size = size; + perf_gtk__add_hierarchy_entries(hists, &he->hroot_out, store, &iter, hpp, min_pcnt); @@ -486,6 +492,7 @@ static void perf_gtk__show_hierarchy(GtkWidget *window, struct hists *hists, float min_pcnt) { struct perf_hpp_fmt *fmt; + struct perf_hpp_list_node *fmt_node; GType col_types[MAX_COLUMNS]; GtkCellRenderer *renderer; GtkTreeStore *store; @@ -494,7 +501,7 @@ static void perf_gtk__show_hierarchy(GtkWidget *window, struct hists *hists, int nr_cols = 0; char s[512]; char buf[512]; - bool first = true; + bool first_node, first_col; struct perf_hpp hpp = { .buf = s, .size = sizeof(s), @@ -514,11 +521,11 @@ static void perf_gtk__show_hierarchy(GtkWidget *window, struct hists *hists, renderer = gtk_cell_renderer_text_new(); col_idx = 0; - hists__for_each_format(hists, fmt) { - if (perf_hpp__is_sort_entry(fmt) || - perf_hpp__is_dynamic_entry(fmt)) - break; + /* the first hpp_list_node is for overhead columns */ + fmt_node = list_first_entry(&hists->hpp_formats, + struct perf_hpp_list_node, list); + perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) { gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view), -1, fmt->name, renderer, "markup", @@ -527,20 +534,24 @@ static void perf_gtk__show_hierarchy(GtkWidget *window, struct hists *hists, /* construct merged column header since sort keys share single column */ buf[0] = '\0'; - hists__for_each_format(hists ,fmt) { - if (!perf_hpp__is_sort_entry(fmt) && - !perf_hpp__is_dynamic_entry(fmt)) - continue; - if (perf_hpp__should_skip(fmt, hists)) - continue; - - if (first) - first = false; - else + first_node = true; + list_for_each_entry_continue(fmt_node, &hists->hpp_formats, list) { + if (!first_node) strcat(buf, " / "); + first_node = false; - fmt->header(fmt, &hpp, 
hists_to_evsel(hists)); - strcat(buf, rtrim(hpp.buf)); + first_col = true; + perf_hpp_list__for_each_format(&fmt_node->hpp ,fmt) { + if (perf_hpp__should_skip(fmt, hists)) + continue; + + if (!first_col) + strcat(buf, "+"); + first_col = false; + + fmt->header(fmt, &hpp, hists_to_evsel(hists)); + strcat(buf, ltrim(rtrim(hpp.buf))); + } } gtk_tree_view_insert_column_with_attributes(GTK_TREE_VIEW(view), -- cgit v0.10.2 From 7a76aa95f6f6682db5629449d763251d1c9f8c4e Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 29 Feb 2016 15:59:18 +0100 Subject: s390/cpumf: Fix lpp detection we have to check bit 40 of the facility list before issuing LPP and not bit 48. Otherwise a guest running on a system with "The decimal-floating-point zoned-conversion facility" and without the "The set-program-parameters facility" might crash on an lpp instruction. Signed-off-by: Christian Borntraeger Cc: stable@vger.kernel.org # v4.4+ Fixes: e22cf8ca6f75 ("s390/cpumf: rework program parameter setting to detect guest samples") Signed-off-by: Martin Schwidefsky diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index c5febe8..03c2b46 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -16,7 +16,7 @@ __HEAD ENTRY(startup_continue) - tm __LC_STFLE_FAC_LIST+6,0x80 # LPP available ? + tm __LC_STFLE_FAC_LIST+5,0x80 # LPP available ? jz 0f xc __LC_LPP+1(7,0),__LC_LPP+1 # clear lpp and current_pid mvi __LC_LPP,0x80 # and set LPP_MAGIC -- cgit v0.10.2 From 6436257b491cc0d456c39330dfc22126148d5ed7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 8 Mar 2016 11:09:53 +0100 Subject: time/timekeeping: Work around false positive GCC warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Newer GCC versions trigger the following warning: kernel/time/timekeeping.c: In function ‘get_device_system_crosststamp’: kernel/time/timekeeping.c:987:5: warning: ‘clock_was_set_seq’ may be used uninitialized in this function [-Wmaybe-uninitialized] if (discontinuity) { ^ kernel/time/timekeeping.c:1045:15: note: ‘clock_was_set_seq’ was declared here unsigned int clock_was_set_seq; ^ GCC clearly is unable to recognize that the 'do_interp' boolean tracks the initialization status of 'clock_was_set_seq'. The GCC version used was: gcc version 5.3.1 20151207 (Red Hat 5.3.1-2) (GCC) Work it around by initializing clock_was_set_seq to 0. Compilers that are able to recognize the code flow will eliminate the unnecessary initialization. Acked-by: Thomas Gleixner Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 931b0b1..9c629bb 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1042,7 +1042,7 @@ int get_device_system_crosststamp(int (*get_time_fn) struct system_counterval_t system_counterval; struct timekeeper *tk = &tk_core.timekeeper; cycle_t cycles, now, interval_start; - unsigned int clock_was_set_seq; + unsigned int clock_was_set_seq = 0; ktime_t base_real, base_raw; s64 nsec_real, nsec_raw; u8 cs_was_changed_seq; -- cgit v0.10.2 From adc53f2e0ae2fcff10a4b981df14729ffb1482fc Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 7 Mar 2016 14:02:17 +0100 Subject: x86/mce: Move MCx_CONFIG MSR definitions Those MSRs are used only by the MCE code so move them there. 
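For reference, the macro being moved encodes a fixed stride of 0x10 MSRs per Scalable MCA bank, so bank x's CONFIG register sits at 0xc0002004 + 0x10 * x. A stand-alone sketch of that arithmetic (illustration only, not kernel code):

#include <stdio.h>

/* Same definitions as in the hunk below, expanded here for a few banks. */
#define MSR_AMD64_SMCA_MC0_CONFIG	0xc0002004U
#define MSR_AMD64_SMCA_MCx_CONFIG(x)	(MSR_AMD64_SMCA_MC0_CONFIG + 0x10U * (x))

int main(void)
{
	unsigned int bank;

	/* Each bank's SMCA register block is 0x10 MSR numbers apart. */
	for (bank = 0; bank < 4; bank++)
		printf("bank %u: MCx_CONFIG = 0x%x\n",
		       bank, MSR_AMD64_SMCA_MCx_CONFIG(bank));
	return 0;
}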
Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1456785179-14378-2-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 2ea4527..80ba0a8 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -91,6 +91,10 @@ #define MCE_LOG_LEN 32 #define MCE_LOG_SIGNATURE "MACHINECHECK" +/* AMD Scalable MCA */ +#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 +#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) + /* * This structure contains all data related to the MCE log. Also * carries a signature to make it easier to find from external diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 5523465..b05402e 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -264,10 +264,6 @@ #define MSR_IA32_MC0_CTL2 0x00000280 #define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) -/* 'SMCA': AMD64 Scalable MCA */ -#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 -#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) - #define MSR_P6_PERFCTR0 0x000000c1 #define MSR_P6_PERFCTR1 0x000000c2 #define MSR_P6_EVNTSEL0 0x00000186 -- cgit v0.10.2 From be0aec23bf4624fd55650629fe8df20483487049 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 7 Mar 2016 14:02:18 +0100 Subject: x86/mce/AMD, EDAC: Enable error decoding of Scalable MCA errors For Scalable MCA enabled processors, errors are listed per IP block. And since it is not required for an IP to map to a particular bank, we need to use HWID and McaType values from the MCx_IPID register to figure out which IP a given bank represents. We also have a new bit (TCC) in the MCx_STATUS register to indicate Task context is corrupt. Add logic here to decode errors from all known IP blocks for Fam17h Model 00-0fh and to print TCC errors. [ Minor fixups. ] Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1457021458-2522-3-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 80ba0a8..9c467fe 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -42,6 +42,18 @@ /* AMD-specific bits */ #define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */ #define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ +#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ + +/* + * McaX field if set indicates a given bank supports MCA extensions: + * - Deferred error interrupt type is specifiable by bank. + * - MCx_MISC0[BlkPtr] field indicates presence of extended MISC registers, + * But should not be used to determine MSR numbers. + * - TCC bit is present in MCx_STATUS. 
+ */ +#define MCI_CONFIG_MCAX 0x1 +#define MCI_IPID_MCATYPE 0xFFFF0000 +#define MCI_IPID_HWID 0xFFF /* * Note that the full MCACOD field of IA32_MCi_STATUS MSR is @@ -93,7 +105,9 @@ /* AMD Scalable MCA */ #define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 +#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005 #define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x)) /* * This structure contains all data related to the MCE log. Also @@ -291,4 +305,49 @@ struct cper_sec_mem_err; extern void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err); +/* + * Enumerate new IP types and HWID values in AMD processors which support + * Scalable MCA. + */ +#ifdef CONFIG_X86_MCE_AMD +enum amd_ip_types { + SMCA_F17H_CORE = 0, /* Core errors */ + SMCA_DF, /* Data Fabric */ + SMCA_UMC, /* Unified Memory Controller */ + SMCA_PB, /* Parameter Block */ + SMCA_PSP, /* Platform Security Processor */ + SMCA_SMU, /* System Management Unit */ + N_AMD_IP_TYPES +}; + +struct amd_hwid { + const char *name; + unsigned int hwid; +}; + +extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES]; + +enum amd_core_mca_blocks { + SMCA_LS = 0, /* Load Store */ + SMCA_IF, /* Instruction Fetch */ + SMCA_L2_CACHE, /* L2 cache */ + SMCA_DE, /* Decoder unit */ + RES, /* Reserved */ + SMCA_EX, /* Execution unit */ + SMCA_FP, /* Floating Point */ + SMCA_L3_CACHE, /* L3 cache */ + N_CORE_MCA_BLOCKS +}; + +extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS]; + +enum amd_df_mca_blocks { + SMCA_CS = 0, /* Coherent Slave */ + SMCA_PIE, /* Power management, Interrupts, etc */ + N_DF_BLOCKS +}; + +extern const char * const amd_df_mcablock_names[N_DF_BLOCKS]; +#endif + #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 88de27b..ee487a9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -71,6 +71,35 @@ static const char * const th_names[] = { "execution_unit", }; +/* Define HWID to IP type mappings for Scalable MCA */ +struct amd_hwid amd_hwids[] = { + [SMCA_F17H_CORE] = { "f17h_core", 0xB0 }, + [SMCA_DF] = { "data_fabric", 0x2E }, + [SMCA_UMC] = { "umc", 0x96 }, + [SMCA_PB] = { "param_block", 0x5 }, + [SMCA_PSP] = { "psp", 0xFF }, + [SMCA_SMU] = { "smu", 0x1 }, +}; +EXPORT_SYMBOL_GPL(amd_hwids); + +const char * const amd_core_mcablock_names[] = { + [SMCA_LS] = "load_store", + [SMCA_IF] = "insn_fetch", + [SMCA_L2_CACHE] = "l2_cache", + [SMCA_DE] = "decode_unit", + [RES] = "", + [SMCA_EX] = "execution_unit", + [SMCA_FP] = "floating_point", + [SMCA_L3_CACHE] = "l3_cache", +}; +EXPORT_SYMBOL_GPL(amd_core_mcablock_names); + +const char * const amd_df_mcablock_names[] = { + [SMCA_CS] = "coherent_slave", + [SMCA_PIE] = "pie", +}; +EXPORT_SYMBOL_GPL(amd_df_mcablock_names); + static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index e3a945c..49768c0 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -147,6 +147,135 @@ static const char * const mc6_mce_desc[] = { "Status Register File", }; +/* Scalable MCA error strings */ +static const char * const f17h_ls_mce_desc[] = { + "Load queue parity", + "Store queue parity", + "Miss address buffer payload parity", + "L1 TLB parity", + "", /* reserved */ + "DC tag error type 6", + "DC tag error type 1", + "Internal error type 
1", + "Internal error type 2", + "Sys Read data error thread 0", + "Sys read data error thread 1", + "DC tag error type 2", + "DC data error type 1 (poison comsumption)", + "DC data error type 2", + "DC data error type 3", + "DC tag error type 4", + "L2 TLB parity", + "PDC parity error", + "DC tag error type 3", + "DC tag error type 5", + "L2 fill data error", +}; + +static const char * const f17h_if_mce_desc[] = { + "microtag probe port parity error", + "IC microtag or full tag multi-hit error", + "IC full tag parity", + "IC data array parity", + "Decoupling queue phys addr parity error", + "L0 ITLB parity error", + "L1 ITLB parity error", + "L2 ITLB parity error", + "BPQ snoop parity on Thread 0", + "BPQ snoop parity on Thread 1", + "L1 BTB multi-match error", + "L2 BTB multi-match error", +}; + +static const char * const f17h_l2_mce_desc[] = { + "L2M tag multi-way-hit error", + "L2M tag ECC error", + "L2M data ECC error", + "HW assert", +}; + +static const char * const f17h_de_mce_desc[] = { + "uop cache tag parity error", + "uop cache data parity error", + "Insn buffer parity error", + "Insn dispatch queue parity error", + "Fetch address FIFO parity", + "Patch RAM data parity", + "Patch RAM sequencer parity", + "uop buffer parity" +}; + +static const char * const f17h_ex_mce_desc[] = { + "Watchdog timeout error", + "Phy register file parity", + "Flag register file parity", + "Immediate displacement register file parity", + "Address generator payload parity", + "EX payload parity", + "Checkpoint queue parity", + "Retire dispatch queue parity", +}; + +static const char * const f17h_fp_mce_desc[] = { + "Physical register file parity", + "Freelist parity error", + "Schedule queue parity", + "NSQ parity error", + "Retire queue parity", + "Status register file parity", +}; + +static const char * const f17h_l3_mce_desc[] = { + "Shadow tag macro ECC error", + "Shadow tag macro multi-way-hit error", + "L3M tag ECC error", + "L3M tag multi-way-hit error", + "L3M data ECC error", + "XI parity, L3 fill done channel error", + "L3 victim queue parity", + "L3 HW assert", +}; + +static const char * const f17h_cs_mce_desc[] = { + "Illegal request from transport layer", + "Address violation", + "Security violation", + "Illegal response from transport layer", + "Unexpected response", + "Parity error on incoming request or probe response data", + "Parity error on incoming read response data", + "Atomic request parity", + "ECC error on probe filter access", +}; + +static const char * const f17h_pie_mce_desc[] = { + "HW assert", + "Internal PIE register security violation", + "Error on GMI link", + "Poison data written to internal PIE register", +}; + +static const char * const f17h_umc_mce_desc[] = { + "DRAM ECC error", + "Data poison error on DRAM", + "SDP parity error", + "Advanced peripheral bus error", + "Command/address parity error", + "Write data CRC error", +}; + +static const char * const f17h_pb_mce_desc[] = { + "Parameter Block RAM ECC error", +}; + +static const char * const f17h_psp_mce_desc[] = { + "PSP RAM ECC or parity error", +}; + +static const char * const f17h_smu_mce_desc[] = { + "SMU RAM ECC or parity error", +}; + static bool f12h_mc0_mce(u16 ec, u8 xec) { bool ret = false; @@ -691,6 +820,177 @@ static void decode_mc6_mce(struct mce *m) pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n"); } +static void decode_f17h_core_errors(const char *ip_name, u8 xec, + unsigned int mca_type) +{ + const char * const *error_desc_array; + size_t len; + + pr_emerg(HW_ERR "%s Error: ", ip_name); + + switch 
(mca_type) { + case SMCA_LS: + error_desc_array = f17h_ls_mce_desc; + len = ARRAY_SIZE(f17h_ls_mce_desc) - 1; + + if (xec == 0x4) { + pr_cont("Unrecognized LS MCA error code.\n"); + return; + } + break; + + case SMCA_IF: + error_desc_array = f17h_if_mce_desc; + len = ARRAY_SIZE(f17h_if_mce_desc) - 1; + break; + + case SMCA_L2_CACHE: + error_desc_array = f17h_l2_mce_desc; + len = ARRAY_SIZE(f17h_l2_mce_desc) - 1; + break; + + case SMCA_DE: + error_desc_array = f17h_de_mce_desc; + len = ARRAY_SIZE(f17h_de_mce_desc) - 1; + break; + + case SMCA_EX: + error_desc_array = f17h_ex_mce_desc; + len = ARRAY_SIZE(f17h_ex_mce_desc) - 1; + break; + + case SMCA_FP: + error_desc_array = f17h_fp_mce_desc; + len = ARRAY_SIZE(f17h_fp_mce_desc) - 1; + break; + + case SMCA_L3_CACHE: + error_desc_array = f17h_l3_mce_desc; + len = ARRAY_SIZE(f17h_l3_mce_desc) - 1; + break; + + default: + pr_cont("Corrupted MCA core error info.\n"); + return; + } + + if (xec > len) { + pr_cont("Unrecognized %s MCA bank error code.\n", + amd_core_mcablock_names[mca_type]); + return; + } + + pr_cont("%s.\n", error_desc_array[xec]); +} + +static void decode_df_errors(u8 xec, unsigned int mca_type) +{ + const char * const *error_desc_array; + size_t len; + + pr_emerg(HW_ERR "Data Fabric Error: "); + + switch (mca_type) { + case SMCA_CS: + error_desc_array = f17h_cs_mce_desc; + len = ARRAY_SIZE(f17h_cs_mce_desc) - 1; + break; + + case SMCA_PIE: + error_desc_array = f17h_pie_mce_desc; + len = ARRAY_SIZE(f17h_pie_mce_desc) - 1; + break; + + default: + pr_cont("Corrupted MCA Data Fabric info.\n"); + return; + } + + if (xec > len) { + pr_cont("Unrecognized %s MCA bank error code.\n", + amd_df_mcablock_names[mca_type]); + return; + } + + pr_cont("%s.\n", error_desc_array[xec]); +} + +/* Decode errors according to Scalable MCA specification */ +static void decode_smca_errors(struct mce *m) +{ + u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank); + unsigned int hwid, mca_type, i; + u8 xec = XEC(m->status, xec_mask); + const char * const *error_desc_array; + const char *ip_name; + u32 low, high; + size_t len; + + if (rdmsr_safe(addr, &low, &high)) { + pr_emerg("Invalid IP block specified, error information is unreliable.\n"); + return; + } + + hwid = high & MCI_IPID_HWID; + mca_type = (high & MCI_IPID_MCATYPE) >> 16; + + pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low); + + /* + * Based on hwid and mca_type values, decode errors from respective IPs. + * Note: mca_type values make sense only in the context of an hwid. + */ + for (i = 0; i < ARRAY_SIZE(amd_hwids); i++) + if (amd_hwids[i].hwid == hwid) + break; + + switch (i) { + case SMCA_F17H_CORE: + ip_name = (mca_type == SMCA_L3_CACHE) ? 
+ "L3 Cache" : "F17h Core"; + return decode_f17h_core_errors(ip_name, xec, mca_type); + break; + + case SMCA_DF: + return decode_df_errors(xec, mca_type); + break; + + case SMCA_UMC: + error_desc_array = f17h_umc_mce_desc; + len = ARRAY_SIZE(f17h_umc_mce_desc) - 1; + break; + + case SMCA_PB: + error_desc_array = f17h_pb_mce_desc; + len = ARRAY_SIZE(f17h_pb_mce_desc) - 1; + break; + + case SMCA_PSP: + error_desc_array = f17h_psp_mce_desc; + len = ARRAY_SIZE(f17h_psp_mce_desc) - 1; + break; + + case SMCA_SMU: + error_desc_array = f17h_smu_mce_desc; + len = ARRAY_SIZE(f17h_smu_mce_desc) - 1; + break; + + default: + pr_emerg(HW_ERR "HWID:%d does not match any existing IPs.\n", hwid); + return; + } + + ip_name = amd_hwids[i].name; + pr_emerg(HW_ERR "%s Error: ", ip_name); + + if (xec > len) { + pr_cont("Unrecognized %s MCA bank error code.\n", ip_name); + return; + } + + pr_cont("%s.\n", error_desc_array[xec]); +} + static inline void amd_decode_err_code(u16 ec) { if (INT_ERROR(ec)) { @@ -752,6 +1052,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) struct mce *m = (struct mce *)data; struct cpuinfo_x86 *c = &cpu_data(m->extcpu); int ecc; + u32 ebx = cpuid_ebx(0x80000007); if (amd_filter_mce(m)) return NOTIFY_STOP; @@ -769,11 +1070,20 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); - if (c->x86 == 0x15 || c->x86 == 0x16) + if (c->x86 >= 0x15) pr_cont("|%s|%s", ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"), ((m->status & MCI_STATUS_POISON) ? "Poison" : "-")); + if (!!(ebx & BIT(3))) { + u32 low, high; + u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); + + if (!rdmsr_safe(addr, &low, &high) && + (low & MCI_CONFIG_MCAX)) + pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); + } + /* do the two bits[14:13] together */ ecc = (m->status >> 45) & 0x3; if (ecc) @@ -784,6 +1094,11 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (m->status & MCI_STATUS_ADDRV) pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr); + if (!!(ebx & BIT(3))) { + decode_smca_errors(m); + goto err_code; + } + if (!fam_ops) goto err_code; @@ -834,6 +1149,7 @@ static struct notifier_block amd_mce_dec_nb = { static int __init mce_amd_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; + u32 ebx; if (c->x86_vendor != X86_VENDOR_AMD) return -ENODEV; @@ -888,10 +1204,18 @@ static int __init mce_amd_init(void) fam_ops->mc2_mce = f16h_mc2_mce; break; + case 0x17: + ebx = cpuid_ebx(0x80000007); + xec_mask = 0x3f; + if (!(ebx & BIT(3))) { + printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n"); + goto err_out; + } + break; + default: printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86); - kfree(fam_ops); - fam_ops = NULL; + goto err_out; } pr_info("MCE: In-kernel MCE decoding enabled.\n"); @@ -899,6 +1223,11 @@ static int __init mce_amd_init(void) mce_register_decode_chain(&amd_mce_dec_nb); return 0; + +err_out: + kfree(fam_ops); + fam_ops = NULL; + return -EINVAL; } early_initcall(mce_amd_init); -- cgit v0.10.2 From 8dd1e17a55b0bb1206c71c7a4344c5e3037cdf65 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 7 Mar 2016 14:02:19 +0100 Subject: x86/mce/AMD: Fix logic to obtain block address In upcoming processors, the BLKPTR field is no longer used to indicate the MSR number of the additional register. Insted, it simply indicates the prescence of additional MSRs. 
Fix the logic here to gather MSR address from MSR_AMD64_SMCA_MCx_MISC() for newer processors and fall back to existing logic for older processors. [ Drop nextaddr_out label; style cleanups. ] Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1457021458-2522-4-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 9c467fe..72f8688 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -104,10 +104,14 @@ #define MCE_LOG_SIGNATURE "MACHINECHECK" /* AMD Scalable MCA */ +#define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003 #define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 #define MSR_AMD64_SMCA_MC0_IPID 0xc0002005 +#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a +#define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x))) /* * This structure contains all data related to the MCE log. Also diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index ee487a9..a53eb1b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -304,6 +304,51 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } +static u32 get_block_address(u32 current_addr, u32 low, u32 high, + unsigned int bank, unsigned int block) +{ + u32 addr = 0, offset = 0; + + if (mce_flags.smca) { + if (!block) { + addr = MSR_AMD64_SMCA_MCx_MISC(bank); + } else { + /* + * For SMCA enabled processors, BLKPTR field of the + * first MISC register (MCx_MISC0) indicates presence of + * additional MISC register set (MISC1-4). 
+ */ + u32 low, high; + + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + return addr; + + if (!(low & MCI_CONFIG_MCAX)) + return addr; + + if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && + (low & MASK_BLKPTR_LO)) + addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + } + return addr; + } + + /* Fall back to method we used for older processors: */ + switch (block) { + case 0: + addr = MSR_IA32_MCx_MISC(bank); + break; + case 1: + offset = ((low & MASK_BLKPTR_LO) >> 21); + if (offset) + addr = MCG_XBLK_ADDR + offset; + break; + default: + addr = ++current_addr; + } + return addr; +} + static int prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, int offset, u32 misc_high) @@ -366,16 +411,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) for (bank = 0; bank < mca_cfg.banks; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) - address = MSR_IA32_MCx_MISC(bank); - else if (block == 1) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - break; - - address += MCG_XBLK_ADDR; - } else - ++address; + address = get_block_address(address, low, high, bank, block); + if (!address) + break; if (rdmsr_safe(address, &low, &high)) break; @@ -480,16 +518,9 @@ static void amd_threshold_interrupt(void) if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { - if (block == 0) { - address = MSR_IA32_MCx_MISC(bank); - } else if (block == 1) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - break; - address += MCG_XBLK_ADDR; - } else { - ++address; - } + address = get_block_address(address, low, high, bank, block); + if (!address) + break; if (rdmsr_safe(address, &low, &high)) break; @@ -709,16 +740,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, if (err) goto out_free; recurse: - if (!block) { - address = (low & MASK_BLKPTR_LO) >> 21; - if (!address) - return 0; - address += MCG_XBLK_ADDR; - } else { - ++address; - } + address = get_block_address(address, low, high, bank, ++block); + if (!address) + return 0; - err = allocate_threshold_blocks(cpu, bank, ++block, address); + err = allocate_threshold_blocks(cpu, bank, block, address); if (err) goto out_free; -- cgit v0.10.2 From 2cd3b5f9033f0b051842a279dac5a54271cbd3c8 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 7 Mar 2016 14:02:20 +0100 Subject: x86/mce: Clarify comments regarding deferred error Deferred errors indicate errors that hardware could not fix. But it still does not cause any interruption to program flow. So it does not generate any #MC and UC bit in MCx_STATUS is not set. Correct comment. 
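To make the status-bit discussion concrete, here is a small stand-alone decoder for the AMD-specific MCi_STATUS bits used in this series. It is illustrative only; the UC bit position (61) is the standard MCA definition, included here for contrast, and is not part of the hunk above.

#include <stdio.h>
#include <stdint.h>

#define MCI_STATUS_UC		(1ULL << 61)	/* uncorrected (assumed, standard MCA bit) */
#define MCI_STATUS_DEFERRED	(1ULL << 44)	/* uncorrected error, deferred exception */
#define MCI_STATUS_POISON	(1ULL << 43)	/* access poisonous data */
#define MCI_STATUS_TCC		(1ULL << 55)	/* task context corrupt */

static void describe(uint64_t status)
{
	printf("status=%#llx: %s%s%s%s\n", (unsigned long long)status,
	       (status & MCI_STATUS_UC)       ? "UC "       : "",
	       (status & MCI_STATUS_DEFERRED) ? "Deferred " : "",
	       (status & MCI_STATUS_POISON)   ? "Poison "   : "",
	       (status & MCI_STATUS_TCC)      ? "TCC "      : "");
}

int main(void)
{
	/* A deferred error: no #MC is raised, UC stays clear, data is poisoned. */
	describe(MCI_STATUS_DEFERRED | MCI_STATUS_POISON);
	return 0;
}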
Signed-off-by: Aravind Gopalakrishnan Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1457021458-2522-5-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 72f8688..cfff341 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -40,7 +40,7 @@ #define MCI_STATUS_AR (1ULL<<55) /* Action required */ /* AMD-specific bits */ -#define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */ +#define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */ #define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ #define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ -- cgit v0.10.2 From ea2ca36b658cfc6081ee454e97593c81f646806e Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 7 Mar 2016 14:02:21 +0100 Subject: x86/mce/AMD: Document some functionality In an attempt to aid in understanding of what the threshold_block structure holds, provide comments to describe the members here. Also, trim comments around threshold_restart_bank() and update copyright info. No functional change is introduced. Signed-off-by: Aravind Gopalakrishnan [ Shorten comments. ] Signed-off-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1457021458-2522-6-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 3c56ef1..5e828da 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -27,15 +27,23 @@ struct amd_l3_cache { }; struct threshold_block { - unsigned int block; - unsigned int bank; - unsigned int cpu; - u32 address; - u16 interrupt_enable; - bool interrupt_capable; - u16 threshold_limit; - struct kobject kobj; - struct list_head miscj; + unsigned int block; /* Number within bank */ + unsigned int bank; /* MCA bank the block belongs to */ + unsigned int cpu; /* CPU which controls MCA bank */ + u32 address; /* MSR address for the block */ + u16 interrupt_enable; /* Enable/Disable APIC interrupt */ + bool interrupt_capable; /* Bank can generate an interrupt. */ + + u16 threshold_limit; /* + * Value upon which threshold + * interrupt is generated. + */ + + struct kobject kobj; /* sysfs object */ + struct list_head miscj; /* + * List of threshold blocks + * within a bank. + */ }; struct threshold_bank { diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index a53eb1b..9d656fd 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,5 +1,5 @@ /* - * (c) 2005-2015 Advanced Micro Devices, Inc. + * (c) 2005-2016 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html @@ -201,10 +201,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) return 1; }; -/* - * Called via smp_call_function_single(), must be called with correct - * cpu affinity. - */ +/* Reprogram MCx_MISC MSR behind this threshold bank. 
*/ static void threshold_restart_bank(void *_tr) { struct thresh_restart *tr = _tr; -- cgit v0.10.2 From 927a5570855836e5d5859a80ce7e91e963545e8f Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 2 Mar 2016 13:24:14 +0200 Subject: perf/core: Fix perf_sched_count derailment The error path in perf_event_open() is such that asking for a sampling event on a PMU that doesn't generate interrupts will end up in dropping the perf_sched_count even though it hasn't been incremented for this event yet. Given a sufficient amount of these calls, we'll end up disabling scheduler's jump label even though we'd still have active events in the system, thereby facilitating the arrival of the infernal regions upon us. I'm fixing this by moving account_event() inside perf_event_alloc(). Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: vince@deater.net Link: http://lkml.kernel.org/r/1456917854-29427-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar diff --git a/kernel/events/core.c b/kernel/events/core.c index 5dcc0bd..b723149 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8000,6 +8000,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } } + /* symmetric to unaccount_event() in _free_event() */ + account_event(event); + return event; err_per_task: @@ -8363,8 +8366,6 @@ SYSCALL_DEFINE5(perf_event_open, } } - account_event(event); - /* * Special case software events and allow them to be part of * any hardware group. @@ -8661,8 +8662,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, /* Mark owner so we could distinguish it from user events. */ event->owner = TASK_TOMBSTONE; - account_event(event); - ctx = find_get_context(event->pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); -- cgit v0.10.2 From e72daf3f4d764c47fb71c9bdc7f9c54a503825b1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 1 Mar 2016 20:03:52 +0100 Subject: perf/x86/intel: Use PAGE_SIZE for PEBS buffer size on Core2 Using PAGE_SIZE buffers makes the WRMSR to PERF_GLOBAL_CTRL in intel_pmu_enable_all() mysteriously hang on Core2. As a workaround, we don't do this. The hard lockup is easily triggered by running 'perf test attr' repeatedly. Most of the time it gets stuck on sample session with small periods. # perf test attr -vv 14: struct perf_event_attr setup : --- start --- ... 
'PERF_TEST_ATTR=/tmp/tmpuEKz3B /usr/bin/perf record -o /tmp/tmpuEKz3B/perf.data -c 123 kill >/dev/null 2>&1' ret 1 Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andi Kleen Cc: Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Wang Nan Link: http://lkml.kernel.org/r/20160301190352.GA8355@krava.redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index c8a243d..22ece02 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -269,7 +269,7 @@ static int alloc_pebs_buffer(int cpu) if (!x86_pmu.pebs) return 0; - buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node); + buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -286,7 +286,7 @@ static int alloc_pebs_buffer(int cpu) per_cpu(insn_buffer, cpu) = ibuffer; } - max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; + max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size; ds->pebs_buffer_base = (u64)(unsigned long)buffer; ds->pebs_index = ds->pebs_buffer_base; @@ -1319,6 +1319,7 @@ void __init intel_ds_init(void) x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); + x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; if (x86_pmu.pebs) { char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; int format = x86_pmu.intel_cap.pebs_format; @@ -1327,6 +1328,14 @@ void __init intel_ds_init(void) case 0: pr_cont("PEBS fmt0%c, ", pebs_type); x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); + /* + * Using >PAGE_SIZE buffers makes the WRMSR to + * PERF_GLOBAL_CTRL in intel_pmu_enable_all() + * mysteriously hang on Core2. + * + * As a workaround, we don't do this. + */ + x86_pmu.pebs_buffer_size = PAGE_SIZE; x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; break; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 7bb61e3..1ab6279 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -586,6 +586,7 @@ struct x86_pmu { pebs_broken :1, pebs_prec_dist :1; int pebs_record_size; + int pebs_buffer_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); -- cgit v0.10.2 From c3d266c8a9838cc141b69548bc3b1b18808ae8c4 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 3 Mar 2016 18:07:28 -0500 Subject: perf/x86/intel: Fix PEBS warning by only restoring active PMU in pmi This patch tries to fix a PEBS warning found in my stress test. The following perf command can easily trigger the pebs warning or spurious NMI error on Skylake/Broadwell/Haswell platforms: sudo perf record -e 'cpu/umask=0x04,event=0xc4/pp,cycles,branches,ref-cycles,cache-misses,cache-references' --call-graph fp -b -c1000 -a Also the NMI watchdog must be enabled. For this case, the events number is larger than counter number. So perf has to do multiplexing. In perf_mux_hrtimer_handler, it does perf_pmu_disable(), schedule out old events, rotate_ctx, schedule in new events and finally perf_pmu_enable(). If the old events include precise event, the MSR_IA32_PEBS_ENABLE should be cleared when perf_pmu_disable(). The MSR_IA32_PEBS_ENABLE should keep 0 until the perf_pmu_enable() is called and the new event is precise event. However, there is a corner case which could restore PEBS_ENABLE to stale value during the above period. 
In perf_pmu_disable(), GLOBAL_CTRL will be set to 0 to stop overflow and followed PMI. But there may be pending PMI from an earlier overflow, which cannot be stopped. So even GLOBAL_CTRL is cleared, the kernel still be possible to get PMI. At the end of the PMI handler, __intel_pmu_enable_all() will be called, which will restore the stale values if old events haven't scheduled out. Once the stale pebs value is set, it's impossible to be corrected if the new events are non-precise. Because the pebs_enabled will be set to 0. x86_pmu.enable_all() will ignore the MSR_IA32_PEBS_ENABLE setting. As a result, the following NMI with stale PEBS_ENABLE trigger pebs warning. The pending PMI after enabled=0 will become harmless if the NMI handler does not change the state. This patch checks cpuc->enabled in pmi and only restore the state when PMU is active. Here is the dump: Call Trace: [] dump_stack+0x63/0x85 [] warn_slowpath_common+0x82/0xc0 [] warn_slowpath_null+0x1a/0x20 [] intel_pmu_drain_pebs_nhm+0x2be/0x320 [] intel_pmu_handle_irq+0x279/0x460 [] ? native_write_msr_safe+0x6/0x40 [] ? vunmap_page_range+0x20d/0x330 [] ? unmap_kernel_range_noflush+0x11/0x20 [] ? ghes_copy_tofrom_phys+0x10f/0x2a0 [] ? ghes_read_estatus+0x98/0x170 [] perf_event_nmi_handler+0x2d/0x50 [] nmi_handle+0x69/0x120 [] default_do_nmi+0xe6/0x100 [] do_nmi+0xe2/0x130 [] end_repeat_nmi+0x1a/0x1e [] ? native_write_msr_safe+0x6/0x40 [] ? native_write_msr_safe+0x6/0x40 [] ? native_write_msr_safe+0x6/0x40 <> [] ? x86_perf_event_set_period+0xd8/0x180 [] x86_pmu_start+0x4c/0x100 [] x86_pmu_enable+0x28d/0x300 [] perf_pmu_enable.part.81+0x7/0x10 [] perf_mux_hrtimer_handler+0x200/0x280 [] ? __perf_install_in_context+0xc0/0xc0 [] __hrtimer_run_queues+0xfd/0x280 [] hrtimer_interrupt+0xa8/0x190 [] ? __perf_read_group_add.part.61+0x1a0/0x1a0 [] local_apic_timer_interrupt+0x38/0x60 [] smp_apic_timer_interrupt+0x3d/0x50 [] apic_timer_interrupt+0x8c/0xa0 [] ? __perf_read_group_add.part.61+0x1a0/0x1a0 [] ? smp_call_function_single+0xd5/0x130 [] ? smp_call_function_single+0xcb/0x130 [] ? __perf_read_group_add.part.61+0x1a0/0x1a0 [] event_function_call+0x10a/0x120 [] ? ctx_resched+0x90/0x90 [] ? cpu_clock_event_read+0x30/0x30 [] ? _perf_event_disable+0x60/0x60 [] _perf_event_enable+0x5b/0x70 [] perf_event_for_each_child+0x38/0xa0 [] ? _perf_event_disable+0x60/0x60 [] perf_ioctl+0x12d/0x3c0 [] ? selinux_file_ioctl+0x95/0x1e0 [] do_vfs_ioctl+0xa1/0x5a0 [] ? sched_clock+0x9/0x10 [] SyS_ioctl+0x79/0x90 [] entry_SYSCALL_64_fastpath+0x1a/0xa4 ---[ end trace aef202839fe9a71d ]--- Uhhuh. NMI received for unknown reason 2d on CPU 2. Do you have a strange power saving mode enabled? Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1457046448-6184-1-git-send-email-kan.liang@intel.com [ Fixed various typos and other small details. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 7402c818..5e830d0 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -597,6 +597,19 @@ void x86_pmu_disable_all(void) } } +/* + * There may be PMI landing after enabled=0. The PMI hitting could be before or + * after disable_all. + * + * If PMI hits before disable_all, the PMU will be disabled in the NMI handler. + * It will not be re-enabled in the NMI handler again, because enabled=0. 
After + * handling the NMI, disable_all will be called, which will not change the + * state either. If PMI hits after disable_all, the PMU is already disabled + * before entering NMI handler. The NMI handler will not change the state + * either. + * + * So either situation is harmless. + */ static void x86_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a7ec685..b3f6349 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1502,7 +1502,15 @@ static __initconst const u64 knl_hw_cache_extra_regs }; /* - * Use from PMIs where the LBRs are already disabled. + * Used from PMIs where the LBRs are already disabled. + * + * This function could be called consecutively. It is required to remain in + * disabled state if called consecutively. + * + * During consecutive calls, the same disable value will be written to related + * registers, so the PMU state remains unchanged. hw.state in + * intel_bts_disable_local will remain PERF_HES_STOPPED too in consecutive + * calls. */ static void __intel_pmu_disable_all(void) { @@ -1929,7 +1937,10 @@ again: goto again; done: - __intel_pmu_enable_all(0, true); + /* Only restore PMU state when it's active. See x86_pmu_disable(). */ + if (cpuc->enabled) + __intel_pmu_enable_all(0, true); + /* * Only unmask the NMI after the overflow counters * have been reset. This avoids spurious NMIs on diff --git a/arch/x86/events/intel/knc.c b/arch/x86/events/intel/knc.c index 206226e..548d5f7 100644 --- a/arch/x86/events/intel/knc.c +++ b/arch/x86/events/intel/knc.c @@ -263,7 +263,9 @@ again: goto again; done: - knc_pmu_enable_all(0); + /* Only restore PMU state when it's active. See x86_pmu_disable(). */ + if (cpuc->enabled) + knc_pmu_enable_all(0); return handled; } -- cgit v0.10.2 From 5690ae28e472d25e330ad0c637a5cea3fc39fb32 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 3 Mar 2016 20:50:40 +0100 Subject: perf/x86/intel: Add definition for PT PMI bit This patch adds a definition for GLOBAL_OVFL_STATUS bit 55 which is used with the Processor Trace (PT) feature. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: namhyung@kernel.org Link: http://lkml.kernel.org/r/1457034642-21837-2-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 7bcb861..5a2ed3e 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -165,6 +165,7 @@ struct x86_pmu_capability { #define GLOBAL_STATUS_ASIF BIT_ULL(60) #define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59) #define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58) +#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(55) /* * IBS cpuid feature detection -- cgit v0.10.2 From 8077eca079a212f26419c57226f28696b7100683 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 3 Mar 2016 20:50:41 +0100 Subject: perf/x86/pebs: Add workaround for broken OVFL status on HSW+ This patch fixes an issue with the GLOBAL_OVERFLOW_STATUS bits on Haswell, Broadwell and Skylake processors when using PEBS. The SDM stipulates that when the PEBS iterrupt threshold is crossed, an interrupt is posted and the kernel is interrupted. 
The kernel will find GLOBAL_OVF_SATUS bit 62 set indicating there are PEBS records to drain. But the bits corresponding to the actual counters should NOT be set. The kernel follows the SDM and assumes that all PEBS events are processed in the drain_pebs() callback. The kernel then checks for remaining overflows on any other (non-PEBS) events and processes these in the for_each_bit_set(&status) loop. As it turns out, under certain conditions on HSW and later processors, on PEBS buffer interrupt, bit 62 is set but the counter bits may be set as well. In that case, the kernel drains PEBS and generates SAMPLES with the EXACT tag, then it processes the counter bits, and generates normal (non-EXACT) SAMPLES. I ran into this problem by trying to understand why on HSW sampling on a PEBS event was sometimes returning SAMPLES without the EXACT tag. This should not happen on user level code because HSW has the eventing_ip which always point to the instruction that caused the event. The workaround in this patch simply ensures that the bits for the counters used for PEBS events are cleared after the PEBS buffer has been drained. With this fix 100% of the PEBS samples on my user code report the EXACT tag. Before: $ perf record -e cpu/event=0xd0,umask=0x81/upp ./multichase $ perf report -D | fgrep SAMPLES PERF_RECORD_SAMPLE(IP, 0x2): 11775/11775: 0x406de5 period: 73469 addr: 0 exact=Y \--- EXACT tag is missing After: $ perf record -e cpu/event=0xd0,umask=0x81/upp ./multichase $ perf report -D | fgrep SAMPLES PERF_RECORD_SAMPLE(IP, 0x4002): 11775/11775: 0x406de5 period: 73469 addr: 0 exact=Y \--- EXACT tag is set The problem tends to appear more often when multiple PEBS events are used. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: namhyung@kernel.org Link: http://lkml.kernel.org/r/1457034642-21837-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index b3f6349..6567c62 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1892,6 +1892,16 @@ again: if (__test_and_clear_bit(62, (unsigned long *)&status)) { handled++; x86_pmu.drain_pebs(regs); + /* + * There are cases where, even though, the PEBS ovfl bit is set + * in GLOBAL_OVF_STATUS, the PEBS events may also have their + * overflow bits set for their counters. We must clear them + * here because they have been processed as exact samples in + * the drain_pebs() routine. They must not be processed again + * in the for_each_bit_set() loop for regular samples below. + */ + status &= ~cpuc->pebs_enabled; + status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; } /* -- cgit v0.10.2 From b3e6246336a4a329644418a1c66e2c6bed44ef81 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 3 Mar 2016 20:50:42 +0100 Subject: perf/x86/pebs: Add proper PEBS constraints for Broadwell This patch adds a Broadwell specific PEBS event constraint table. Broadwell has a fix for the HT corruption bug erratum HSD29 on Haswell. Therefore, there is no need to mark events 0xd0, 0xd1, 0xd2, 0xd3 has requiring the exclusive mode across both sibling HT threads. This holds true for regular counting and sampling (see core.c) and PEBS (ds.c) which we fix in this patch. 
In doing so, we relax evnt scheduling for these events, they can now be programmed on any 4 counters without impacting what is measured on the sibling thread. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@redhat.com Cc: adrian.hunter@intel.com Cc: jolsa@redhat.com Cc: kan.liang@intel.com Cc: namhyung@kernel.org Link: http://lkml.kernel.org/r/1457034642-21837-4-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 6567c62..edac81c 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3602,7 +3602,7 @@ __init int intel_pmu_init(void) intel_pmu_lbr_init_hsw(); x86_pmu.event_constraints = intel_bdw_event_constraints; - x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; + x86_pmu.pebs_constraints = intel_bdw_pebs_event_constraints; x86_pmu.extra_regs = intel_snbep_extra_regs; x86_pmu.pebs_aliases = intel_pebs_aliases_ivb; x86_pmu.pebs_prec_dist = true; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 22ece02..a99a8cb 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -722,6 +722,30 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_bdw_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */ + /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf), + /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c0, 0x2), + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */ + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */ + /* Allow all events as PEBS with no flags */ + INTEL_ALL_EVENT_CONSTRAINT(0, 0xf), + EVENT_CONSTRAINT_END +}; + + struct event_constraint intel_skl_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */ /* INST_RETIRED.PREC_DIST, inv=1, cmask=16 (cycles:ppp). 
*/ diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 1ab6279..24e259e 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -861,6 +861,8 @@ extern struct event_constraint intel_ivb_pebs_event_constraints[]; extern struct event_constraint intel_hsw_pebs_event_constraints[]; +extern struct event_constraint intel_bdw_pebs_event_constraints[]; + extern struct event_constraint intel_skl_pebs_event_constraints[]; struct event_constraint *intel_pebs_constraints(struct perf_event *event); -- cgit v0.10.2 From e17dc65328057c00db7e1bfea249c8771a78b30b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 1 Mar 2016 14:25:24 -0800 Subject: perf/x86/intel: Fix PEBS data source interpretation on Nehalem/Westmere Jiri reported some time ago that some entries in the PEBS data source table in perf do not agree with the SDM. We investigated and the bits changed for Sandy Bridge, but the SDM was not updated. perf already implements the bits correctly for Sandy Bridge and later. This patch patches it up for Nehalem and Westmere. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jolsa@kernel.org Link: http://lkml.kernel.org/r/1456871124-15985-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index edac81c..68fa55b 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3417,6 +3417,7 @@ __init int intel_pmu_init(void) intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); + intel_pmu_pebs_data_source_nhm(); x86_add_quirk(intel_nehalem_quirk); pr_cont("Nehalem events, "); @@ -3480,6 +3481,7 @@ __init int intel_pmu_init(void) intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); + intel_pmu_pebs_data_source_nhm(); pr_cont("Westmere events, "); break; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index a99a8cb..ce7211a 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -51,7 +51,8 @@ union intel_x86_pebs_dse { #define OP_LH (P(OP, LOAD) | P(LVL, HIT)) #define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) -static const u64 pebs_data_source[] = { +/* Version for Sandy Bridge and later */ +static u64 pebs_data_source[] = { P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ @@ -70,6 +71,14 @@ static const u64 pebs_data_source[] = { OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ }; +/* Patch up minor differences in the bits */ +void __init intel_pmu_pebs_data_source_nhm(void) +{ + pebs_data_source[0x05] = OP_LH | P(LVL, L3) | P(SNOOP, HIT); + pebs_data_source[0x06] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); + pebs_data_source[0x07] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); +} + static u64 precise_store_data(u64 status) { union intel_x86_pebs_dse dse; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 24e259e..68155ca 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -907,6 +907,8 @@ void intel_pmu_lbr_init_skl(void); void intel_pmu_lbr_init_knl(void); +void intel_pmu_pebs_data_source_nhm(void); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); -- cgit 
v0.10.2 From 72f9f3fdc928dc3ecd223e801b32d930b662b6ed Mon Sep 17 00:00:00 2001 From: Luca Abeni Date: Mon, 7 Mar 2016 12:27:04 +0100 Subject: sched/deadline: Remove dl_new from struct sched_dl_entity The dl_new field of struct sched_dl_entity is currently used to identify new deadline tasks, so that their deadline and runtime can be properly initialised. However, these tasks can be easily identified by checking if their deadline is smaller than the current time when they switch to SCHED_DEADLINE. So, dl_new can be removed by introducing this check in switched_to_dl(); this allows to simplify the SCHED_DEADLINE code. Signed-off-by: Luca Abeni Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1457350024-7825-2-git-send-email-luca.abeni@unitn.it Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index 9519b66..838a89a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1333,10 +1333,6 @@ struct sched_dl_entity { * task has to wait for a replenishment to be performed at the * next firing of dl_timer. * - * @dl_new tells if a new instance arrived. If so we must - * start executing it with full runtime and reset its absolute - * deadline; - * * @dl_boosted tells if we are boosted due to DI. If so we are * outside bandwidth enforcement mechanism (but only until we * exit the critical section); @@ -1344,7 +1340,7 @@ struct sched_dl_entity { * @dl_yielded tells if task gave up the cpu before consuming * all its available runtime during the last job. */ - int dl_throttled, dl_new, dl_boosted, dl_yielded; + int dl_throttled, dl_boosted, dl_yielded; /* * Bandwidth enforcement timer. Each -deadline task has its diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 423452c..249e37d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2051,7 +2051,6 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_throttled = 0; - dl_se->dl_new = 1; dl_se->dl_yielded = 0; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 15abf04..c7a036f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -352,7 +352,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); + WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); + + /* + * We are racing with the deadline timer. So, do nothing because + * the deadline timer handler will take care of properly recharging + * the runtime and postponing the deadline + */ + if (dl_se->dl_throttled) + return; /* * We use the regular wall clock time to set deadlines in the @@ -361,7 +369,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, */ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; - dl_se->dl_new = 0; } /* @@ -503,15 +510,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - /* - * The arrival of a new instance needs special treatment, i.e., - * the actual scheduling parameters have to be "renewed". 
- */ - if (dl_se->dl_new) { - setup_new_dl_entity(dl_se, pi_se); - return; - } - if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; @@ -608,16 +606,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) } /* - * This is possible if switched_from_dl() raced against a running - * callback that took the above !dl_task() path and we've since then - * switched back into SCHED_DEADLINE. - * - * There's nothing to do except drop our task reference. - */ - if (dl_se->dl_new) - goto unlock; - - /* * The task might have been boosted by someone else and might be in the * boosting/deboosting path, its not throttled. */ @@ -925,7 +913,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. */ - if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) + if (flags & ENQUEUE_WAKEUP) update_dl_entity(dl_se, pi_se); else if (flags & ENQUEUE_REPLENISH) replenish_dl_entity(dl_se, pi_se); @@ -1726,6 +1714,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { + if (dl_time_before(p->dl.deadline, rq_clock(rq))) + setup_new_dl_entity(&p->dl, &p->dl); + if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) -- cgit v0.10.2 From f9c904b7613b8b4c85b10cd6b33ad41b2843fa9d Mon Sep 17 00:00:00 2001 From: Chris Friesen Date: Sat, 5 Mar 2016 23:18:48 -0600 Subject: sched/cputime: Fix steal_account_process_tick() to always return jiffies The callers of steal_account_process_tick() expect it to return whether a jiffy should be considered stolen or not. Currently the return value of steal_account_process_tick() is in units of cputime, which vary between either jiffies or nsecs depending on CONFIG_VIRT_CPU_ACCOUNTING_GEN. If cputime has nsecs granularity and there is a tiny amount of stolen time (a few nsecs, say) then we will consider the entire tick stolen and will not account the tick on user/system/idle, causing /proc/stats to show invalid data. The fix is to change steal_account_process_tick() to accumulate the stolen time and only account it once it's worth a jiffy. (Thanks to Frederic Weisbecker for suggestions to fix a bug in my first version of the patch.) Signed-off-by: Chris Friesen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Thomas Gleixner Cc: Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/56DBBDB8.40305@mail.usask.ca Signed-off-by: Ingo Molnar diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 01d9898..75f98c5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -262,21 +262,21 @@ static __always_inline bool steal_account_process_tick(void) #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { u64 steal; - cputime_t steal_ct; + unsigned long steal_jiffies; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; /* - * cputime_t may be less precise than nsecs (eg: if it's - * based on jiffies). Lets cast the result to cputime + * steal is in nsecs but our caller is expecting steal + * time in jiffies. Lets cast the result to jiffies * granularity and account the rest on the next rounds. 
*/ - steal_ct = nsecs_to_cputime(steal); - this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); + steal_jiffies = nsecs_to_jiffies(steal); + this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); - account_steal_time(steal_ct); - return steal_ct; + account_steal_time(jiffies_to_cputime(steal_jiffies)); + return steal_jiffies; } #endif return false; -- cgit v0.10.2 From 1a8aa8acab4f3949e05ceb51e36f627b1651814c Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 6 Mar 2016 19:11:15 +0100 Subject: x86/apic: Deinline __default_send_IPI_*, save ~200 bytes __default_send_IPI_shortcut: 49 bytes, 2 callsites __default_send_IPI_dest_field: 108 bytes, 7 callsites text data bss dec hex filename 96184086 20860488 36122624 153167198 921255e vmlinux_before 96183823 20860520 36122624 153166967 9212477 vmlinux Signed-off-by: Denys Vlasenko Cc: Borislav Petkov Cc: Daniel J Blueman Cc: Linus Torvalds Cc: Mike Travis Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1457287876-6001-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h index cfc9a0d..a4fe16e 100644 --- a/arch/x86/include/asm/ipi.h +++ b/arch/x86/include/asm/ipi.h @@ -57,67 +57,13 @@ static inline void __xapic_wait_icr_idle(void) cpu_relax(); } -static inline void -__default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) -{ - /* - * Subtle. In the case of the 'never do double writes' workaround - * we have to lock out interrupts to be safe. As we don't care - * of the value read we use an atomic rmw access to avoid costly - * cli/sti. Otherwise we use an even cheaper single atomic write - * to the APIC. - */ - unsigned int cfg; - - /* - * Wait for idle. - */ - __xapic_wait_icr_idle(); - - /* - * No need to touch the target chip field - */ - cfg = __prepare_ICR(shortcut, vector, dest); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - native_apic_mem_write(APIC_ICR, cfg); -} +void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest); /* * This is used to send an IPI with no shorthand notation (the destination is * specified in bits 56 to 63 of the ICR). */ -static inline void - __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) -{ - unsigned long cfg; - - /* - * Wait for idle. - */ - if (unlikely(vector == NMI_VECTOR)) - safe_apic_wait_icr_idle(); - else - __xapic_wait_icr_idle(); - - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(mask); - native_apic_mem_write(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector, dest); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - native_apic_mem_write(APIC_ICR, cfg); -} +void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest); extern void default_send_IPI_single(int cpu, int vector); extern void default_send_IPI_single_phys(int cpu, int vector); diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index eb45fc9..28bde88 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -18,6 +18,66 @@ #include #include +void __default_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) +{ + /* + * Subtle. In the case of the 'never do double writes' workaround + * we have to lock out interrupts to be safe. As we don't care + * of the value read we use an atomic rmw access to avoid costly + * cli/sti. 
Otherwise we use an even cheaper single atomic write + * to the APIC. + */ + unsigned int cfg; + + /* + * Wait for idle. + */ + __xapic_wait_icr_idle(); + + /* + * No need to touch the target chip field + */ + cfg = __prepare_ICR(shortcut, vector, dest); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + native_apic_mem_write(APIC_ICR, cfg); +} + +/* + * This is used to send an IPI with no shorthand notation (the destination is + * specified in bits 56 to 63 of the ICR). + */ +void __default_send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) +{ + unsigned long cfg; + + /* + * Wait for idle. + */ + if (unlikely(vector == NMI_VECTOR)) + safe_apic_wait_icr_idle(); + else + __xapic_wait_icr_idle(); + + /* + * prepare target chip field + */ + cfg = __prepare_ICR2(mask); + native_apic_mem_write(APIC_ICR2, cfg); + + /* + * program the ICR + */ + cfg = __prepare_ICR(0, vector, dest); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + native_apic_mem_write(APIC_ICR, cfg); +} + void default_send_IPI_single_phys(int cpu, int vector) { unsigned long flags; -- cgit v0.10.2 From fe2f95468e4bdf4a526be4f86efaefe48ca63b83 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 6 Mar 2016 19:11:16 +0100 Subject: x86/apic: Deinline _flat_send_IPI_mask, save ~150 bytes _flat_send_IPI_mask: 157 bytes, 3 callsites text data bss dec hex filename 96183823 20860520 36122624 153166967 9212477 vmlinux1_before 96183699 20860520 36122624 153166843 92123fb vmlinux Signed-off-by: Denys Vlasenko Cc: Borislav Petkov Cc: Daniel J Blueman Cc: Linus Torvalds Cc: Mike Travis Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1457287876-6001-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 9968f30..76f89e2 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -53,7 +53,7 @@ void flat_init_apic_ldr(void) apic_write(APIC_LDR, val); } -static inline void _flat_send_IPI_mask(unsigned long mask, int vector) +static void _flat_send_IPI_mask(unsigned long mask, int vector) { unsigned long flags; -- cgit v0.10.2 From 7099e2e1f4d9051f31bbfa5803adf954bb5d76ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Fri, 4 Mar 2016 15:08:42 +0100 Subject: KVM: VMX: disable PEBS before a guest entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Linux guests on Haswell (and also SandyBridge and Broadwell, at least) would crash if you decided to run a host command that uses PEBS, like perf record -e 'cpu/mem-stores/pp' -a This happens because KVM is using VMX MSR switching to disable PEBS, but SDM [2015-12] 18.4.4.4 Re-configuring PEBS Facilities explains why it isn't safe: When software needs to reconfigure PEBS facilities, it should allow a quiescent period between stopping the prior event counting and setting up a new PEBS event. The quiescent period is to allow any latent residual PEBS records to complete its capture at their previously specified buffer address (provided by IA32_DS_AREA). There might not be a quiescent period after the MSR switch, so a CPU ends up using host's MSR_IA32_DS_AREA to access an area in guest's memory. (Or MSR switching is just buggy on some models.) 
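As a rough sketch of the reconfiguration sequence the SDM asks for (simplified host-side code; new_ds_area and new_pebs_enable are placeholder values, not symbols from this patch), the point is that the drain step cannot be expressed as a bare VMX MSR switch:

        wrmsrl(MSR_IA32_PEBS_ENABLE, 0);                /* stop producing PEBS records */
        /* quiescent period: latent records finish into the old IA32_DS_AREA */
        wrmsrl(MSR_IA32_DS_AREA, new_ds_area);          /* only now safe to retarget */
        wrmsrl(MSR_IA32_PEBS_ENABLE, new_pebs_enable);  /* resume with the new setup */

That is why the patch below disables PEBS with a direct WRMSR at the point where the MSR is queued for switching, so the quiescent period has already elapsed well before VM entry.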
The guest can learn something about the host this way: If the guest doesn't map address pointed by MSR_IA32_DS_AREA, it results in #PF where we leak host's MSR_IA32_DS_AREA through CR2. After that, a malicious guest can map and configure memory where MSR_IA32_DS_AREA is pointing and can therefore get an output from host's tracing. This is not a critical leak as the host must initiate with PEBS tracing and I have not been able to get a record from more than one instruction before vmentry in vmx_vcpu_run() (that place has most registers already overwritten with guest's). We could disable PEBS just few instructions before vmentry, but disabling it earlier shouldn't affect host tracing too much. We also don't need to switch MSR_IA32_PEBS_ENABLE on VMENTRY, but that optimization isn't worth its code, IMO. (If you are implementing PEBS for guests, be sure to handle the case where both host and guest enable PEBS, because this patch doesn't.) Fixes: 26a4f3c08de4 ("perf/x86: disable PEBS on a guest entry.") Cc: Reported-by: Jiří Olša Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0ff4537..6e51493 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1813,6 +1813,13 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, return; } break; + case MSR_IA32_PEBS_ENABLE: + /* PEBS needs a quiescent period after being disabled (to write + * a record). Disabling PEBS through VMX MSR swapping doesn't + * provide that period, so a CPU could write host's record into + * guest's memory. + */ + wrmsrl(MSR_IA32_PEBS_ENABLE, 0); } for (i = 0; i < m->nr; ++i) -- cgit v0.10.2 From 9522b37f5a8c7bfabe46eecadf2e130f1103f337 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 8 Mar 2016 12:24:30 +0100 Subject: KVM: s390: correct fprs on SIGP (STOP AND) STORE STATUS With MACHINE_HAS_VX, we convert the floating point registers from the vector registeres when storing the status. For other VCPUs, these are stored to vcpu->run->s.regs.vrs, but we are using current->thread.fpu.vxrs, which resolves to the currently loaded VCPU. So kvm_s390_store_status_unloaded() currently writes the wrong floating point registers (converted from the vector registers) when called from another VCPU on a z13. This is only the case for old user space not handling SIGP STORE STATUS and SIGP STOP AND STORE STATUS, but relying on the kernel implementation. All other calls come from the loaded VCPU via kvm_s390_store_status(). Fixes: 9abc2a08a7d6 (KVM: s390: fix memory overwrites when vx is disabled) Reviewed-by: Christian Borntraeger Cc: stable@vger.kernel.org # v4.4+ Signed-off-by: David Hildenbrand Signed-off-by: Christian Borntraeger Signed-off-by: Paolo Bonzini diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4af21c7..03dfe9c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2381,7 +2381,7 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa) /* manually convert vector registers if necessary */ if (MACHINE_HAS_VX) { - convert_vx_to_fp(fprs, current->thread.fpu.vxrs); + convert_vx_to_fp(fprs, (__vector128 *) vcpu->run->s.regs.vrs); rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA, fprs, 128); } else { -- cgit v0.10.2 From 8e2a7f5b9a8c49f1f4e1dc8972198510f43c0b2e Mon Sep 17 00:00:00 2001 From: Kostenzer Felix Date: Sun, 6 Mar 2016 23:20:06 +0100 Subject: x86/nmi: Mark 'ignore_nmis' as __read_mostly ignore_nmis is used in two distinct places: 1. 
modified through {stop,restart}_nmi by alternative_instructions 2. read by do_nmi to determine if default_do_nmi should be called or not thus the access pattern conforms to __read_mostly and do_nmi() is a fastpath. Signed-off-by: Kostenzer Felix Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 8a2cdd7..04b132a 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -30,6 +30,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -69,7 +70,7 @@ struct nmi_stats { static DEFINE_PER_CPU(struct nmi_stats, nmi_stats); -static int ignore_nmis; +static int ignore_nmis __read_mostly; int unknown_nmi_panic; /* -- cgit v0.10.2 From 58a5aac5331388a175a42b6ed2154f0559cefb21 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 29 Feb 2016 15:50:19 -0800 Subject: x86/entry/32: Introduce and use X86_BUG_ESPFIX instead of paravirt_enabled x86_64 has very clean espfix handling on paravirt: espfix64 is set up in native_iret, so paravirt systems that override iret bypass espfix64 automatically. This is robust and straightforward. x86_32 is messier. espfix is set up before the IRET paravirt patch point, so it can't be directly conditionalized on whether we use native_iret. We also can't easily move it into native_iret without regressing performance due to a bizarre consideration. Specifically, on 64-bit kernels, the logic is: if (regs->ss & 0x4) setup_espfix; On 32-bit kernels, the logic is: if ((regs->ss & 0x4) && (regs->cs & 0x3) == 3 && (regs->flags & X86_EFLAGS_VM) == 0) setup_espfix; The performance of setup_espfix itself is essentially irrelevant, but the comparison happens on every IRET so its performance matters. On x86_64, there's no need for any registers except flags to implement the comparison, so we fold the whole thing into native_iret. On x86_32, we don't do that because we need a free register to implement the comparison efficiently. We therefore do espfix setup before restoring registers on x86_32. This patch gets rid of the explicit paravirt_enabled check by introducing X86_BUG_ESPFIX on 32-bit systems and using an ALTERNATIVE to skip espfix on paravirt systems where iret != native_iret. This is also messy, but it's at least in line with other things we do. This improves espfix performance by removing a branch, but no one cares. More importantly, it removes a paravirt_enabled user, which is good because paravirt_enabled is ill-defined and is going away. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: konrad.wilk@oracle.com Cc: lguest@lists.ozlabs.org Cc: xen-devel@lists.xensource.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 66350e6..e130272 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -361,6 +361,8 @@ restore_all: TRACE_IRQS_IRET restore_all_notrace: #ifdef CONFIG_X86_ESPFIX32 + ALTERNATIVE "jmp restore_nocheck", "", X86_BUG_ESPFIX + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS /* * Warning: PT_OLDSS(%esp) contains the wrong/random values if we @@ -387,19 +389,6 @@ ENTRY(iret_exc ) #ifdef CONFIG_X86_ESPFIX32 ldt_ss: -#ifdef CONFIG_PARAVIRT - /* - * The kernel can't run on a non-flat stack if paravirt mode - * is active. Rather than try to fixup the high bits of - * ESP, bypass this code entirely. This may break DOSemu - * and/or Wine support in a paravirt VM, although the option - * is still available to implement the setting of the high - * 16-bits in the INTERRUPT_RETURN paravirt-op. - */ - cmpl $0, pv_info+PARAVIRT_enabled - jne restore_nocheck -#endif - /* * Setup and switch to ESPFIX stack * diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 6663fae..d11a3aa 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -286,4 +286,12 @@ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#ifdef CONFIG_X86_32 +/* + * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional + * to avoid confusion. + */ +#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */ +#endif + #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 079d83f..d8337f3 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -803,6 +803,31 @@ static void detect_nopl(struct cpuinfo_x86 *c) #else set_cpu_cap(c, X86_FEATURE_NOPL); #endif + + /* + * ESPFIX is a strange bug. All real CPUs have it. Paravirt + * systems that run Linux at CPL > 0 may or may not have the + * issue, but, even if they have the issue, there's absolutely + * nothing we can do about it because we can't use the real IRET + * instruction. + * + * NB: For the time being, only 32-bit kernels support + * X86_BUG_ESPFIX as such. 64-bit kernels directly choose + * whether to apply espfix using paravirt hooks. If any + * non-paravirt system ever shows up that does *not* have the + * ESPFIX issue, we can change this. + */ +#ifdef CONFIG_X86_32 +#ifdef CONFIG_PARAVIRT + do { + extern void native_iret(void); + if (pv_cpu_ops.iret == native_iret) + set_cpu_bug(c, X86_BUG_ESPFIX); + } while (0); +#else + set_cpu_bug(c, X86_BUG_ESPFIX); +#endif +#endif } static void generic_identify(struct cpuinfo_x86 *c) -- cgit v0.10.2 From 0dd0036f6e07f741a1356b424b84a3164b6e59cf Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 29 Feb 2016 15:50:20 -0800 Subject: x86/asm-offsets: Remove PARAVIRT_enabled It no longer has any users. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: konrad.wilk@oracle.com Cc: lguest@lists.ozlabs.org Cc: xen-devel@lists.xensource.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 84a7524..5c04246 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -59,7 +59,6 @@ void common(void) { #ifdef CONFIG_PARAVIRT BLANK(); - OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); -- cgit v0.10.2 From 29b75eb2d56a714190a93d7be4525e617591077a Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Mon, 7 Mar 2016 09:32:24 +0800 Subject: futex: Replace barrier() in unqueue_me() with READ_ONCE() Commit e91467ecd1ef ("bug in futex unqueue_me") introduced a barrier() in unqueue_me() to prevent the compiler from rereading the lock pointer which might change after a check for NULL. Replace the barrier() with a READ_ONCE() for the following reasons: 1) READ_ONCE() is a weaker form of barrier() that affects only the specific load operation, while barrier() is a general compiler level memory barrier. READ_ONCE() was not available at the time when the barrier was added. 2) Aside of that READ_ONCE() is descriptive and self explainatory while a barrier without comment is not clear to the casual reader. No functional change. [ tglx: Massaged changelog ] Signed-off-by: Jianyu Zhan Acked-by: Christian Borntraeger Acked-by: Darren Hart Cc: dave@stgolabs.net Cc: peterz@infradead.org Cc: linux@rasmusvillemoes.dk Cc: akpm@linux-foundation.org Cc: fengguang.wu@intel.com Cc: bigeasy@linutronix.de Link: http://lkml.kernel.org/r/1457314344-5685-1-git-send-email-nasa4836@gmail.com Signed-off-by: Thomas Gleixner diff --git a/kernel/futex.c b/kernel/futex.c index bae542e..a5d2e74 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2010,8 +2010,12 @@ static int unqueue_me(struct futex_q *q) /* In the common case we don't take the spinlock, which is nice. */ retry: - lock_ptr = q->lock_ptr; - barrier(); + /* + * q->lock_ptr can change between this read and the following spin_lock. + * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and + * optimizing lock_ptr out of the logic below. + */ + lock_ptr = READ_ONCE(q->lock_ptr); if (lock_ptr != NULL) { spin_lock(lock_ptr); /* -- cgit v0.10.2 From c8213a638f65bf487c10593c216525952cca3690 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 3 Mar 2016 19:26:24 -0500 Subject: drm/radeon/dp: add back special handling for NUTMEG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When I fixed the dp rate selection in: 092c96a8ab9d1bd60ada2ed385cc364ce084180e drm/radeon: fix dp link rate selection (v2) I accidently dropped the special handling for NUTMEG DP bridge chips. They require a fixed link rate. 
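For reference, the restored inner check is max_pix_clock = (lane_num * link_rate * 8) / bpp with the link rate in kHz, so pinning NUTMEG to 270000 (2.7 GHz HBR) gives, for example, 4 lanes * 270000 kHz * 8 bits / 24 bpp = 360000 kHz, i.e. roughly a 360 MHz pixel clock ceiling at 24 bpp (the worked numbers are illustrative, not taken from the patch).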
Reviewed-by: Christian König Reviewed-by: Ken Wang Reviewed-by: Harry Wentland Tested-by: Ken Moffat Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org diff --git a/drivers/gpu/drm/radeon/atombios_dp.c b/drivers/gpu/drm/radeon/atombios_dp.c index 44ee72e..6af8325 100644 --- a/drivers/gpu/drm/radeon/atombios_dp.c +++ b/drivers/gpu/drm/radeon/atombios_dp.c @@ -315,15 +315,27 @@ int radeon_dp_get_dp_link_config(struct drm_connector *connector, unsigned max_lane_num = drm_dp_max_lane_count(dpcd); unsigned lane_num, i, max_pix_clock; - for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { - for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { - max_pix_clock = (lane_num * link_rates[i] * 8) / bpp; + if (radeon_connector_encoder_get_dp_bridge_encoder_id(connector) == + ENCODER_OBJECT_ID_NUTMEG) { + for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { + max_pix_clock = (lane_num * 270000 * 8) / bpp; if (max_pix_clock >= pix_clock) { *dp_lanes = lane_num; - *dp_rate = link_rates[i]; + *dp_rate = 270000; return 0; } } + } else { + for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { + for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + max_pix_clock = (lane_num * link_rates[i] * 8) / bpp; + if (max_pix_clock >= pix_clock) { + *dp_lanes = lane_num; + *dp_rate = link_rates[i]; + return 0; + } + } + } } return -EINVAL; -- cgit v0.10.2 From 02d27234759dc4fe14a880ec1e1dee108cb0b503 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 3 Mar 2016 19:34:28 -0500 Subject: drm/amdgpu/dp: add back special handling for NUTMEG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When I fixed the dp rate selection in: 3b73b168cffd9c392584d3f665021fa2190f8612 drm/amdgpu: fix dp link rate selection (v2) I accidently dropped the special handling for NUTMEG DP bridge chips. They require a fixed link rate. 
Reviewed-by: Christian König Reviewed-by: Ken Wang Reviewed-by: Harry Wentland Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org diff --git a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c index 21aacc1..bf731e9 100644 --- a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c +++ b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c @@ -265,15 +265,27 @@ static int amdgpu_atombios_dp_get_dp_link_config(struct drm_connector *connector unsigned max_lane_num = drm_dp_max_lane_count(dpcd); unsigned lane_num, i, max_pix_clock; - for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { - for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { - max_pix_clock = (lane_num * link_rates[i] * 8) / bpp; + if (amdgpu_connector_encoder_get_dp_bridge_encoder_id(connector) == + ENCODER_OBJECT_ID_NUTMEG) { + for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { + max_pix_clock = (lane_num * 270000 * 8) / bpp; if (max_pix_clock >= pix_clock) { *dp_lanes = lane_num; - *dp_rate = link_rates[i]; + *dp_rate = 270000; return 0; } } + } else { + for (lane_num = 1; lane_num <= max_lane_num; lane_num <<= 1) { + for (i = 0; i < ARRAY_SIZE(link_rates) && link_rates[i] <= max_link_rate; i++) { + max_pix_clock = (lane_num * link_rates[i] * 8) / bpp; + if (max_pix_clock >= pix_clock) { + *dp_lanes = lane_num; + *dp_rate = link_rates[i]; + return 0; + } + } + } } return -EINVAL; -- cgit v0.10.2 From 7a8698058ae493ae53b1a8a2fa23d2e37000d73e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 8 Mar 2016 17:40:41 +0100 Subject: perf/x86/intel/rapl: Simplify quirk handling even more Drop the quirk() function pointer in favor of a simple boolean which says whether the quirk should be applied or not. Update comment while at it. Signed-off-by: Borislav Petkov Cc: Alexander Shishkin Cc: Andi Kleen Cc: Arnaldo Carvalho de Melo Cc: Harish Chegondi Cc: Jacob Pan Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-tip-commits@vger.kernel.org Link: http://lkml.kernel.org/r/20160308164041.GF16568@pd.tnic Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 019e541..b834a3f 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -592,18 +592,7 @@ static int rapl_cpu_notifier(struct notifier_block *self, return NOTIFY_OK; } -static __init void rapl_hsw_server_quirk(void) -{ - /* - * DRAM domain on HSW server has fixed energy unit which can be - * different than the unit from power unit MSR. - * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 - * of 2. Datasheet, September 2014, Reference Number: 330784-001 " - */ - rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; -} - -static int rapl_check_hw_unit(void (*quirk)(void)) +static int rapl_check_hw_unit(bool apply_quirk) { u64 msr_rapl_power_unit_bits; int i; @@ -614,9 +603,14 @@ static int rapl_check_hw_unit(void (*quirk)(void)) for (i = 0; i < NR_RAPL_DOMAINS; i++) rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; - /* Apply cpu model quirk */ - if (quirk) - quirk(); + /* + * DRAM domain on HSW server and KNL has fixed energy unit which can be + * different than the unit from power unit MSR. See + * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 + * of 2. 
Datasheet, September 2014, Reference Number: 330784-001 " + */ + if (apply_quirk) + rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; /* * Calculate the timer rate: @@ -704,7 +698,7 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { static int __init rapl_pmu_init(void) { - void (*quirk)(void) = NULL; + bool apply_quirk = false; int ret; if (!x86_match_cpu(rapl_cpu_match)) @@ -717,7 +711,7 @@ static int __init rapl_pmu_init(void) rapl_pmu_events_group.attrs = rapl_events_cln_attr; break; case 63: /* Haswell-Server */ - quirk = rapl_hsw_server_quirk; + apply_quirk = true; rapl_cntr_mask = RAPL_IDX_SRV; rapl_pmu_events_group.attrs = rapl_events_srv_attr; break; @@ -733,7 +727,7 @@ static int __init rapl_pmu_init(void) rapl_pmu_events_group.attrs = rapl_events_srv_attr; break; case 87: /* Knights Landing */ - quirk = rapl_hsw_server_quirk; + apply_quirk = true; rapl_cntr_mask = RAPL_IDX_KNL; rapl_pmu_events_group.attrs = rapl_events_knl_attr; break; @@ -741,7 +735,7 @@ static int __init rapl_pmu_init(void) return -ENODEV; } - ret = rapl_check_hw_unit(quirk); + ret = rapl_check_hw_unit(apply_quirk); if (ret) return ret; -- cgit v0.10.2 From 92b0729c34cab1f46d89aace3e66015f0bb4a682 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 18 Feb 2016 11:47:26 -0800 Subject: x86/mm, x86/mce: Add memcpy_mcsafe() Make use of the EXTABLE_FAULT exception table entries to write a kernel copy routine that doesn't crash the system if it encounters a machine check. Prime use case for this is to copy from large arrays of non-volatile memory used as storage. We have to use an unrolled copy loop for now because current hardware implementations treat a machine check in "rep mov" as fatal. When that is fixed we can simplify. Return type is a "bool". True means that we copied OK, false means that it didn't. 
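As a usage illustration only (the wrapper, its name and the -EIO policy are hypothetical, not part of this patch), a consumer reading from persistent memory could use the new helper like this; the sketch follows the kerneldoc convention of true-on-success, although the assembly below clears RAX (returns 0) on its success path and returns 1 from the fixup, so the return-value polarity is worth double-checking against the code that finally ships:

        static int pmem_read_mcsafe(void *dst, const void *src, size_t len)
        {
                /* Hypothetical wrapper: turn a machine check consumed while
                 * reading poisoned source memory into -EIO instead of a crash. */
                if (!memcpy_mcsafe(dst, src, len))
                        return -EIO;
                return 0;
        }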
Signed-off-by: Tony Luck Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/a44e1055efc2d2a9473307b22c91caa437aa3f8b.1456439214.git.tony.luck@intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index ff8b9a1..ca6ba36 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -78,6 +78,19 @@ int strcmp(const char *cs, const char *ct); #define memset(s, c, n) __memset(s, c, n) #endif +/** + * memcpy_mcsafe - copy memory with indication if a machine check happened + * + * @dst: destination address + * @src: source address + * @cnt: number of bytes to copy + * + * Low level memory copy function that catches machine checks + * + * Return true for success, false for fail + */ +bool memcpy_mcsafe(void *dst, const void *src, size_t cnt); + #endif /* __KERNEL__ */ #endif /* _ASM_X86_STRING_64_H */ diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index a0695be..cd05942 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -37,6 +37,8 @@ EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); +EXPORT_SYMBOL_GPL(memcpy_mcsafe); + EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 16698bb..7d37641 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -177,3 +177,120 @@ ENTRY(memcpy_orig) .Lend: retq ENDPROC(memcpy_orig) + +#ifndef CONFIG_UML +/* + * memcpy_mcsafe - memory copy with machine check exception handling + * Note that we only catch machine checks when reading the source addresses. + * Writes to target are posted and don't generate machine checks. + */ +ENTRY(memcpy_mcsafe) + cmpl $8, %edx + /* Less than 8 bytes? Go to byte copy loop */ + jb .L_no_whole_words + + /* Check for bad alignment of source */ + testl $7, %esi + /* Already aligned */ + jz .L_8byte_aligned + + /* Copy one byte at a time until source is 8-byte aligned */ + movl %esi, %ecx + andl $7, %ecx + subl $8, %ecx + negl %ecx + subl %ecx, %edx +.L_copy_leading_bytes: + movb (%rsi), %al + movb %al, (%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .L_copy_leading_bytes + +.L_8byte_aligned: + /* Figure out how many whole cache lines (64-bytes) to copy */ + movl %edx, %ecx + andl $63, %edx + shrl $6, %ecx + jz .L_no_whole_cache_lines + + /* Loop copying whole cache lines */ +.L_cache_w0: movq (%rsi), %r8 +.L_cache_w1: movq 1*8(%rsi), %r9 +.L_cache_w2: movq 2*8(%rsi), %r10 +.L_cache_w3: movq 3*8(%rsi), %r11 + movq %r8, (%rdi) + movq %r9, 1*8(%rdi) + movq %r10, 2*8(%rdi) + movq %r11, 3*8(%rdi) +.L_cache_w4: movq 4*8(%rsi), %r8 +.L_cache_w5: movq 5*8(%rsi), %r9 +.L_cache_w6: movq 6*8(%rsi), %r10 +.L_cache_w7: movq 7*8(%rsi), %r11 + movq %r8, 4*8(%rdi) + movq %r9, 5*8(%rdi) + movq %r10, 6*8(%rdi) + movq %r11, 7*8(%rdi) + leaq 64(%rsi), %rsi + leaq 64(%rdi), %rdi + decl %ecx + jnz .L_cache_w0 + + /* Are there any trailing 8-byte words? */ +.L_no_whole_cache_lines: + movl %edx, %ecx + andl $7, %edx + shrl $3, %ecx + jz .L_no_whole_words + + /* Copy trailing words */ +.L_copy_trailing_words: + movq (%rsi), %r8 + mov %r8, (%rdi) + leaq 8(%rsi), %rsi + leaq 8(%rdi), %rdi + decl %ecx + jnz .L_copy_trailing_words + + /* Any trailing bytes? 
*/ +.L_no_whole_words: + andl %edx, %edx + jz .L_done_memcpy_trap + + /* Copy trailing bytes */ + movl %edx, %ecx +.L_copy_trailing_bytes: + movb (%rsi), %al + movb %al, (%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz .L_copy_trailing_bytes + + /* Copy successful. Return true */ +.L_done_memcpy_trap: + xorq %rax, %rax + ret +ENDPROC(memcpy_mcsafe) + + .section .fixup, "ax" + /* Return false for any failure */ +.L_memcpy_mcsafe_fail: + mov $1, %rax + ret + + .previous + + _ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail) + _ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail) +#endif -- cgit v0.10.2 From d74e766e1916d0e09b86e4b5b9d0f819628fd546 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 8 Mar 2016 11:31:00 -0500 Subject: Revert "drm/radeon/pm: adjust display configuration after powerstate" This reverts commit 39d4275058baf53e89203407bf3841ff2c74fa32. This caused a regression on some older hardware. bug: https://bugzilla.kernel.org/show_bug.cgi?id=113891 Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org diff --git a/drivers/gpu/drm/radeon/radeon_pm.c b/drivers/gpu/drm/radeon/radeon_pm.c index 0f14d89..7a98823 100644 --- a/drivers/gpu/drm/radeon/radeon_pm.c +++ b/drivers/gpu/drm/radeon/radeon_pm.c @@ -1079,6 +1079,8 @@ force: /* update display watermarks based on new power state */ radeon_bandwidth_update(rdev); + /* update displays */ + radeon_dpm_display_configuration_changed(rdev); /* wait for the rings to drain */ for (i = 0; i < RADEON_NUM_RINGS; i++) { @@ -1095,9 +1097,6 @@ force: radeon_dpm_post_set_power_state(rdev); - /* update displays */ - radeon_dpm_display_configuration_changed(rdev); - rdev->pm.dpm.current_active_crtcs = rdev->pm.dpm.new_active_crtcs; rdev->pm.dpm.current_active_crtc_count = rdev->pm.dpm.new_active_crtc_count; rdev->pm.dpm.single_display = single_display; -- cgit v0.10.2 From 0dda8851a88d6c9d6389f1caf69627154c39aceb Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Fri, 19 Feb 2016 14:17:36 +0800 Subject: ACPICA: Revert "Parser: Fix for SuperName method invocation" ACPICA commit eade8f78f2aa21e8eabc3380a5728db47273bcf1 Revert commit ae90fbf562d7 (ACPICA: Parser: Fix for SuperName method invocation). Support for method invocations as part of super_name will be removed from the ACPI specification, since no AML interpreter supports it. Fixes: ae90fbf562d7 (ACPICA: Parser: Fix for SuperName method invocation) Link: https://github.com/acpica/acpica/commit/eade8f78 Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. 
Wysocki diff --git a/drivers/acpi/acpica/psargs.c b/drivers/acpi/acpica/psargs.c index 3052185..d48cbed 100644 --- a/drivers/acpi/acpica/psargs.c +++ b/drivers/acpi/acpica/psargs.c @@ -269,8 +269,7 @@ acpi_ps_get_next_namepath(struct acpi_walk_state *walk_state, */ if (ACPI_SUCCESS(status) && possible_method_call && (node->type == ACPI_TYPE_METHOD)) { - if (GET_CURRENT_ARG_TYPE(walk_state->arg_types) == - ARGP_SUPERNAME) { + if (walk_state->opcode == AML_UNLOAD_OP) { /* * acpi_ps_get_next_namestring has increased the AML pointer, * so we need to restore the saved AML pointer for method call. @@ -697,7 +696,7 @@ static union acpi_parse_object *acpi_ps_get_next_field(struct acpi_parse_state * * PARAMETERS: walk_state - Current state * parser_state - Current parser state object - * arg_type - The parser argument type (ARGP_*) + * arg_type - The argument type (AML_*_ARG) * return_arg - Where the next arg is returned * * RETURN: Status, and an op object containing the next argument. @@ -817,9 +816,9 @@ acpi_ps_get_next_arg(struct acpi_walk_state *walk_state, return_ACPI_STATUS(AE_NO_MEMORY); } - /* super_name allows argument to be a method call */ + /* To support super_name arg of Unload */ - if (arg_type == ARGP_SUPERNAME) { + if (walk_state->opcode == AML_UNLOAD_OP) { status = acpi_ps_get_next_namepath(walk_state, parser_state, arg, -- cgit v0.10.2 From 7781203416ffc4e731619f8a8b93a37599a8f502 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Tue, 8 Mar 2016 15:44:36 +0200 Subject: device property: fwnode->secondary may contain ERR_PTR(-ENODEV) This fixes BUG triggered when fwnode->secondary is not NULL, but has ERR_PTR(-ENODEV) instead. BUG: unable to handle kernel paging request at ffffffffffffffed IP: [] __fwnode_property_read_string+0x26/0x160 PGD 200e067 PUD 2010067 PMD 0 Oops: 0000 [#1] SMP KASAN Modules linked in: dwc3_pci(+) dwc3 CPU: 0 PID: 1138 Comm: modprobe Not tainted 4.5.0-rc5+ #61 task: ffff88015aaf5b00 ti: ffff88007b958000 task.ti: ffff88007b958000 RIP: 0010:[] [] __fwnode_property_read_string+0x26/0x160 RSP: 0018:ffff88007b95eff8 EFLAGS: 00010246 RAX: fffffbfffffffffd RBX: ffffffffffffffed RCX: ffff88015999cd37 RDX: dffffc0000000000 RSI: ffffffff81e11bc0 RDI: ffffffffffffffed RBP: ffff88007b95f020 R08: 0000000000000000 R09: 0000000000000000 R10: ffff88007b90f7cf R11: 0000000000000000 R12: ffff88007b95f0a0 R13: 00000000fffffffa R14: ffffffff81e11bc0 R15: ffff880159ea37a0 FS: 00007ff35f46c700(0000) GS:ffff88015b800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: ffffffffffffffed CR3: 000000007b8be000 CR4: 00000000001006f0 Stack: ffff88015999cd20 ffffffff81e11bc0 ffff88007b95f0a0 ffff88007b383dd8 ffff880159ea37a0 ffff88007b95f048 ffffffff81677d03 ffff88007b952460 ffffffff81e11bc0 ffff88007b95f0a0 ffff88007b95f070 ffffffff81677d40 Call Trace: [] fwnode_property_read_string+0x43/0x50 [] device_property_read_string+0x30/0x40 ... Fixes: 362c0b30249e (device property: Fallback to secondary fwnode if primary misses the property) Signed-off-by: Heikki Krogerus Signed-off-by: Rafael J. 
Wysocki diff --git a/drivers/base/property.c b/drivers/base/property.c index c359351..a163f2c5 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -218,7 +218,7 @@ bool fwnode_property_present(struct fwnode_handle *fwnode, const char *propname) bool ret; ret = __fwnode_property_present(fwnode, propname); - if (ret == false && fwnode && fwnode->secondary) + if (ret == false && fwnode && !IS_ERR_OR_NULL(fwnode->secondary)) ret = __fwnode_property_present(fwnode->secondary, propname); return ret; } @@ -423,7 +423,7 @@ EXPORT_SYMBOL_GPL(device_property_match_string); int _ret_; \ _ret_ = FWNODE_PROP_READ(_fwnode_, _propname_, _type_, _proptype_, \ _val_, _nval_); \ - if (_ret_ == -EINVAL && _fwnode_ && _fwnode_->secondary) \ + if (_ret_ == -EINVAL && _fwnode_ && !IS_ERR_OR_NULL(_fwnode_->secondary)) \ _ret_ = FWNODE_PROP_READ(_fwnode_->secondary, _propname_, _type_, \ _proptype_, _val_, _nval_); \ _ret_; \ @@ -593,7 +593,7 @@ int fwnode_property_read_string_array(struct fwnode_handle *fwnode, int ret; ret = __fwnode_property_read_string_array(fwnode, propname, val, nval); - if (ret == -EINVAL && fwnode && fwnode->secondary) + if (ret == -EINVAL && fwnode && !IS_ERR_OR_NULL(fwnode->secondary)) ret = __fwnode_property_read_string_array(fwnode->secondary, propname, val, nval); return ret; @@ -621,7 +621,7 @@ int fwnode_property_read_string(struct fwnode_handle *fwnode, int ret; ret = __fwnode_property_read_string(fwnode, propname, val); - if (ret == -EINVAL && fwnode && fwnode->secondary) + if (ret == -EINVAL && fwnode && !IS_ERR_OR_NULL(fwnode->secondary)) ret = __fwnode_property_read_string(fwnode->secondary, propname, val); return ret; -- cgit v0.10.2 From db57d7460ea74de2204ddc303520753f256ea67d Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Tue, 19 Jan 2016 14:11:14 +0100 Subject: irqchip/gic-v3: Refactor gic_of_init() for GICv3 driver Isolate hardware abstraction (FDT) code to gic_of_init(). Rest of the logic goes to gic_init_bases() and expects well defined data to initialize GIC properly. The same solution is used for GICv2 driver. This is needed for ACPI initialization later. 
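The split also makes the intended calling convention explicit: a firmware front end only discovers and maps the register regions, then defers to the common core. A condensed sketch of that shape (hypothetical caller, simplified; the real DT and ACPI versions are in this and the following patches):

        static int __init some_fw_gic_probe(void __iomem *dist_base,
                                            struct redist_region *rdist_regs,
                                            u32 nr_redist_regions,
                                            struct fwnode_handle *handle)
        {
                /* All firmware-specific work (mapping the distributor and
                 * redistributor regions, reading the stride) happens before
                 * this call; gic_init_bases() does the rest. */
                return gic_init_bases(dist_base, rdist_regs, nr_redist_regions,
                                      0 /* redist_stride */, handle);
        }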
Acked-by: Marc Zyngier Signed-off-by: Tomasz Nowicki Signed-off-by: Hanjun Guo Signed-off-by: Marc Zyngier diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index d7be6dd..31205c7 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -811,59 +811,16 @@ static void gicv3_enable_quirks(void) #endif } -static int __init gic_of_init(struct device_node *node, struct device_node *parent) +static int __init gic_init_bases(void __iomem *dist_base, + struct redist_region *rdist_regs, + u32 nr_redist_regions, + u64 redist_stride, + struct fwnode_handle *handle) { - void __iomem *dist_base; - struct redist_region *rdist_regs; - u64 redist_stride; - u32 nr_redist_regions; + struct device_node *node; u32 typer; - u32 reg; int gic_irqs; int err; - int i; - - dist_base = of_iomap(node, 0); - if (!dist_base) { - pr_err("%s: unable to map gic dist registers\n", - node->full_name); - return -ENXIO; - } - - reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK; - if (reg != GIC_PIDR2_ARCH_GICv3 && reg != GIC_PIDR2_ARCH_GICv4) { - pr_err("%s: no distributor detected, giving up\n", - node->full_name); - err = -ENODEV; - goto out_unmap_dist; - } - - if (of_property_read_u32(node, "#redistributor-regions", &nr_redist_regions)) - nr_redist_regions = 1; - - rdist_regs = kzalloc(sizeof(*rdist_regs) * nr_redist_regions, GFP_KERNEL); - if (!rdist_regs) { - err = -ENOMEM; - goto out_unmap_dist; - } - - for (i = 0; i < nr_redist_regions; i++) { - struct resource res; - int ret; - - ret = of_address_to_resource(node, 1 + i, &res); - rdist_regs[i].redist_base = of_iomap(node, 1 + i); - if (ret || !rdist_regs[i].redist_base) { - pr_err("%s: couldn't map region %d\n", - node->full_name, i); - err = -ENODEV; - goto out_unmap_rdist; - } - rdist_regs[i].phys_base = res.start; - } - - if (of_property_read_u64(node, "redistributor-stride", &redist_stride)) - redist_stride = 0; if (!is_hyp_mode_available()) static_key_slow_dec(&supports_deactivate); @@ -889,8 +846,8 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare gic_irqs = 1020; gic_data.irq_nr = gic_irqs; - gic_data.domain = irq_domain_add_tree(node, &gic_irq_domain_ops, - &gic_data); + gic_data.domain = irq_domain_create_tree(handle, &gic_irq_domain_ops, + &gic_data); gic_data.rdists.rdist = alloc_percpu(typeof(*gic_data.rdists.rdist)); if (WARN_ON(!gic_data.domain) || WARN_ON(!gic_data.rdists.rdist)) { @@ -900,7 +857,9 @@ static int __init gic_of_init(struct device_node *node, struct device_node *pare set_handle_irq(gic_handle_irq); - if (IS_ENABLED(CONFIG_ARM_GIC_V3_ITS) && gic_dist_supports_lpis()) + node = to_of_node(handle); + if (IS_ENABLED(CONFIG_ARM_GIC_V3_ITS) && gic_dist_supports_lpis() && + node) /* Temp hack to prevent ITS init for ACPI */ its_init(node, &gic_data.rdists, gic_data.domain); gic_smp_init(); @@ -914,6 +873,73 @@ out_free: if (gic_data.domain) irq_domain_remove(gic_data.domain); free_percpu(gic_data.rdists.rdist); + return err; +} + +static int __init gic_validate_dist_version(void __iomem *dist_base) +{ + u32 reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK; + + if (reg != GIC_PIDR2_ARCH_GICv3 && reg != GIC_PIDR2_ARCH_GICv4) + return -ENODEV; + + return 0; +} + +static int __init gic_of_init(struct device_node *node, struct device_node *parent) +{ + void __iomem *dist_base; + struct redist_region *rdist_regs; + u64 redist_stride; + u32 nr_redist_regions; + int err, i; + + dist_base = of_iomap(node, 0); + if (!dist_base) { + pr_err("%s: 
unable to map gic dist registers\n", + node->full_name); + return -ENXIO; + } + + err = gic_validate_dist_version(dist_base); + if (err) { + pr_err("%s: no distributor detected, giving up\n", + node->full_name); + goto out_unmap_dist; + } + + if (of_property_read_u32(node, "#redistributor-regions", &nr_redist_regions)) + nr_redist_regions = 1; + + rdist_regs = kzalloc(sizeof(*rdist_regs) * nr_redist_regions, GFP_KERNEL); + if (!rdist_regs) { + err = -ENOMEM; + goto out_unmap_dist; + } + + for (i = 0; i < nr_redist_regions; i++) { + struct resource res; + int ret; + + ret = of_address_to_resource(node, 1 + i, &res); + rdist_regs[i].redist_base = of_iomap(node, 1 + i); + if (ret || !rdist_regs[i].redist_base) { + pr_err("%s: couldn't map region %d\n", + node->full_name, i); + err = -ENODEV; + goto out_unmap_rdist; + } + rdist_regs[i].phys_base = res.start; + } + + if (of_property_read_u64(node, "redistributor-stride", &redist_stride)) + redist_stride = 0; + + err = gic_init_bases(dist_base, rdist_regs, nr_redist_regions, + redist_stride, &node->fwnode); + if (!err) + return 0; + out_unmap_rdist: for (i = 0; i < nr_redist_regions; i++) if (rdist_regs[i].redist_base) -- cgit v0.10.2 From ffa7d6166a9611ed9ad117b56f7f513901b31408 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Tue, 19 Jan 2016 14:11:15 +0100 Subject: irqchip/gic-v3: Add ACPI support for GICv3/4 initialization With the refator of gic_of_init(), GICv3/4 can be initialized by gic_init_bases() with gic distributor base address and gic redistributor region(s). So get the redistributor region base addresses from MADT GIC redistributor subtable, and the distributor base address from GICD subtable to init GICv3 irqchip in ACPI way. Note: GIC redistributor base address may also be provided in GICC structures on systems supporting GICv3 and above if the GIC Redistributors are not in the always-on power domain, this patch didn't implement such feature yet. Acked-by: Marc Zyngier Signed-off-by: Tomasz Nowicki Signed-off-by: Hanjun Guo Signed-off-by: Marc Zyngier diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 31205c7..2cada42 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -15,10 +15,12 @@ * along with this program. If not, see . 
*/ +#include #include #include #include #include +#include #include #include #include @@ -764,6 +766,15 @@ static int gic_irq_domain_translate(struct irq_domain *d, return 0; } + if (is_fwnode_irqchip(fwspec->fwnode)) { + if(fwspec->param_count != 2) + return -EINVAL; + + *hwirq = fwspec->param[0]; + *type = fwspec->param[1]; + return 0; + } + return -EINVAL; } @@ -951,3 +962,129 @@ out_unmap_dist: } IRQCHIP_DECLARE(gic_v3, "arm,gic-v3", gic_of_init); + +#ifdef CONFIG_ACPI +static struct redist_region *redist_regs __initdata; +static u32 nr_redist_regions __initdata; + +static int __init +gic_acpi_parse_madt_redist(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_madt_generic_redistributor *redist = + (struct acpi_madt_generic_redistributor *)header; + void __iomem *redist_base; + static int count = 0; + + redist_base = ioremap(redist->base_address, redist->length); + if (!redist_base) { + pr_err("Couldn't map GICR region @%llx\n", redist->base_address); + return -ENOMEM; + } + + redist_regs[count].phys_base = redist->base_address; + redist_regs[count].redist_base = redist_base; + count++; + return 0; +} + +static int __init gic_acpi_match_gicr(struct acpi_subtable_header *header, + const unsigned long end) +{ + /* Subtable presence means that redist exists, that's it */ + return 0; +} + +static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header, + struct acpi_probe_entry *ape) +{ + struct acpi_madt_generic_distributor *dist; + int count; + + dist = (struct acpi_madt_generic_distributor *)header; + if (dist->version != ape->driver_data) + return false; + + /* We need to do that exercise anyway, the sooner the better */ + count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR, + gic_acpi_match_gicr, 0); + if (count <= 0) + return false; + + nr_redist_regions = count; + return true; +} + +#define ACPI_GICV3_DIST_MEM_SIZE (SZ_64K) + +static int __init +gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end) +{ + struct acpi_madt_generic_distributor *dist; + struct fwnode_handle *domain_handle; + void __iomem *dist_base; + int i, err, count; + + /* Get distributor base address */ + dist = (struct acpi_madt_generic_distributor *)header; + dist_base = ioremap(dist->base_address, ACPI_GICV3_DIST_MEM_SIZE); + if (!dist_base) { + pr_err("Unable to map GICD registers\n"); + return -ENOMEM; + } + + err = gic_validate_dist_version(dist_base); + if (err) { + pr_err("No distributor detected at @%p, giving up", dist_base); + goto out_dist_unmap; + } + + redist_regs = kzalloc(sizeof(*redist_regs) * nr_redist_regions, + GFP_KERNEL); + if (!redist_regs) { + err = -ENOMEM; + goto out_dist_unmap; + } + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR, + gic_acpi_parse_madt_redist, 0); + if (count <= 0) { + err = -ENODEV; + goto out_redist_unmap; + } + + domain_handle = irq_domain_alloc_fwnode(dist_base); + if (!domain_handle) { + err = -ENOMEM; + goto out_redist_unmap; + } + + err = gic_init_bases(dist_base, redist_regs, nr_redist_regions, 0, + domain_handle); + if (err) + goto out_fwhandle_free; + + acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, domain_handle); + return 0; + +out_fwhandle_free: + irq_domain_free_fwnode(domain_handle); +out_redist_unmap: + for (i = 0; i < nr_redist_regions; i++) + if (redist_regs[i].redist_base) + iounmap(redist_regs[i].redist_base); + kfree(redist_regs); +out_dist_unmap: + iounmap(dist_base); + return err; +} +IRQCHIP_ACPI_DECLARE(gic_v3, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, + 
acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_V3, + gic_acpi_init); +IRQCHIP_ACPI_DECLARE(gic_v4, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, + acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_V4, + gic_acpi_init); +IRQCHIP_ACPI_DECLARE(gic_v3_or_v4, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR, + acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_NONE, + gic_acpi_init); +#endif -- cgit v0.10.2 From b70fb7af67158250bf16db467926e6e105d8bc49 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Tue, 19 Jan 2016 14:11:16 +0100 Subject: irqchip/gic-v3: ACPI: Add redistributor support via GICC structures Following ACPI spec: On systems supporting GICv3 and above, GICR Base Address in MADT GICC structure holds the 64-bit physical address of the associated Redistributor. If all of the GIC Redistributors are in the always-on power domain, GICR structures should be used to describe the Redistributors instead, and this field must be set to 0. It means that we have two ways to initialize registirbutors map. 1. via GICD structure which can accommodate many redistributors as a region 2. via GICC which is able to describe single redistributor This patch is going to add support for second option. Considering redistributors, GICD and GICC subtables have be mutually exclusive. While discovering and mapping redistributor, we need to know its size in advance. For the GICC case, redistributor can be in a power-domain that is off, thus we cannot relay on GICR TYPER register. Therefore, we get GIC version from distributor register and map 2xSZ_64K for GICv3 and 4xSZ_64K for GICv4. Acked-by: Marc Zyngier Signed-off-by: Hanjun Guo Signed-off-by: Tomasz Nowicki Signed-off-by: Marc Zyngier diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 2cada42..dd16a60 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -40,6 +40,7 @@ struct redist_region { void __iomem *redist_base; phys_addr_t phys_base; + bool single_redist; }; struct gic_chip_data { @@ -436,6 +437,9 @@ static int gic_populate_rdist(void) return 0; } + if (gic_data.redist_regions[i].single_redist) + break; + if (gic_data.redist_stride) { ptr += gic_data.redist_stride; } else { @@ -964,8 +968,21 @@ out_unmap_dist: IRQCHIP_DECLARE(gic_v3, "arm,gic-v3", gic_of_init); #ifdef CONFIG_ACPI +static void __iomem *dist_base; static struct redist_region *redist_regs __initdata; static u32 nr_redist_regions __initdata; +static bool single_redist; + +static void __init +gic_acpi_register_redist(phys_addr_t phys_base, void __iomem *redist_base) +{ + static int count = 0; + + redist_regs[count].phys_base = phys_base; + redist_regs[count].redist_base = redist_base; + redist_regs[count].single_redist = single_redist; + count++; +} static int __init gic_acpi_parse_madt_redist(struct acpi_subtable_header *header, @@ -974,7 +991,6 @@ gic_acpi_parse_madt_redist(struct acpi_subtable_header *header, struct acpi_madt_generic_redistributor *redist = (struct acpi_madt_generic_redistributor *)header; void __iomem *redist_base; - static int count = 0; redist_base = ioremap(redist->base_address, redist->length); if (!redist_base) { @@ -982,12 +998,49 @@ gic_acpi_parse_madt_redist(struct acpi_subtable_header *header, return -ENOMEM; } - redist_regs[count].phys_base = redist->base_address; - redist_regs[count].redist_base = redist_base; - count++; + gic_acpi_register_redist(redist->base_address, redist_base); return 0; } +static int __init +gic_acpi_parse_madt_gicc(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct 
acpi_madt_generic_interrupt *gicc = + (struct acpi_madt_generic_interrupt *)header; + u32 reg = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK; + u32 size = reg == GIC_PIDR2_ARCH_GICv4 ? SZ_64K * 4 : SZ_64K * 2; + void __iomem *redist_base; + + redist_base = ioremap(gicc->gicr_base_address, size); + if (!redist_base) + return -ENOMEM; + + gic_acpi_register_redist(gicc->gicr_base_address, redist_base); + return 0; +} + +static int __init gic_acpi_collect_gicr_base(void) +{ + acpi_tbl_entry_handler redist_parser; + enum acpi_madt_type type; + + if (single_redist) { + type = ACPI_MADT_TYPE_GENERIC_INTERRUPT; + redist_parser = gic_acpi_parse_madt_gicc; + } else { + type = ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR; + redist_parser = gic_acpi_parse_madt_redist; + } + + /* Collect redistributor base addresses in GICR entries */ + if (acpi_table_parse_madt(type, redist_parser, 0) > 0) + return 0; + + pr_info("No valid GICR entries exist\n"); + return -ENODEV; +} + static int __init gic_acpi_match_gicr(struct acpi_subtable_header *header, const unsigned long end) { @@ -995,6 +1048,46 @@ static int __init gic_acpi_match_gicr(struct acpi_subtable_header *header, return 0; } +static int __init gic_acpi_match_gicc(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_madt_generic_interrupt *gicc = + (struct acpi_madt_generic_interrupt *)header; + + /* + * If GICC is enabled and has valid gicr base address, then it means + * GICR base is presented via GICC + */ + if ((gicc->flags & ACPI_MADT_ENABLED) && gicc->gicr_base_address) + return 0; + + return -ENODEV; +} + +static int __init gic_acpi_count_gicr_regions(void) +{ + int count; + + /* + * Count how many redistributor regions we have. It is not allowed + * to mix redistributor description, GICR and GICC subtables have to be + * mutually exclusive. 
+ */ + count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR, + gic_acpi_match_gicr, 0); + if (count > 0) { + single_redist = false; + return count; + } + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT, + gic_acpi_match_gicc, 0); + if (count > 0) + single_redist = true; + + return count; +} + static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header, struct acpi_probe_entry *ape) { @@ -1006,8 +1099,7 @@ static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header, return false; /* We need to do that exercise anyway, the sooner the better */ - count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR, - gic_acpi_match_gicr, 0); + count = gic_acpi_count_gicr_regions(); if (count <= 0) return false; @@ -1022,8 +1114,7 @@ gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_madt_generic_distributor *dist; struct fwnode_handle *domain_handle; - void __iomem *dist_base; - int i, err, count; + int i, err; /* Get distributor base address */ dist = (struct acpi_madt_generic_distributor *)header; @@ -1046,12 +1137,9 @@ gic_acpi_init(struct acpi_subtable_header *header, const unsigned long end) goto out_dist_unmap; } - count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR, - gic_acpi_parse_madt_redist, 0); - if (count <= 0) { - err = -ENODEV; + err = gic_acpi_collect_gicr_base(); + if (err) goto out_redist_unmap; - } domain_handle = irq_domain_alloc_fwnode(dist_base); if (!domain_handle) { -- cgit v0.10.2 From f6ae5085d37b2eaf6cac30ccf4d425e95c7d4b63 Mon Sep 17 00:00:00 2001 From: Hanjun Guo Date: Tue, 19 Jan 2016 14:11:17 +0100 Subject: irqchip/gic-v3: Remove gic_root_node variable from the ITS code The gic_root_node variable defined in ITS driver is not actually used, so just remove it. Acked-by: Marc Zyngier Signed-off-by: Hanjun Guo Signed-off-by: Marc Zyngier diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 43dfd15..1bd7105 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -103,7 +103,6 @@ struct its_device { static LIST_HEAD(its_nodes); static DEFINE_SPINLOCK(its_lock); -static struct device_node *gic_root_node; static struct rdists *gic_rdists; #define gic_data_rdist() (raw_cpu_ptr(gic_rdists->rdist)) @@ -1607,8 +1606,6 @@ int its_init(struct device_node *node, struct rdists *rdists, } gic_rdists = rdists; - gic_root_node = node; - its_alloc_lpi_tables(); its_lpi_init(rdists->id_bits); -- cgit v0.10.2 From 04a0e4dee85642138dc7bd78f50ebee397e057a8 Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Tue, 19 Jan 2016 14:11:18 +0100 Subject: irqchip/gic-v3-its: Mark its_init() and its children as __init gicv3_init_bases() is the only caller for its_init(), also it is a __init function, so mark its_init() as __init too, then recursively mark the functions called as __init. This will help to introduce ITS initialization using ACPI tables as we will use acpi_table_parse_entries family functions there which belong to __init section as well. 
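The rule being applied, as a generic sketch (function names here are illustrative, not from the patch): code in .init.text is discarded once boot finishes, so a function that calls __init-only helpers has to be __init itself, otherwise modpost flags the reference as a section mismatch.

        static int __init table_walker(void)    /* lives in .init.text */
        {
                return 0;
        }

        static int __init controller_probe(void)
        {
                /* Fine: caller and callee are both .init.text. Dropping
                 * __init from controller_probe() would make modpost warn,
                 * since table_walker() no longer exists after boot. */
                return table_walker();
        }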
Acked-by: Marc Zyngier Signed-off-by: Hanjun Guo Signed-off-by: Tomasz Nowicki Signed-off-by: Marc Zyngier diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 1bd7105..3926179 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -670,7 +670,7 @@ static int its_chunk_to_lpi(int chunk) return (chunk << IRQS_PER_CHUNK_SHIFT) + 8192; } -static int its_lpi_init(u32 id_bits) +static int __init its_lpi_init(u32 id_bits) { lpi_chunks = its_lpi_to_chunk(1UL << id_bits); @@ -1429,7 +1429,8 @@ static void its_enable_quirks(struct its_node *its) gic_enable_quirks(iidr, its_quirks, its); } -static int its_probe(struct device_node *node, struct irq_domain *parent) +static int __init its_probe(struct device_node *node, + struct irq_domain *parent) { struct resource res; struct its_node *its; @@ -1590,7 +1591,7 @@ static struct of_device_id its_device_id[] = { {}, }; -int its_init(struct device_node *node, struct rdists *rdists, +int __init its_init(struct device_node *node, struct rdists *rdists, struct irq_domain *parent_domain) { struct device_node *np; -- cgit v0.10.2 From a9af316c83e3d81e26985193ffafdf143b327c07 Mon Sep 17 00:00:00 2001 From: Xuelin Shi Date: Tue, 8 Mar 2016 14:02:01 +0800 Subject: dmaengine: fsldma: fix memory leak adding unmap of sources and destinations while doing dequeue. Signed-off-by: Xuelin Shi Signed-off-by: Vinod Koul diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 2209f75..aac85c3 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -522,6 +522,8 @@ static dma_cookie_t fsldma_run_tx_complete_actions(struct fsldma_chan *chan, chan_dbg(chan, "LD %p callback\n", desc); txd->callback(txd->callback_param); } + + dma_descriptor_unmap(txd); } /* Run any dependencies */ -- cgit v0.10.2 From 0fc6fa2924d0dd54aa5c780a964c2812acf55ded Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Fri, 19 Feb 2016 16:22:43 +0100 Subject: irqchip/gic-v3: Always return IRQ_SET_MASK_OK_DONE in gic_set_affinity Always return IRQ_SET_MASK_OK_DONE instead of IRQ_SET_MASK_OK when the affinity has been updated. When using stacked irqchips, returning IRQ_SET_MASK_OK_DONE means skipping all descendant irqchips. Signed-off-by: Antoine Tenart Acked-by: Marc Zyngier Signed-off-by: Marc Zyngier diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index dd16a60..5b7d3c2 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -640,7 +640,7 @@ static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val, else gic_dist_wait_for_rwp(); - return IRQ_SET_MASK_OK; + return IRQ_SET_MASK_OK_DONE; } #else #define gic_set_affinity NULL -- cgit v0.10.2 From e6b78f2c3e14a9e3a909be3e6ec305d9f1cbabbd Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Fri, 19 Feb 2016 16:22:44 +0100 Subject: irqchip: Add the Alpine MSIX interrupt controller This patch adds the Alpine MSIX interrupt controller driver. 
Signed-off-by: Antoine Tenart Signed-off-by: Tsahee Zidenberg Acked-by: Marc Zyngier Signed-off-by: Marc Zyngier diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 00bbec6..7e8c441 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -65,6 +65,12 @@ config ARMADA_370_XP_IRQ select GENERIC_IRQ_CHIP select PCI_MSI_IRQ_DOMAIN if PCI_MSI +config ALPINE_MSI + bool + depends on PCI && PCI_MSI + select GENERIC_IRQ_CHIP + select PCI_MSI_IRQ_DOMAIN + config ATMEL_AIC_IRQ bool select GENERIC_IRQ_CHIP diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 8698710..b03cfcb 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_IRQCHIP) += irqchip.o +obj-$(CONFIG_ALPINE_MSI) += irq-alpine-msi.o obj-$(CONFIG_ATH79) += irq-ath79-cpu.o obj-$(CONFIG_ATH79) += irq-ath79-misc.o obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2835.o diff --git a/drivers/irqchip/irq-alpine-msi.c b/drivers/irqchip/irq-alpine-msi.c new file mode 100644 index 0000000..f871272 --- /dev/null +++ b/drivers/irqchip/irq-alpine-msi.c @@ -0,0 +1,293 @@ +/* + * Annapurna Labs MSIX support services + * + * Copyright (C) 2016, Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Antoine Tenart + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* MSIX message address format: local GIC target */ +#define ALPINE_MSIX_SPI_TARGET_CLUSTER0 BIT(16) + +struct alpine_msix_data { + spinlock_t msi_map_lock; + phys_addr_t addr; + u32 spi_first; /* The SGI number that MSIs start */ + u32 num_spis; /* The number of SGIs for MSIs */ + unsigned long *msi_map; +}; + +static void alpine_msix_mask_msi_irq(struct irq_data *d) +{ + pci_msi_mask_irq(d); + irq_chip_mask_parent(d); +} + +static void alpine_msix_unmask_msi_irq(struct irq_data *d) +{ + pci_msi_unmask_irq(d); + irq_chip_unmask_parent(d); +} + +static struct irq_chip alpine_msix_irq_chip = { + .name = "MSIx", + .irq_mask = alpine_msix_mask_msi_irq, + .irq_unmask = alpine_msix_unmask_msi_irq, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_affinity = irq_chip_set_affinity_parent, +}; + +static int alpine_msix_allocate_sgi(struct alpine_msix_data *priv, int num_req) +{ + int first; + + spin_lock(&priv->msi_map_lock); + + first = bitmap_find_next_zero_area(priv->msi_map, priv->num_spis, 0, + num_req, 0); + if (first >= priv->num_spis) { + spin_unlock(&priv->msi_map_lock); + return -ENOSPC; + } + + bitmap_set(priv->msi_map, first, num_req); + + spin_unlock(&priv->msi_map_lock); + + return priv->spi_first + first; +} + +static void alpine_msix_free_sgi(struct alpine_msix_data *priv, unsigned sgi, + int num_req) +{ + int first = sgi - priv->spi_first; + + spin_lock(&priv->msi_map_lock); + + bitmap_clear(priv->msi_map, first, num_req); + + spin_unlock(&priv->msi_map_lock); +} + +static void alpine_msix_compose_msi_msg(struct irq_data *data, + struct msi_msg *msg) +{ + struct alpine_msix_data *priv = irq_data_get_irq_chip_data(data); + phys_addr_t msg_addr = priv->addr; + + msg_addr |= (data->hwirq << 3); + + msg->address_hi = upper_32_bits(msg_addr); + msg->address_lo = lower_32_bits(msg_addr); + msg->data = 0; +} + +static struct msi_domain_info alpine_msix_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS 
| MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_PCI_MSIX, + .chip = &alpine_msix_irq_chip, +}; + +static struct irq_chip middle_irq_chip = { + .name = "alpine_msix_middle", + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_affinity = irq_chip_set_affinity_parent, + .irq_compose_msi_msg = alpine_msix_compose_msi_msg, +}; + +static int alpine_msix_gic_domain_alloc(struct irq_domain *domain, + unsigned int virq, int sgi) +{ + struct irq_fwspec fwspec; + struct irq_data *d; + int ret; + + if (!is_of_node(domain->parent->fwnode)) + return -EINVAL; + + fwspec.fwnode = domain->parent->fwnode; + fwspec.param_count = 3; + fwspec.param[0] = 0; + fwspec.param[1] = sgi; + fwspec.param[2] = IRQ_TYPE_EDGE_RISING; + + ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec); + if (ret) + return ret; + + d = irq_domain_get_irq_data(domain->parent, virq); + d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING); + + return 0; +} + +static int alpine_msix_middle_domain_alloc(struct irq_domain *domain, + unsigned int virq, + unsigned int nr_irqs, void *args) +{ + struct alpine_msix_data *priv = domain->host_data; + int sgi, err, i; + + sgi = alpine_msix_allocate_sgi(priv, nr_irqs); + if (sgi < 0) + return sgi; + + for (i = 0; i < nr_irqs; i++) { + err = alpine_msix_gic_domain_alloc(domain, virq + i, sgi + i); + if (err) + goto err_sgi; + + irq_domain_set_hwirq_and_chip(domain, virq + i, sgi + i, + &middle_irq_chip, priv); + } + + return 0; + +err_sgi: + while (--i >= 0) + irq_domain_free_irqs_parent(domain, virq, i); + alpine_msix_free_sgi(priv, sgi, nr_irqs); + return err; +} + +static void alpine_msix_middle_domain_free(struct irq_domain *domain, + unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct alpine_msix_data *priv = irq_data_get_irq_chip_data(d); + + irq_domain_free_irqs_parent(domain, virq, nr_irqs); + alpine_msix_free_sgi(priv, d->hwirq, nr_irqs); +} + +static const struct irq_domain_ops alpine_msix_middle_domain_ops = { + .alloc = alpine_msix_middle_domain_alloc, + .free = alpine_msix_middle_domain_free, +}; + +static int alpine_msix_init_domains(struct alpine_msix_data *priv, + struct device_node *node) +{ + struct irq_domain *middle_domain, *msi_domain, *gic_domain; + struct device_node *gic_node; + + gic_node = of_irq_find_parent(node); + if (!gic_node) { + pr_err("Failed to find the GIC node\n"); + return -ENODEV; + } + + gic_domain = irq_find_host(gic_node); + if (!gic_domain) { + pr_err("Failed to find the GIC domain\n"); + return -ENXIO; + } + + middle_domain = irq_domain_add_tree(NULL, + &alpine_msix_middle_domain_ops, + priv); + if (!middle_domain) { + pr_err("Failed to create the MSIX middle domain\n"); + return -ENOMEM; + } + + middle_domain->parent = gic_domain; + + msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(node), + &alpine_msix_domain_info, + middle_domain); + if (!msi_domain) { + pr_err("Failed to create MSI domain\n"); + irq_domain_remove(msi_domain); + return -ENOMEM; + } + + return 0; +} + +static int alpine_msix_init(struct device_node *node, + struct device_node *parent) +{ + struct alpine_msix_data *priv; + struct resource res; + int ret; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + spin_lock_init(&priv->msi_map_lock); + + ret = of_address_to_resource(node, 0, &res); + if (ret) { + pr_err("Failed to allocate resource\n"); + goto err_priv; + } + + /* + * The 20 least significant bits of addr provide 
direct information + * regarding the interrupt destination. + * + * To select the primary GIC as the target GIC, bits [18:17] must be set + * to 0x0. In this case, bit 16 (SPI_TARGET_CLUSTER0) must be set. + */ + priv->addr = res.start & GENMASK_ULL(63,20); + priv->addr |= ALPINE_MSIX_SPI_TARGET_CLUSTER0; + + if (of_property_read_u32(node, "al,msi-base-spi", &priv->spi_first)) { + pr_err("Unable to parse MSI base\n"); + ret = -EINVAL; + goto err_priv; + } + + if (of_property_read_u32(node, "al,msi-num-spis", &priv->num_spis)) { + pr_err("Unable to parse MSI numbers\n"); + ret = -EINVAL; + goto err_priv; + } + + priv->msi_map = kzalloc(sizeof(*priv->msi_map) * BITS_TO_LONGS(priv->num_spis), + GFP_KERNEL); + if (!priv->msi_map) { + ret = -ENOMEM; + goto err_priv; + } + + pr_debug("Registering %d msixs, starting at %d\n", + priv->num_spis, priv->spi_first); + + ret = alpine_msix_init_domains(priv, node); + if (ret) + goto err_map; + + return 0; + +err_map: + kfree(priv->msi_map); +err_priv: + kfree(priv); + return ret; +} +IRQCHIP_DECLARE(alpine_msix, "al,alpine-msix", alpine_msix_init); -- cgit v0.10.2 From a13690297ce49262ae44e41b25a954124609eea8 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Fri, 19 Feb 2016 16:22:45 +0100 Subject: Documentation/bindings: Document the Alpine MSIX driver Following the addition of the Alpine MSIX driver, this patch adds the corresponding bindings documentation. Acked-by: Marc Zyngier Signed-off-by: Antoine Tenart Signed-off-by: Tsahee Zidenberg Signed-off-by: Marc Zyngier diff --git a/Documentation/devicetree/bindings/interrupt-controller/al,alpine-msix.txt b/Documentation/devicetree/bindings/interrupt-controller/al,alpine-msix.txt new file mode 100644 index 0000000..f6f1c14 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/al,alpine-msix.txt @@ -0,0 +1,26 @@ +Alpine MSIX controller + +See arm,gic-v3.txt for SPI and MSI definitions. + +Required properties: + +- compatible: should be "al,alpine-msix" +- reg: physical base address and size of the registers +- interrupt-parent: specifies the parent interrupt controller. +- interrupt-controller: identifies the node as an interrupt controller +- msi-controller: identifies the node as an PCI Message Signaled Interrupt + controller +- al,msi-base-spi: SPI base of the MSI frame +- al,msi-num-spis: number of SPIs assigned to the MSI frame, relative to SPI0 + +Example: + +msix: msix { + compatible = "al,alpine-msix"; + reg = <0x0 0xfbe00000 0x0 0x100000>; + interrupt-parent = <&gic>; + interrupt-controller; + msi-controller; + al,msi-base-spi = <160>; + al,msi-num-spis = <160>; +}; -- cgit v0.10.2 From 82b0a434b436f5da69ddd24bd6a6fa5dc4484310 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 22 Feb 2016 15:01:01 +0100 Subject: irqchip/gic/realview: Support more RealView DCC variants In the add-on file for the GIC dealing with the RealView family we currently only handle the PB11MPCore, let's extend this to manage the RealView EB ARM11MPCore as well. The Revision B of the ARM11MPCore core tile is a bit special and needs special handling as it moves a system control register around at random. 
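(The diff below copes with the wandering register by keying the PLD_CTRL1 offset off the OF match data instead of hard-coding it; roughly this pattern, illustrative only, with placeholder compatibles and names:)

    #include <linux/errno.h>
    #include <linux/of.h>

    static const struct of_device_id demo_syscon_match[] = {
            { .compatible = "vendor,board-revb", .data = (void *)0xD8 },
            { .compatible = "vendor,board-revc", .data = (void *)0x74 },
            { /* sentinel */ }
    };

    static int demo_pld_ctrl1_offset(u32 *offset)
    {
            const struct of_device_id *id;
            struct device_node *np;

            np = of_find_matching_node_and_match(NULL, demo_syscon_match, &id);
            if (!np)
                    return -ENODEV;

            *offset = (u32)(unsigned long)id->data;
            of_node_put(np);
            return 0;
    }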
Cc: Arnd Bergmann Cc: devicetree@vger.kernel.org Acked-by: Marc Zyngier Signed-off-by: Linus Walleij Signed-off-by: Marc Zyngier diff --git a/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt b/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt index 5a1cb4b..793c20f 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/arm,gic.txt @@ -16,6 +16,7 @@ Main node required properties: "arm,cortex-a15-gic" "arm,cortex-a7-gic" "arm,cortex-a9-gic" + "arm,eb11mp-gic" "arm,gic-400" "arm,pl390" "arm,tc11mp-gic" diff --git a/drivers/irqchip/irq-gic-realview.c b/drivers/irqchip/irq-gic-realview.c index aa46eb2..54c2964 100644 --- a/drivers/irqchip/irq-gic-realview.c +++ b/drivers/irqchip/irq-gic-realview.c @@ -10,7 +10,8 @@ #include #define REALVIEW_SYS_LOCK_OFFSET 0x20 -#define REALVIEW_PB11MP_SYS_PLD_CTRL1 0x74 +#define REALVIEW_SYS_PLD_CTRL1 0x74 +#define REALVIEW_EB_REVB_SYS_PLD_CTRL1 0xD8 #define VERSATILE_LOCK_VAL 0xA05F #define PLD_INTMODE_MASK BIT(22)|BIT(23)|BIT(24) #define PLD_INTMODE_LEGACY 0x0 @@ -18,26 +19,57 @@ #define PLD_INTMODE_NEW_NO_DCC BIT(23) #define PLD_INTMODE_FIQ_ENABLE BIT(24) +/* For some reason RealView EB Rev B moved this register */ +static const struct of_device_id syscon_pldset_of_match[] = { + { + .compatible = "arm,realview-eb11mp-revb-syscon", + .data = (void *)REALVIEW_EB_REVB_SYS_PLD_CTRL1, + }, + { + .compatible = "arm,realview-eb11mp-revc-syscon", + .data = (void *)REALVIEW_SYS_PLD_CTRL1, + }, + { + .compatible = "arm,realview-eb-syscon", + .data = (void *)REALVIEW_SYS_PLD_CTRL1, + }, + { + .compatible = "arm,realview-pb11mp-syscon", + .data = (void *)REALVIEW_SYS_PLD_CTRL1, + }, + {}, +}; + static int __init realview_gic_of_init(struct device_node *node, struct device_node *parent) { static struct regmap *map; + struct device_node *np; + const struct of_device_id *gic_id; + u32 pld1_ctrl; + + np = of_find_matching_node_and_match(NULL, syscon_pldset_of_match, + &gic_id); + if (!np) + return -ENODEV; + pld1_ctrl = (u32)gic_id->data; /* The PB11MPCore GIC needs to be configured in the syscon */ - map = syscon_regmap_lookup_by_compatible("arm,realview-pb11mp-syscon"); + map = syscon_node_to_regmap(np); if (!IS_ERR(map)) { /* new irq mode with no DCC */ regmap_write(map, REALVIEW_SYS_LOCK_OFFSET, VERSATILE_LOCK_VAL); - regmap_update_bits(map, REALVIEW_PB11MP_SYS_PLD_CTRL1, + regmap_update_bits(map, pld1_ctrl, PLD_INTMODE_NEW_NO_DCC, PLD_INTMODE_MASK); regmap_write(map, REALVIEW_SYS_LOCK_OFFSET, 0x0000); - pr_info("TC11MP GIC: set up interrupt controller to NEW mode, no DCC\n"); + pr_info("RealView GIC: set up interrupt controller to NEW mode, no DCC\n"); } else { - pr_err("TC11MP GIC setup: could not find syscon\n"); - return -ENXIO; + pr_err("RealView GIC setup: could not find syscon\n"); + return -ENODEV; } return gic_of_init(node, parent); } IRQCHIP_DECLARE(armtc11mp_gic, "arm,tc11mp-gic", realview_gic_of_init); +IRQCHIP_DECLARE(armeb11mp_gic, "arm,eb11mp-gic", realview_gic_of_init); -- cgit v0.10.2 From 3a99e6db539e53cc9c79282e80f8362b0cb96ac8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Mar 2016 10:40:01 +0100 Subject: perf bench mem: Prepare the x86-64 build for upstream memcpy_mcsafe() changes The following upcoming upstream commit: 92b0729c34ca ("x86/mm, x86/mce: Add memcpy_mcsafe()") Adds _ASM_EXTABLE_FAULT(), which is not available in user-space and breaks the build. 
We don't really need _ASM_EXTABLE_FAULT() in user-space, so simply wrap it to nothing. Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Hitoshi Mitake Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S index e4c2c30..5c3cce0 100644 --- a/tools/perf/bench/mem-memcpy-x86-64-asm.S +++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S @@ -1,6 +1,11 @@ + +/* Various wrappers to make the kernel .S file build in user-space: */ + #define memcpy MEMCPY /* don't hide glibc's memcpy() */ #define altinstr_replacement text #define globl p2align 4; .globl +#define _ASM_EXTABLE_FAULT(x, y) + #include "../../../arch/x86/lib/memcpy_64.S" /* * We need to provide note.GNU-stack section, saying that we want -- cgit v0.10.2 From 313f636d5c490c9741d3f750dc8da33029edbc6b Mon Sep 17 00:00:00 2001 From: David Matlack Date: Tue, 8 Mar 2016 16:19:44 -0800 Subject: kvm: cap halt polling at exactly halt_poll_ns When growing halt-polling, there is no check that the poll time exceeds the limit. It's possible for vcpu->halt_poll_ns grow once past halt_poll_ns, and stay there until a halt which takes longer than vcpu->halt_poll_ns. For example, booting a Linux guest with halt_poll_ns=11000: ... kvm:kvm_halt_poll_ns: vcpu 0: halt_poll_ns 0 (shrink 10000) ... kvm:kvm_halt_poll_ns: vcpu 0: halt_poll_ns 10000 (grow 0) ... kvm:kvm_halt_poll_ns: vcpu 0: halt_poll_ns 20000 (grow 10000) Signed-off-by: David Matlack Fixes: aca6ff29c4063a8d467cdee241e6b3bf7dc4a171 Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a11cfd2..9102ae1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1952,6 +1952,9 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) else val *= halt_poll_ns_grow; + if (val > halt_poll_ns) + val = halt_poll_ns; + vcpu->halt_poll_ns = val; trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); } -- cgit v0.10.2 From f363938c70a04e6bc99023a5e0c44ef7879b903f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 21 Jan 2016 15:24:31 -0800 Subject: x86/fpu: Fix 'no387' regression After fixing FPU option parsing, we now parse the 'no387' boot option too early: no387 clears X86_FEATURE_FPU before it's even probed, so the boot CPU promptly re-enables it. I suspect it gets even more confused on SMP. Fix the probing code to leave X86_FEATURE_FPU off if it's been disabled by setup_clear_cpu_cap(). Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Cc: yu-cheng yu Fixes: 4f81cbafcce2 ("x86/fpu: Fix early FPU command-line parsing") Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 6d9f0a7..d53ab3d 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -78,13 +78,15 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c) cr0 &= ~(X86_CR0_TS | X86_CR0_EM); write_cr0(cr0); - asm volatile("fninit ; fnstsw %0 ; fnstcw %1" - : "+m" (fsw), "+m" (fcw)); + if (!test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { + asm volatile("fninit ; fnstsw %0 ; fnstcw %1" + : "+m" (fsw), "+m" (fcw)); - if (fsw == 0 && (fcw & 0x103f) == 0x003f) - set_cpu_cap(c, X86_FEATURE_FPU); - else - clear_cpu_cap(c, X86_FEATURE_FPU); + if (fsw == 0 && (fcw & 0x103f) == 0x003f) + set_cpu_cap(c, X86_FEATURE_FPU); + else + clear_cpu_cap(c, X86_FEATURE_FPU); + } #ifndef CONFIG_MATH_EMULATION if (!cpu_has_fpu) { -- cgit v0.10.2 From 46dad054a19297af65c417c97cb920aa5bdf7e8c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 7 Mar 2016 18:48:45 -0300 Subject: perf jitdump: DWARF is also needed While building on a Docker container for ubuntu and installing package by package one ends up with: MKDIR /tmp/build/util/ CC /tmp/build/util/genelf.o util/genelf.c:22:19: fatal error: dwarf.h: No such file or directory #include ^ compilation terminated. mv: cannot stat '/tmp/build/util/.genelf.o.tmp': No such file or directory Because the jitdump code needs the DWARF related development packages to be installed. So make it dependent on that so that the build can succeed without jitdump support. Cc: Adrian Hunter Cc: Stephane Eranian Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: http://lkml.kernel.org/n/tip-le498robnmxd40237wej3w62@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index b288577..e219ed4 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -73,7 +73,7 @@ static int perf_event__repipe_oe_synth(struct perf_tool *tool, return perf_event__repipe_synth(tool, event); } -#ifdef HAVE_LIBELF_SUPPORT +#if defined(HAVE_LIBELF_SUPPORT) && defined(HAVE_DWARF_SUPPORT) static int perf_event__drop_oe(struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct ordered_events *oe __maybe_unused) @@ -245,7 +245,7 @@ static int perf_event__repipe_mmap(struct perf_tool *tool, return err; } -#ifdef HAVE_LIBELF_SUPPORT +#if defined(HAVE_LIBELF_SUPPORT) && defined(HAVE_DWARF_SUPPORT) static int perf_event__jit_repipe_mmap(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, @@ -283,7 +283,7 @@ static int perf_event__repipe_mmap2(struct perf_tool *tool, return err; } -#ifdef HAVE_LIBELF_SUPPORT +#if defined(HAVE_LIBELF_SUPPORT) && defined(HAVE_DWARF_SUPPORT) static int perf_event__jit_repipe_mmap2(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, @@ -795,7 +795,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused) "perf inject []", NULL }; -#ifndef HAVE_LIBELF_SUPPORT +#if !defined(HAVE_LIBELF_SUPPORT) || !defined(HAVE_DWARF_SUPPORT) set_option_nobuild(options, 'j', "jit", "NO_LIBELF=1", true); #endif argc = parse_options(argc, argv, options, inject_usage, 0); @@ -833,7 +833,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix 
__maybe_unused) inject.tool.ordered_events = true; inject.tool.ordering_requires_timestamps = true; } -#ifdef HAVE_LIBELF_SUPPORT +#if defined(HAVE_LIBELF_SUPPORT) && defined(HAVE_DWARF_SUPPORT) if (inject.jit_mode) { inject.tool.mmap2 = perf_event__jit_repipe_mmap2; inject.tool.mmap = perf_event__jit_repipe_mmap; diff --git a/tools/perf/util/Build b/tools/perf/util/Build index df2b690..f130ce2 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -107,9 +107,12 @@ libperf-y += scripting-engines/ libperf-$(CONFIG_ZLIB) += zlib.o libperf-$(CONFIG_LZMA) += lzma.o libperf-y += demangle-java.o + +ifdef CONFIG_DWARF libperf-$(CONFIG_LIBELF) += jitdump.o libperf-$(CONFIG_LIBELF) += genelf.o libperf-$(CONFIG_LIBELF) += genelf_debug.o +endif CFLAGS_config.o += -DETC_PERFCONFIG="BUILD_STR($(ETC_PERFCONFIG_SQ))" # avoid compiler warnings in 32-bit mode -- cgit v0.10.2 From 616df645d7238e45d3b369933a30fee4e4e305e2 Mon Sep 17 00:00:00 2001 From: Chris Phlipot Date: Tue, 8 Mar 2016 21:11:54 -0800 Subject: perf tools: Fix perf script python database export crash Remove the union in evsel so that the database id and priv pointer can be used simultainously without conflicting and crashing. Detailed Description for the fixed bug follows: perf script crashes with a segmentation fault on user space tool version 4.5.rc7.ge2857b when using the python database export API. It works properly in 4.4 and prior versions. the crash fist appeared in: cfc8874a4859 ("perf script: Process cpu/threads maps") How to reproduce the bug: Remove any temporary files left over from a previous crash (if you have already attemped to reproduce the bug): $ rm -r test_db-perf-data $ dropdb test_db $ perf record timeout 1 yes >/dev/null $ perf script -s scripts/python/export-to-postgresql.py test_db Stack Trace: Program received signal SIGSEGV, Segmentation fault. __GI___libc_free (mem=0x1) at malloc.c:2929 2929 malloc.c: No such file or directory. (gdb) bt at util/stat.c:122 argv=, prefix=) at builtin-script.c:2231 argc=argc@entry=4, argv=argv@entry=0x7fffffffdf70) at perf.c:390 at perf.c:451 Signed-off-by: Chris Phlipot Acked-by: Jiri Olsa Cc: Adrian Hunter Cc: Peter Zijlstra Fixes: cfc8874a4859 ("perf script: Process cpu/threads maps") Link: http://lkml.kernel.org/r/1457500314-8912-1-git-send-email-cphlipot0@gmail.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index efad78f..501ea6e 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -93,10 +93,8 @@ struct perf_evsel { const char *unit; struct event_format *tp_format; off_t id_offset; - union { - void *priv; - u64 db_id; - }; + void *priv; + u64 db_id; struct cgroup_sel *cgrp; void *handler; struct cpu_map *cpus; -- cgit v0.10.2 From d7b617f51be4fffa3cbb5adf6d4258e616dce294 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 9 Mar 2016 11:04:17 +0100 Subject: perf tools: Pass perf_hpp_list all the way through setup_sort_list Pass perf_hpp_list all the way through setup_sort_list so that the sort entry can be added on the arbitrary list. 
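(Reduced to a toy shape, nothing below is perf API and the names are invented: the helpers stop appending to the single global list and instead take the destination list as an explicit argument, which is what lets a caller build an arbitrary sort list.)

    #include <linux/list.h>

    struct demo_hpp_list {
            struct list_head sorts;
    };

    /*
     * Previously the entry went onto one implicit global list; now the
     * caller chooses which list receives it.
     */
    static void demo_register_sort_field(struct demo_hpp_list *list,
                                         struct list_head *entry)
    {
            list_add_tail(entry, &list->sorts);
    }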
Signed-off-by: Jiri Olsa Acked-by: Namhyung Kim Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20160309100417.GA30910@krava.redhat.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 041f236..59a101e 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1614,19 +1614,21 @@ int hist_entry__filter(struct hist_entry *he, int type, const void *arg) return hse->se->se_filter(he, type, arg); } -static int __sort_dimension__add_hpp_sort(struct sort_dimension *sd, int level) +static int __sort_dimension__add_hpp_sort(struct sort_dimension *sd, + struct perf_hpp_list *list, + int level) { struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd, level); if (hse == NULL) return -1; - perf_hpp__register_sort_field(&hse->hpp); + perf_hpp_list__register_sort_field(list, &hse->hpp); return 0; } -static int __sort_dimension__add_hpp_output(struct perf_hpp_list *list, - struct sort_dimension *sd) +static int __sort_dimension__add_hpp_output(struct sort_dimension *sd, + struct perf_hpp_list *list) { struct hpp_sort_entry *hse = __sort_dimension__alloc_hpp(sd, 0); @@ -2147,12 +2149,14 @@ out: return ret; } -static int __sort_dimension__add(struct sort_dimension *sd, int level) +static int __sort_dimension__add(struct sort_dimension *sd, + struct perf_hpp_list *list, + int level) { if (sd->taken) return 0; - if (__sort_dimension__add_hpp_sort(sd, level) < 0) + if (__sort_dimension__add_hpp_sort(sd, list, level) < 0) return -1; if (sd->entry->se_collapse) @@ -2163,7 +2167,9 @@ static int __sort_dimension__add(struct sort_dimension *sd, int level) return 0; } -static int __hpp_dimension__add(struct hpp_dimension *hd, int level) +static int __hpp_dimension__add(struct hpp_dimension *hd, + struct perf_hpp_list *list, + int level) { struct perf_hpp_fmt *fmt; @@ -2175,7 +2181,7 @@ static int __hpp_dimension__add(struct hpp_dimension *hd, int level) return -1; hd->taken = 1; - perf_hpp__register_sort_field(fmt); + perf_hpp_list__register_sort_field(list, fmt); return 0; } @@ -2185,7 +2191,7 @@ static int __sort_dimension__add_output(struct perf_hpp_list *list, if (sd->taken) return 0; - if (__sort_dimension__add_hpp_output(list, sd) < 0) + if (__sort_dimension__add_hpp_output(sd, list) < 0) return -1; sd->taken = 1; @@ -2215,7 +2221,8 @@ int hpp_dimension__add_output(unsigned col) return __hpp_dimension__add_output(&perf_hpp_list, &hpp_sort_dimensions[col]); } -static int sort_dimension__add(const char *tok, struct perf_evlist *evlist, +static int sort_dimension__add(struct perf_hpp_list *list, const char *tok, + struct perf_evlist *evlist __maybe_unused, int level) { unsigned int i; @@ -2255,7 +2262,7 @@ static int sort_dimension__add(const char *tok, struct perf_evlist *evlist, sort__has_thread = 1; } - return __sort_dimension__add(sd, level); + return __sort_dimension__add(sd, list, level); } for (i = 0; i < ARRAY_SIZE(hpp_sort_dimensions); i++) { @@ -2264,7 +2271,7 @@ static int sort_dimension__add(const char *tok, struct perf_evlist *evlist, if (strncasecmp(tok, hd->name, strlen(tok))) continue; - return __hpp_dimension__add(hd, level); + return __hpp_dimension__add(hd, list, level); } for (i = 0; i < ARRAY_SIZE(bstack_sort_dimensions); i++) { @@ -2279,7 +2286,7 @@ static int sort_dimension__add(const char *tok, struct perf_evlist *evlist, if (sd->entry == &sort_sym_from || sd->entry == &sort_sym_to) sort__has_sym = 1; - __sort_dimension__add(sd, level); + __sort_dimension__add(sd, list, level); return 
0; } @@ -2295,7 +2302,7 @@ static int sort_dimension__add(const char *tok, struct perf_evlist *evlist, if (sd->entry == &sort_mem_daddr_sym) sort__has_sym = 1; - __sort_dimension__add(sd, level); + __sort_dimension__add(sd, list, level); return 0; } @@ -2305,7 +2312,8 @@ static int sort_dimension__add(const char *tok, struct perf_evlist *evlist, return -ESRCH; } -static int setup_sort_list(char *str, struct perf_evlist *evlist) +static int setup_sort_list(struct perf_hpp_list *list, char *str, + struct perf_evlist *evlist) { char *tmp, *tok; int ret = 0; @@ -2332,7 +2340,7 @@ static int setup_sort_list(char *str, struct perf_evlist *evlist) } if (*tok) { - ret = sort_dimension__add(tok, evlist, level); + ret = sort_dimension__add(list, tok, evlist, level); if (ret == -EINVAL) { error("Invalid --sort key: `%s'", tok); break; @@ -2480,7 +2488,7 @@ static int __setup_sorting(struct perf_evlist *evlist) } } - ret = setup_sort_list(str, evlist); + ret = setup_sort_list(&perf_hpp_list, str, evlist); free(str); return ret; @@ -2725,7 +2733,7 @@ int setup_sorting(struct perf_evlist *evlist) return err; if (parent_pattern != default_parent_pattern) { - err = sort_dimension__add("parent", evlist, -1); + err = sort_dimension__add(&perf_hpp_list, "parent", evlist, -1); if (err < 0) return err; } -- cgit v0.10.2 From ea8f75f981918c5946fc4029acdc86707fa901c1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 8 Mar 2016 19:42:30 +0100 Subject: perf tools: Omit unnecessary cast in perf_pmu__parse_scale There's no need to use a const char pointer, we can used char pointer from the beginning and omit the unnecessary cast. Reported-by: Ingo Molnar Signed-off-by: Jiri Olsa Cc: David Ahern Cc: H. Peter Anvin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160308184230.GB7897@krava.redhat.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index d8cd038..adef23b 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -98,7 +98,7 @@ static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char * char scale[128]; int fd, ret = -1; char path[PATH_MAX]; - const char *lc; + char *lc; snprintf(path, PATH_MAX, "%s/%s.scale", dir, name); @@ -146,7 +146,7 @@ static int perf_pmu__parse_scale(struct perf_pmu_alias *alias, char *dir, char * /* restore locale */ setlocale(LC_NUMERIC, lc); - free((char *) lc); + free(lc); ret = 0; error: -- cgit v0.10.2 From 8b30a8b3c636a155bab9176ad209964c9c22252d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 9 Mar 2016 14:48:21 +0100 Subject: x86/defconfigs/32: Set CONFIG_FRAME_WARN to the Kconfig default Sync it to the Kconfig default for 32-bit. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: tim.gardner@canonical.com Link: http://lkml.kernel.org/r/20160309134821.GD6564@pd.tnic Signed-off-by: Ingo Molnar diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 028be48..e25a163 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -288,7 +288,7 @@ CONFIG_NLS_ISO8859_1=y CONFIG_NLS_UTF8=y CONFIG_PRINTK_TIME=y # CONFIG_ENABLE_WARN_DEPRECATED is not set -CONFIG_FRAME_WARN=2048 +CONFIG_FRAME_WARN=1024 CONFIG_MAGIC_SYSRQ=y # CONFIG_UNUSED_SYMBOLS is not set CONFIG_DEBUG_KERNEL=y -- cgit v0.10.2 From f6e45661f9be546811b62b2b01f32f4bf0c436c0 Mon Sep 17 00:00:00 2001 From: "Luis R. 
Rodriguez" Date: Fri, 22 Jan 2016 18:34:22 -0800 Subject: dma, mm/pat: Rename dma_*_writecombine() to dma_*_wc() Rename dma_*_writecombine() to dma_*_wc(), so that the naming is coherent across the various write-combining APIs. Keep the old names for compatibility for a while, these can be removed at a later time. A guard is left to enable backporting of the rename, and later remove of the old mapping defines seemlessly. Build tested successfully with allmodconfig. The following Coccinelle SmPL patch was used for this simple transformation: @ rename_dma_alloc_writecombine @ expression dev, size, dma_addr, gfp; @@ -dma_alloc_writecombine(dev, size, dma_addr, gfp) +dma_alloc_wc(dev, size, dma_addr, gfp) @ rename_dma_free_writecombine @ expression dev, size, cpu_addr, dma_addr; @@ -dma_free_writecombine(dev, size, cpu_addr, dma_addr) +dma_free_wc(dev, size, cpu_addr, dma_addr) @ rename_dma_mmap_writecombine @ expression dev, vma, cpu_addr, dma_addr, size; @@ -dma_mmap_writecombine(dev, vma, cpu_addr, dma_addr, size) +dma_mmap_wc(dev, vma, cpu_addr, dma_addr, size) We also keep the old names as compatibility helpers, and guard against their definition to make backporting easier. Generated-by: Coccinelle SmPL Suggested-by: Ingo Molnar Signed-off-by: Luis R. Rodriguez Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: airlied@linux.ie Cc: akpm@linux-foundation.org Cc: benh@kernel.crashing.org Cc: bhelgaas@google.com Cc: bp@suse.de Cc: dan.j.williams@intel.com Cc: daniel.vetter@ffwll.ch Cc: dhowells@redhat.com Cc: julia.lawall@lip6.fr Cc: konrad.wilk@oracle.com Cc: linux-fbdev@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: luto@amacapital.net Cc: mst@redhat.com Cc: tomi.valkeinen@ti.com Cc: toshi.kani@hp.com Cc: vinod.koul@intel.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/1453516462-4844-1-git-send-email-mcgrof@do-not-panic.com Signed-off-by: Ingo Molnar diff --git a/arch/arm/mach-lpc32xx/phy3250.c b/arch/arm/mach-lpc32xx/phy3250.c index 77d6b1b..ee06fab 100644 --- a/arch/arm/mach-lpc32xx/phy3250.c +++ b/arch/arm/mach-lpc32xx/phy3250.c @@ -86,8 +86,8 @@ static int lpc32xx_clcd_setup(struct clcd_fb *fb) { dma_addr_t dma; - fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev, - PANEL_SIZE, &dma, GFP_KERNEL); + fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, PANEL_SIZE, &dma, + GFP_KERNEL); if (!fb->fb.screen_base) { printk(KERN_ERR "CLCD: unable to map framebuffer\n"); return -ENOMEM; @@ -116,15 +116,14 @@ static int lpc32xx_clcd_setup(struct clcd_fb *fb) static int lpc32xx_clcd_mmap(struct clcd_fb *fb, struct vm_area_struct *vma) { - return dma_mmap_writecombine(&fb->dev->dev, vma, - fb->fb.screen_base, fb->fb.fix.smem_start, - fb->fb.fix.smem_len); + return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base, + fb->fb.fix.smem_start, fb->fb.fix.smem_len); } static void lpc32xx_clcd_remove(struct clcd_fb *fb) { - dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len, - fb->fb.screen_base, fb->fb.fix.smem_start); + dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base, + fb->fb.fix.smem_start); } /* diff --git a/arch/arm/mach-netx/fb.c b/arch/arm/mach-netx/fb.c index d122ee6..8814ee5 100644 --- a/arch/arm/mach-netx/fb.c +++ b/arch/arm/mach-netx/fb.c @@ -42,8 +42,8 @@ int netx_clcd_setup(struct clcd_fb *fb) fb->panel = netx_panel; - fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev, 1024*1024, - &dma, GFP_KERNEL); + fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, 1024 * 1024, &dma, + GFP_KERNEL); if (!fb->fb.screen_base) { 
printk(KERN_ERR "CLCD: unable to map framebuffer\n"); return -ENOMEM; @@ -57,16 +57,14 @@ int netx_clcd_setup(struct clcd_fb *fb) int netx_clcd_mmap(struct clcd_fb *fb, struct vm_area_struct *vma) { - return dma_mmap_writecombine(&fb->dev->dev, vma, - fb->fb.screen_base, - fb->fb.fix.smem_start, - fb->fb.fix.smem_len); + return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base, + fb->fb.fix.smem_start, fb->fb.fix.smem_len); } void netx_clcd_remove(struct clcd_fb *fb) { - dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len, - fb->fb.screen_base, fb->fb.fix.smem_start); + dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base, + fb->fb.fix.smem_start); } static AMBA_AHB_DEVICE(fb, "fb", 0, 0x00104000, { NETX_IRQ_LCD }, NULL); diff --git a/arch/arm/mach-nspire/clcd.c b/arch/arm/mach-nspire/clcd.c index abea126..ea0e5b2 100644 --- a/arch/arm/mach-nspire/clcd.c +++ b/arch/arm/mach-nspire/clcd.c @@ -90,8 +90,8 @@ int nspire_clcd_setup(struct clcd_fb *fb) panel_size = ((panel->mode.xres * panel->mode.yres) * panel->bpp) / 8; panel_size = ALIGN(panel_size, PAGE_SIZE); - fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev, - panel_size, &dma, GFP_KERNEL); + fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, panel_size, &dma, + GFP_KERNEL); if (!fb->fb.screen_base) { pr_err("CLCD: unable to map framebuffer\n"); @@ -107,13 +107,12 @@ int nspire_clcd_setup(struct clcd_fb *fb) int nspire_clcd_mmap(struct clcd_fb *fb, struct vm_area_struct *vma) { - return dma_mmap_writecombine(&fb->dev->dev, vma, - fb->fb.screen_base, fb->fb.fix.smem_start, - fb->fb.fix.smem_len); + return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base, + fb->fb.fix.smem_start, fb->fb.fix.smem_len); } void nspire_clcd_remove(struct clcd_fb *fb) { - dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len, - fb->fb.screen_base, fb->fb.fix.smem_start); + dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base, + fb->fb.fix.smem_start); } diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index e4f4312..f039cfa 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c @@ -1300,10 +1300,10 @@ static int iop_adma_probe(struct platform_device *pdev) * note: writecombine gives slightly better performance, but * requires that we explicitly flush the writes */ - adev->dma_desc_pool_virt = dma_alloc_writecombine(&pdev->dev, - plat_data->pool_size, - &adev->dma_desc_pool, - GFP_KERNEL); + adev->dma_desc_pool_virt = dma_alloc_wc(&pdev->dev, + plat_data->pool_size, + &adev->dma_desc_pool, + GFP_KERNEL); if (!adev->dma_desc_pool_virt) { ret = -ENOMEM; goto err_free_adev; diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index 14091f8..3922a5d 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -964,8 +964,8 @@ mv_xor_channel_add(struct mv_xor_device *xordev, * requires that we explicitly flush the writes */ mv_chan->dma_desc_pool_virt = - dma_alloc_writecombine(&pdev->dev, MV_XOR_POOL_SIZE, - &mv_chan->dma_desc_pool, GFP_KERNEL); + dma_alloc_wc(&pdev->dev, MV_XOR_POOL_SIZE, &mv_chan->dma_desc_pool, + GFP_KERNEL); if (!mv_chan->dma_desc_pool_virt) return ERR_PTR(-ENOMEM); diff --git a/drivers/dma/qcom_bam_dma.c b/drivers/dma/qcom_bam_dma.c index 5a250cd..d34aef7 100644 --- a/drivers/dma/qcom_bam_dma.c +++ b/drivers/dma/qcom_bam_dma.c @@ -502,8 +502,8 @@ static int bam_alloc_chan(struct dma_chan *chan) return 0; /* allocate FIFO descriptor space, but only if necessary */ - bchan->fifo_virt = dma_alloc_writecombine(bdev->dev, BAM_DESC_FIFO_SIZE, - &bchan->fifo_phys, GFP_KERNEL); + 
bchan->fifo_virt = dma_alloc_wc(bdev->dev, BAM_DESC_FIFO_SIZE, + &bchan->fifo_phys, GFP_KERNEL); if (!bchan->fifo_virt) { dev_err(bdev->dev, "Failed to allocate desc fifo\n"); @@ -538,8 +538,8 @@ static void bam_free_chan(struct dma_chan *chan) bam_reset_channel(bchan); spin_unlock_irqrestore(&bchan->vc.lock, flags); - dma_free_writecombine(bdev->dev, BAM_DESC_FIFO_SIZE, bchan->fifo_virt, - bchan->fifo_phys); + dma_free_wc(bdev->dev, BAM_DESC_FIFO_SIZE, bchan->fifo_virt, + bchan->fifo_phys); bchan->fifo_virt = NULL; /* mask irq for pipe/channel */ @@ -1231,9 +1231,9 @@ static int bam_dma_remove(struct platform_device *pdev) bam_dma_terminate_all(&bdev->channels[i].vc.chan); tasklet_kill(&bdev->channels[i].vc.task); - dma_free_writecombine(bdev->dev, BAM_DESC_FIFO_SIZE, - bdev->channels[i].fifo_virt, - bdev->channels[i].fifo_phys); + dma_free_wc(bdev->dev, BAM_DESC_FIFO_SIZE, + bdev->channels[i].fifo_virt, + bdev->channels[i].fifo_phys); } tasklet_kill(&bdev->task); diff --git a/drivers/gpu/drm/drm_gem_cma_helper.c b/drivers/gpu/drm/drm_gem_cma_helper.c index e5df53b..1f500a1 100644 --- a/drivers/gpu/drm/drm_gem_cma_helper.c +++ b/drivers/gpu/drm/drm_gem_cma_helper.c @@ -109,8 +109,8 @@ struct drm_gem_cma_object *drm_gem_cma_create(struct drm_device *drm, if (IS_ERR(cma_obj)) return cma_obj; - cma_obj->vaddr = dma_alloc_writecombine(drm->dev, size, - &cma_obj->paddr, GFP_KERNEL | __GFP_NOWARN); + cma_obj->vaddr = dma_alloc_wc(drm->dev, size, &cma_obj->paddr, + GFP_KERNEL | __GFP_NOWARN); if (!cma_obj->vaddr) { dev_err(drm->dev, "failed to allocate buffer with size %zu\n", size); @@ -192,8 +192,8 @@ void drm_gem_cma_free_object(struct drm_gem_object *gem_obj) cma_obj = to_drm_gem_cma_obj(gem_obj); if (cma_obj->vaddr) { - dma_free_writecombine(gem_obj->dev->dev, cma_obj->base.size, - cma_obj->vaddr, cma_obj->paddr); + dma_free_wc(gem_obj->dev->dev, cma_obj->base.size, + cma_obj->vaddr, cma_obj->paddr); } else if (gem_obj->import_attach) { drm_prime_gem_destroy(gem_obj, cma_obj->sgt); } @@ -324,9 +324,8 @@ static int drm_gem_cma_mmap_obj(struct drm_gem_cma_object *cma_obj, vma->vm_flags &= ~VM_PFNMAP; vma->vm_pgoff = 0; - ret = dma_mmap_writecombine(cma_obj->base.dev->dev, vma, - cma_obj->vaddr, cma_obj->paddr, - vma->vm_end - vma->vm_start); + ret = dma_mmap_wc(cma_obj->base.dev->dev, vma, cma_obj->vaddr, + cma_obj->paddr, vma->vm_end - vma->vm_start); if (ret) drm_gem_vm_close(vma); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c index a33162c..3c1ce44 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c @@ -1113,8 +1113,8 @@ struct etnaviv_cmdbuf *etnaviv_gpu_cmdbuf_new(struct etnaviv_gpu *gpu, u32 size, if (!cmdbuf) return NULL; - cmdbuf->vaddr = dma_alloc_writecombine(gpu->dev, size, &cmdbuf->paddr, - GFP_KERNEL); + cmdbuf->vaddr = dma_alloc_wc(gpu->dev, size, &cmdbuf->paddr, + GFP_KERNEL); if (!cmdbuf->vaddr) { kfree(cmdbuf); return NULL; @@ -1128,8 +1128,8 @@ struct etnaviv_cmdbuf *etnaviv_gpu_cmdbuf_new(struct etnaviv_gpu *gpu, u32 size, void etnaviv_gpu_cmdbuf_free(struct etnaviv_cmdbuf *cmdbuf) { - dma_free_writecombine(cmdbuf->gpu->dev, cmdbuf->size, - cmdbuf->vaddr, cmdbuf->paddr); + dma_free_wc(cmdbuf->gpu->dev, cmdbuf->size, cmdbuf->vaddr, + cmdbuf->paddr); kfree(cmdbuf); } diff --git a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c index dfebdc4..85dfe36 100644 --- a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c +++ b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c @@ 
-573,10 +573,9 @@ static int omap_dmm_remove(struct platform_device *dev) kfree(omap_dmm->engines); if (omap_dmm->refill_va) - dma_free_writecombine(omap_dmm->dev, - REFILL_BUFFER_SIZE * omap_dmm->num_engines, - omap_dmm->refill_va, - omap_dmm->refill_pa); + dma_free_wc(omap_dmm->dev, + REFILL_BUFFER_SIZE * omap_dmm->num_engines, + omap_dmm->refill_va, omap_dmm->refill_pa); if (omap_dmm->dummy_page) __free_page(omap_dmm->dummy_page); @@ -701,9 +700,9 @@ static int omap_dmm_probe(struct platform_device *dev) omap_dmm->dummy_pa = page_to_phys(omap_dmm->dummy_page); /* alloc refill memory */ - omap_dmm->refill_va = dma_alloc_writecombine(&dev->dev, - REFILL_BUFFER_SIZE * omap_dmm->num_engines, - &omap_dmm->refill_pa, GFP_KERNEL); + omap_dmm->refill_va = dma_alloc_wc(&dev->dev, + REFILL_BUFFER_SIZE * omap_dmm->num_engines, + &omap_dmm->refill_pa, GFP_KERNEL); if (!omap_dmm->refill_va) { dev_err(&dev->dev, "could not allocate refill memory\n"); goto fail; diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index 8495a1a..359b0d7 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c @@ -1330,8 +1330,8 @@ void omap_gem_free_object(struct drm_gem_object *obj) omap_gem_detach_pages(obj); if (!is_shmem(obj)) { - dma_free_writecombine(dev->dev, obj->size, - omap_obj->vaddr, omap_obj->paddr); + dma_free_wc(dev->dev, obj->size, omap_obj->vaddr, + omap_obj->paddr); } else if (omap_obj->vaddr) { vunmap(omap_obj->vaddr); } @@ -1395,8 +1395,8 @@ struct drm_gem_object *omap_gem_new(struct drm_device *dev, /* attempt to allocate contiguous memory if we don't * have DMM for remappign discontiguous buffers */ - omap_obj->vaddr = dma_alloc_writecombine(dev->dev, size, - &omap_obj->paddr, GFP_KERNEL); + omap_obj->vaddr = dma_alloc_wc(dev->dev, size, + &omap_obj->paddr, GFP_KERNEL); if (!omap_obj->vaddr) { kfree(omap_obj); diff --git a/drivers/gpu/drm/sti/sti_cursor.c b/drivers/gpu/drm/sti/sti_cursor.c index 8078631..bd736ac 100644 --- a/drivers/gpu/drm/sti/sti_cursor.c +++ b/drivers/gpu/drm/sti/sti_cursor.c @@ -157,17 +157,15 @@ static void sti_cursor_atomic_update(struct drm_plane *drm_plane, cursor->height = src_h; if (cursor->pixmap.base) - dma_free_writecombine(cursor->dev, - cursor->pixmap.size, - cursor->pixmap.base, - cursor->pixmap.paddr); + dma_free_wc(cursor->dev, cursor->pixmap.size, + cursor->pixmap.base, cursor->pixmap.paddr); cursor->pixmap.size = cursor->width * cursor->height; - cursor->pixmap.base = dma_alloc_writecombine(cursor->dev, - cursor->pixmap.size, - &cursor->pixmap.paddr, - GFP_KERNEL | GFP_DMA); + cursor->pixmap.base = dma_alloc_wc(cursor->dev, + cursor->pixmap.size, + &cursor->pixmap.paddr, + GFP_KERNEL | GFP_DMA); if (!cursor->pixmap.base) { DRM_ERROR("Failed to allocate memory for pixmap\n"); return; @@ -252,8 +250,8 @@ struct drm_plane *sti_cursor_create(struct drm_device *drm_dev, /* Allocate clut buffer */ size = 0x100 * sizeof(unsigned short); - cursor->clut = dma_alloc_writecombine(dev, size, &cursor->clut_paddr, - GFP_KERNEL | GFP_DMA); + cursor->clut = dma_alloc_wc(dev, size, &cursor->clut_paddr, + GFP_KERNEL | GFP_DMA); if (!cursor->clut) { DRM_ERROR("Failed to allocate memory for cursor clut\n"); @@ -286,7 +284,7 @@ struct drm_plane *sti_cursor_create(struct drm_device *drm_dev, return &cursor->plane.drm_plane; err_plane: - dma_free_writecombine(dev, size, cursor->clut, cursor->clut_paddr); + dma_free_wc(dev, size, cursor->clut, cursor->clut_paddr); err_clut: devm_kfree(dev, cursor); return NULL; diff 
--git a/drivers/gpu/drm/sti/sti_gdp.c b/drivers/gpu/drm/sti/sti_gdp.c index f9a1d92..514551c 100644 --- a/drivers/gpu/drm/sti/sti_gdp.c +++ b/drivers/gpu/drm/sti/sti_gdp.c @@ -312,8 +312,7 @@ static void sti_gdp_init(struct sti_gdp *gdp) /* Allocate all the nodes within a single memory page */ size = sizeof(struct sti_gdp_node) * GDP_NODE_PER_FIELD * GDP_NODE_NB_BANK; - base = dma_alloc_writecombine(gdp->dev, - size, &dma_addr, GFP_KERNEL | GFP_DMA); + base = dma_alloc_wc(gdp->dev, size, &dma_addr, GFP_KERNEL | GFP_DMA); if (!base) { DRM_ERROR("Failed to allocate memory for GDP node\n"); diff --git a/drivers/gpu/drm/sti/sti_hqvdp.c b/drivers/gpu/drm/sti/sti_hqvdp.c index 43861b5..1d3c3d0 100644 --- a/drivers/gpu/drm/sti/sti_hqvdp.c +++ b/drivers/gpu/drm/sti/sti_hqvdp.c @@ -617,9 +617,9 @@ static void sti_hqvdp_init(struct sti_hqvdp *hqvdp) /* Allocate memory for the VDP commands */ size = NB_VDP_CMD * sizeof(struct sti_hqvdp_cmd); - hqvdp->hqvdp_cmd = dma_alloc_writecombine(hqvdp->dev, size, - &hqvdp->hqvdp_cmd_paddr, - GFP_KERNEL | GFP_DMA); + hqvdp->hqvdp_cmd = dma_alloc_wc(hqvdp->dev, size, + &hqvdp->hqvdp_cmd_paddr, + GFP_KERNEL | GFP_DMA); if (!hqvdp->hqvdp_cmd) { DRM_ERROR("Failed to allocate memory for VDP cmd\n"); return; diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index 33add93..3b0d8c3 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c @@ -175,8 +175,7 @@ static void tegra_bo_free(struct drm_device *drm, struct tegra_bo *bo) sg_free_table(bo->sgt); kfree(bo->sgt); } else if (bo->vaddr) { - dma_free_writecombine(drm->dev, bo->gem.size, bo->vaddr, - bo->paddr); + dma_free_wc(drm->dev, bo->gem.size, bo->vaddr, bo->paddr); } } @@ -233,8 +232,8 @@ static int tegra_bo_alloc(struct drm_device *drm, struct tegra_bo *bo) } else { size_t size = bo->gem.size; - bo->vaddr = dma_alloc_writecombine(drm->dev, size, &bo->paddr, - GFP_KERNEL | __GFP_NOWARN); + bo->vaddr = dma_alloc_wc(drm->dev, size, &bo->paddr, + GFP_KERNEL | __GFP_NOWARN); if (!bo->vaddr) { dev_err(drm->dev, "failed to allocate buffer of size %zu\n", @@ -472,8 +471,8 @@ int tegra_drm_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_flags &= ~VM_PFNMAP; vma->vm_pgoff = 0; - ret = dma_mmap_writecombine(gem->dev->dev, vma, bo->vaddr, - bo->paddr, gem->size); + ret = dma_mmap_wc(gem->dev->dev, vma, bo->vaddr, bo->paddr, + gem->size); if (ret) { drm_gem_vm_close(vma); return ret; diff --git a/drivers/gpu/drm/vc4/vc4_bo.c b/drivers/gpu/drm/vc4/vc4_bo.c index 22278bc..034ef2d 100644 --- a/drivers/gpu/drm/vc4/vc4_bo.c +++ b/drivers/gpu/drm/vc4/vc4_bo.c @@ -398,9 +398,8 @@ int vc4_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_flags &= ~VM_PFNMAP; vma->vm_pgoff = 0; - ret = dma_mmap_writecombine(bo->base.base.dev->dev, vma, - bo->base.vaddr, bo->base.paddr, - vma->vm_end - vma->vm_start); + ret = dma_mmap_wc(bo->base.base.dev->dev, vma, bo->base.vaddr, + bo->base.paddr, vma->vm_end - vma->vm_start); if (ret) drm_gem_vm_close(vma); diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c index 5a8c8d5..a18db4d 100644 --- a/drivers/gpu/host1x/cdma.c +++ b/drivers/gpu/host1x/cdma.c @@ -52,8 +52,8 @@ static void host1x_pushbuffer_destroy(struct push_buffer *pb) struct host1x *host1x = cdma_to_host1x(cdma); if (pb->phys != 0) - dma_free_writecombine(host1x->dev, pb->size_bytes + 4, - pb->mapped, pb->phys); + dma_free_wc(host1x->dev, pb->size_bytes + 4, pb->mapped, + pb->phys); pb->mapped = NULL; pb->phys = 0; @@ -76,8 +76,8 @@ static int 
host1x_pushbuffer_init(struct push_buffer *pb) pb->pos = 0; /* allocate and map pushbuffer memory */ - pb->mapped = dma_alloc_writecombine(host1x->dev, pb->size_bytes + 4, - &pb->phys, GFP_KERNEL); + pb->mapped = dma_alloc_wc(host1x->dev, pb->size_bytes + 4, &pb->phys, + GFP_KERNEL); if (!pb->mapped) goto fail; diff --git a/drivers/gpu/host1x/job.c b/drivers/gpu/host1x/job.c index 63bd63f..defa799 100644 --- a/drivers/gpu/host1x/job.c +++ b/drivers/gpu/host1x/job.c @@ -467,9 +467,8 @@ static inline int copy_gathers(struct host1x_job *job, struct device *dev) size += g->words * sizeof(u32); } - job->gather_copy_mapped = dma_alloc_writecombine(dev, size, - &job->gather_copy, - GFP_KERNEL); + job->gather_copy_mapped = dma_alloc_wc(dev, size, &job->gather_copy, + GFP_KERNEL); if (!job->gather_copy_mapped) { job->gather_copy_mapped = NULL; return -ENOMEM; @@ -578,9 +577,8 @@ void host1x_job_unpin(struct host1x_job *job) job->num_unpins = 0; if (job->gather_copy_size) - dma_free_writecombine(job->channel->dev, job->gather_copy_size, - job->gather_copy_mapped, - job->gather_copy); + dma_free_wc(job->channel->dev, job->gather_copy_size, + job->gather_copy_mapped, job->gather_copy); } EXPORT_SYMBOL(host1x_job_unpin); diff --git a/drivers/media/platform/coda/coda-bit.c b/drivers/media/platform/coda/coda-bit.c index 7d28899..38aacc7 100644 --- a/drivers/media/platform/coda/coda-bit.c +++ b/drivers/media/platform/coda/coda-bit.c @@ -1455,9 +1455,9 @@ static int coda_alloc_bitstream_buffer(struct coda_ctx *ctx, return 0; ctx->bitstream.size = roundup_pow_of_two(q_data->sizeimage * 2); - ctx->bitstream.vaddr = dma_alloc_writecombine( - &ctx->dev->plat_dev->dev, ctx->bitstream.size, - &ctx->bitstream.paddr, GFP_KERNEL); + ctx->bitstream.vaddr = dma_alloc_wc(&ctx->dev->plat_dev->dev, + ctx->bitstream.size, + &ctx->bitstream.paddr, GFP_KERNEL); if (!ctx->bitstream.vaddr) { v4l2_err(&ctx->dev->v4l2_dev, "failed to allocate bitstream ringbuffer"); @@ -1474,8 +1474,8 @@ static void coda_free_bitstream_buffer(struct coda_ctx *ctx) if (ctx->bitstream.vaddr == NULL) return; - dma_free_writecombine(&ctx->dev->plat_dev->dev, ctx->bitstream.size, - ctx->bitstream.vaddr, ctx->bitstream.paddr); + dma_free_wc(&ctx->dev->plat_dev->dev, ctx->bitstream.size, + ctx->bitstream.vaddr, ctx->bitstream.paddr); ctx->bitstream.vaddr = NULL; kfifo_init(&ctx->bitstream_fifo, NULL, 0); } diff --git a/drivers/video/fbdev/acornfb.c b/drivers/video/fbdev/acornfb.c index a305cae..fb75b7e 100644 --- a/drivers/video/fbdev/acornfb.c +++ b/drivers/video/fbdev/acornfb.c @@ -1040,8 +1040,8 @@ static int acornfb_probe(struct platform_device *dev) * for the framebuffer if we are not using * VRAM. 
*/ - base = dma_alloc_writecombine(current_par.dev, size, &handle, - GFP_KERNEL); + base = dma_alloc_wc(current_par.dev, size, &handle, + GFP_KERNEL); if (base == NULL) { printk(KERN_ERR "acornfb: unable to allocate screen " "memory\n"); diff --git a/drivers/video/fbdev/amba-clcd-versatile.c b/drivers/video/fbdev/amba-clcd-versatile.c index 7a8afcd..a8a22da 100644 --- a/drivers/video/fbdev/amba-clcd-versatile.c +++ b/drivers/video/fbdev/amba-clcd-versatile.c @@ -154,8 +154,8 @@ int versatile_clcd_setup_dma(struct clcd_fb *fb, unsigned long framesize) { dma_addr_t dma; - fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev, framesize, - &dma, GFP_KERNEL); + fb->fb.screen_base = dma_alloc_wc(&fb->dev->dev, framesize, &dma, + GFP_KERNEL); if (!fb->fb.screen_base) { pr_err("CLCD: unable to map framebuffer\n"); return -ENOMEM; @@ -169,14 +169,12 @@ int versatile_clcd_setup_dma(struct clcd_fb *fb, unsigned long framesize) int versatile_clcd_mmap_dma(struct clcd_fb *fb, struct vm_area_struct *vma) { - return dma_mmap_writecombine(&fb->dev->dev, vma, - fb->fb.screen_base, - fb->fb.fix.smem_start, - fb->fb.fix.smem_len); + return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base, + fb->fb.fix.smem_start, fb->fb.fix.smem_len); } void versatile_clcd_remove_dma(struct clcd_fb *fb) { - dma_free_writecombine(&fb->dev->dev, fb->fb.fix.smem_len, - fb->fb.screen_base, fb->fb.fix.smem_start); + dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base, + fb->fb.fix.smem_start); } diff --git a/drivers/video/fbdev/amba-clcd.c b/drivers/video/fbdev/amba-clcd.c index 9362424..fe274b5 100644 --- a/drivers/video/fbdev/amba-clcd.c +++ b/drivers/video/fbdev/amba-clcd.c @@ -774,8 +774,8 @@ static int clcdfb_of_dma_setup(struct clcd_fb *fb) static int clcdfb_of_dma_mmap(struct clcd_fb *fb, struct vm_area_struct *vma) { - return dma_mmap_writecombine(&fb->dev->dev, vma, fb->fb.screen_base, - fb->fb.fix.smem_start, fb->fb.fix.smem_len); + return dma_mmap_wc(&fb->dev->dev, vma, fb->fb.screen_base, + fb->fb.fix.smem_start, fb->fb.fix.smem_len); } static void clcdfb_of_dma_remove(struct clcd_fb *fb) diff --git a/drivers/video/fbdev/atmel_lcdfb.c b/drivers/video/fbdev/atmel_lcdfb.c index 19eb42b..56c60e6 100644 --- a/drivers/video/fbdev/atmel_lcdfb.c +++ b/drivers/video/fbdev/atmel_lcdfb.c @@ -414,8 +414,8 @@ static inline void atmel_lcdfb_free_video_memory(struct atmel_lcdfb_info *sinfo) { struct fb_info *info = sinfo->info; - dma_free_writecombine(info->device, info->fix.smem_len, - info->screen_base, info->fix.smem_start); + dma_free_wc(info->device, info->fix.smem_len, info->screen_base, + info->fix.smem_start); } /** @@ -435,8 +435,9 @@ static int atmel_lcdfb_alloc_video_memory(struct atmel_lcdfb_info *sinfo) * ((var->bits_per_pixel + 7) / 8)); info->fix.smem_len = max(smem_len, sinfo->smem_len); - info->screen_base = dma_alloc_writecombine(info->device, info->fix.smem_len, - (dma_addr_t *)&info->fix.smem_start, GFP_KERNEL); + info->screen_base = dma_alloc_wc(info->device, info->fix.smem_len, + (dma_addr_t *)&info->fix.smem_start, + GFP_KERNEL); if (!info->screen_base) { return -ENOMEM; diff --git a/drivers/video/fbdev/ep93xx-fb.c b/drivers/video/fbdev/ep93xx-fb.c index 5b10810..75f0db2 100644 --- a/drivers/video/fbdev/ep93xx-fb.c +++ b/drivers/video/fbdev/ep93xx-fb.c @@ -316,9 +316,8 @@ static int ep93xxfb_mmap(struct fb_info *info, struct vm_area_struct *vma) unsigned int offset = vma->vm_pgoff << PAGE_SHIFT; if (offset < info->fix.smem_len) { - return dma_mmap_writecombine(info->dev, vma, 
info->screen_base, - info->fix.smem_start, - info->fix.smem_len); + return dma_mmap_wc(info->dev, vma, info->screen_base, + info->fix.smem_start, info->fix.smem_len); } return -EINVAL; @@ -428,8 +427,7 @@ static int ep93xxfb_alloc_videomem(struct fb_info *info) /* Maximum 16bpp -> used memory is maximum x*y*2 bytes */ fb_size = EP93XXFB_MAX_XRES * EP93XXFB_MAX_YRES * 2; - virt_addr = dma_alloc_writecombine(info->dev, fb_size, - &phys_addr, GFP_KERNEL); + virt_addr = dma_alloc_wc(info->dev, fb_size, &phys_addr, GFP_KERNEL); if (!virt_addr) return -ENOMEM; diff --git a/drivers/video/fbdev/gbefb.c b/drivers/video/fbdev/gbefb.c index b63d55f..1a242b1 100644 --- a/drivers/video/fbdev/gbefb.c +++ b/drivers/video/fbdev/gbefb.c @@ -1185,8 +1185,8 @@ static int gbefb_probe(struct platform_device *p_dev) } else { /* try to allocate memory with the classical allocator * this has high chance to fail on low memory machines */ - gbe_mem = dma_alloc_writecombine(NULL, gbe_mem_size, - &gbe_dma_addr, GFP_KERNEL); + gbe_mem = dma_alloc_wc(NULL, gbe_mem_size, &gbe_dma_addr, + GFP_KERNEL); if (!gbe_mem) { printk(KERN_ERR "gbefb: couldn't allocate framebuffer memory\n"); ret = -ENOMEM; @@ -1238,7 +1238,7 @@ static int gbefb_probe(struct platform_device *p_dev) out_gbe_unmap: arch_phys_wc_del(par->wc_cookie); if (gbe_dma_addr) - dma_free_writecombine(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys); + dma_free_wc(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys); out_tiles_free: dma_free_coherent(NULL, GBE_TLB_SIZE * sizeof(uint16_t), (void *)gbe_tiles.cpu, gbe_tiles.dma); @@ -1259,7 +1259,7 @@ static int gbefb_remove(struct platform_device* p_dev) gbe_turn_off(); arch_phys_wc_del(par->wc_cookie); if (gbe_dma_addr) - dma_free_writecombine(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys); + dma_free_wc(NULL, gbe_mem_size, gbe_mem, gbe_mem_phys); dma_free_coherent(NULL, GBE_TLB_SIZE * sizeof(uint16_t), (void *)gbe_tiles.cpu, gbe_tiles.dma); release_mem_region(GBE_BASE, sizeof(struct sgi_gbe)); diff --git a/drivers/video/fbdev/imxfb.c b/drivers/video/fbdev/imxfb.c index bb2f1e8..76b6a77 100644 --- a/drivers/video/fbdev/imxfb.c +++ b/drivers/video/fbdev/imxfb.c @@ -937,8 +937,8 @@ static int imxfb_probe(struct platform_device *pdev) } fbi->map_size = PAGE_ALIGN(info->fix.smem_len); - info->screen_base = dma_alloc_writecombine(&pdev->dev, fbi->map_size, - &fbi->map_dma, GFP_KERNEL); + info->screen_base = dma_alloc_wc(&pdev->dev, fbi->map_size, + &fbi->map_dma, GFP_KERNEL); if (!info->screen_base) { dev_err(&pdev->dev, "Failed to allocate video RAM: %d\n", ret); @@ -1005,8 +1005,8 @@ failed_cmap: if (pdata && pdata->exit) pdata->exit(fbi->pdev); failed_platform_init: - dma_free_writecombine(&pdev->dev, fbi->map_size, info->screen_base, - fbi->map_dma); + dma_free_wc(&pdev->dev, fbi->map_size, info->screen_base, + fbi->map_dma); failed_map: iounmap(fbi->regs); failed_ioremap: @@ -1041,8 +1041,8 @@ static int imxfb_remove(struct platform_device *pdev) kfree(info->pseudo_palette); framebuffer_release(info); - dma_free_writecombine(&pdev->dev, fbi->map_size, info->screen_base, - fbi->map_dma); + dma_free_wc(&pdev->dev, fbi->map_size, info->screen_base, + fbi->map_dma); iounmap(fbi->regs); release_mem_region(res->start, resource_size(res)); diff --git a/drivers/video/fbdev/mx3fb.c b/drivers/video/fbdev/mx3fb.c index 7947634..f91b1db 100644 --- a/drivers/video/fbdev/mx3fb.c +++ b/drivers/video/fbdev/mx3fb.c @@ -1336,9 +1336,8 @@ static int mx3fb_map_video_memory(struct fb_info *fbi, unsigned int mem_len, int retval = 0; dma_addr_t addr; - 
fbi->screen_base = dma_alloc_writecombine(fbi->device, - mem_len, - &addr, GFP_DMA | GFP_KERNEL); + fbi->screen_base = dma_alloc_wc(fbi->device, mem_len, &addr, + GFP_DMA | GFP_KERNEL); if (!fbi->screen_base) { dev_err(fbi->device, "Cannot allocate %u bytes framebuffer memory\n", @@ -1378,8 +1377,8 @@ err0: */ static int mx3fb_unmap_video_memory(struct fb_info *fbi) { - dma_free_writecombine(fbi->device, fbi->fix.smem_len, - fbi->screen_base, fbi->fix.smem_start); + dma_free_wc(fbi->device, fbi->fix.smem_len, fbi->screen_base, + fbi->fix.smem_start); fbi->screen_base = NULL; mutex_lock(&fbi->mm_lock); diff --git a/drivers/video/fbdev/nuc900fb.c b/drivers/video/fbdev/nuc900fb.c index 389fa2c..6680eda 100644 --- a/drivers/video/fbdev/nuc900fb.c +++ b/drivers/video/fbdev/nuc900fb.c @@ -396,8 +396,8 @@ static int nuc900fb_map_video_memory(struct fb_info *info) dev_dbg(fbi->dev, "nuc900fb_map_video_memory(fbi=%p) map_size %lu\n", fbi, map_size); - info->screen_base = dma_alloc_writecombine(fbi->dev, map_size, - &map_dma, GFP_KERNEL); + info->screen_base = dma_alloc_wc(fbi->dev, map_size, &map_dma, + GFP_KERNEL); if (!info->screen_base) return -ENOMEM; @@ -411,8 +411,8 @@ static int nuc900fb_map_video_memory(struct fb_info *info) static inline void nuc900fb_unmap_video_memory(struct fb_info *info) { struct nuc900fb_info *fbi = info->par; - dma_free_writecombine(fbi->dev, PAGE_ALIGN(info->fix.smem_len), - info->screen_base, info->fix.smem_start); + dma_free_wc(fbi->dev, PAGE_ALIGN(info->fix.smem_len), + info->screen_base, info->fix.smem_start); } static irqreturn_t nuc900fb_irqhandler(int irq, void *dev_id) diff --git a/drivers/video/fbdev/omap/lcdc.c b/drivers/video/fbdev/omap/lcdc.c index 6efa259..e3d9b9e 100644 --- a/drivers/video/fbdev/omap/lcdc.c +++ b/drivers/video/fbdev/omap/lcdc.c @@ -612,8 +612,8 @@ static void lcdc_dma_handler(u16 status, void *data) static int alloc_palette_ram(void) { - lcdc.palette_virt = dma_alloc_writecombine(lcdc.fbdev->dev, - MAX_PALETTE_SIZE, &lcdc.palette_phys, GFP_KERNEL); + lcdc.palette_virt = dma_alloc_wc(lcdc.fbdev->dev, MAX_PALETTE_SIZE, + &lcdc.palette_phys, GFP_KERNEL); if (lcdc.palette_virt == NULL) { dev_err(lcdc.fbdev->dev, "failed to alloc palette memory\n"); return -ENOMEM; @@ -625,8 +625,8 @@ static int alloc_palette_ram(void) static void free_palette_ram(void) { - dma_free_writecombine(lcdc.fbdev->dev, MAX_PALETTE_SIZE, - lcdc.palette_virt, lcdc.palette_phys); + dma_free_wc(lcdc.fbdev->dev, MAX_PALETTE_SIZE, lcdc.palette_virt, + lcdc.palette_phys); } static int alloc_fbmem(struct omapfb_mem_region *region) @@ -642,8 +642,8 @@ static int alloc_fbmem(struct omapfb_mem_region *region) if (region->size > frame_size) frame_size = region->size; lcdc.vram_size = frame_size; - lcdc.vram_virt = dma_alloc_writecombine(lcdc.fbdev->dev, - lcdc.vram_size, &lcdc.vram_phys, GFP_KERNEL); + lcdc.vram_virt = dma_alloc_wc(lcdc.fbdev->dev, lcdc.vram_size, + &lcdc.vram_phys, GFP_KERNEL); if (lcdc.vram_virt == NULL) { dev_err(lcdc.fbdev->dev, "unable to allocate FB DMA memory\n"); return -ENOMEM; @@ -660,8 +660,8 @@ static int alloc_fbmem(struct omapfb_mem_region *region) static void free_fbmem(void) { - dma_free_writecombine(lcdc.fbdev->dev, lcdc.vram_size, - lcdc.vram_virt, lcdc.vram_phys); + dma_free_wc(lcdc.fbdev->dev, lcdc.vram_size, lcdc.vram_virt, + lcdc.vram_phys); } static int setup_fbmem(struct omapfb_mem_desc *req_md) diff --git a/drivers/video/fbdev/pxa168fb.c b/drivers/video/fbdev/pxa168fb.c index efb57c0..def3a50 100644 --- 
a/drivers/video/fbdev/pxa168fb.c +++ b/drivers/video/fbdev/pxa168fb.c @@ -680,8 +680,8 @@ static int pxa168fb_probe(struct platform_device *pdev) */ info->fix.smem_len = PAGE_ALIGN(DEFAULT_FB_SIZE); - info->screen_base = dma_alloc_writecombine(fbi->dev, info->fix.smem_len, - &fbi->fb_start_dma, GFP_KERNEL); + info->screen_base = dma_alloc_wc(fbi->dev, info->fix.smem_len, + &fbi->fb_start_dma, GFP_KERNEL); if (info->screen_base == NULL) { ret = -ENOMEM; goto failed_free_info; @@ -804,8 +804,8 @@ static int pxa168fb_remove(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); - dma_free_writecombine(fbi->dev, PAGE_ALIGN(info->fix.smem_len), - info->screen_base, info->fix.smem_start); + dma_free_wc(fbi->dev, PAGE_ALIGN(info->fix.smem_len), + info->screen_base, info->fix.smem_start); clk_disable(fbi->clk); diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c index 33b2bb3..2c0487f 100644 --- a/drivers/video/fbdev/pxafb.c +++ b/drivers/video/fbdev/pxafb.c @@ -2446,8 +2446,8 @@ static int pxafb_remove(struct platform_device *dev) free_pages_exact(fbi->video_mem, fbi->video_mem_size); - dma_free_writecombine(&dev->dev, fbi->dma_buff_size, - fbi->dma_buff, fbi->dma_buff_phys); + dma_free_wc(&dev->dev, fbi->dma_buff_size, fbi->dma_buff, + fbi->dma_buff_phys); iounmap(fbi->mmio_base); diff --git a/drivers/video/fbdev/s3c-fb.c b/drivers/video/fbdev/s3c-fb.c index f72dd12..5f4f696 100644 --- a/drivers/video/fbdev/s3c-fb.c +++ b/drivers/video/fbdev/s3c-fb.c @@ -1105,8 +1105,7 @@ static int s3c_fb_alloc_memory(struct s3c_fb *sfb, struct s3c_fb_win *win) dev_dbg(sfb->dev, "want %u bytes for window\n", size); - fbi->screen_base = dma_alloc_writecombine(sfb->dev, size, - &map_dma, GFP_KERNEL); + fbi->screen_base = dma_alloc_wc(sfb->dev, size, &map_dma, GFP_KERNEL); if (!fbi->screen_base) return -ENOMEM; @@ -1131,8 +1130,8 @@ static void s3c_fb_free_memory(struct s3c_fb *sfb, struct s3c_fb_win *win) struct fb_info *fbi = win->fbinfo; if (fbi->screen_base) - dma_free_writecombine(sfb->dev, PAGE_ALIGN(fbi->fix.smem_len), - fbi->screen_base, fbi->fix.smem_start); + dma_free_wc(sfb->dev, PAGE_ALIGN(fbi->fix.smem_len), + fbi->screen_base, fbi->fix.smem_start); } /** diff --git a/drivers/video/fbdev/s3c2410fb.c b/drivers/video/fbdev/s3c2410fb.c index d6704ad..0dd86be 100644 --- a/drivers/video/fbdev/s3c2410fb.c +++ b/drivers/video/fbdev/s3c2410fb.c @@ -645,8 +645,8 @@ static int s3c2410fb_map_video_memory(struct fb_info *info) dprintk("map_video_memory(fbi=%p) map_size %u\n", fbi, map_size); - info->screen_base = dma_alloc_writecombine(fbi->dev, map_size, - &map_dma, GFP_KERNEL); + info->screen_base = dma_alloc_wc(fbi->dev, map_size, &map_dma, + GFP_KERNEL); if (info->screen_base) { /* prevent initial garbage on screen */ @@ -667,8 +667,8 @@ static inline void s3c2410fb_unmap_video_memory(struct fb_info *info) { struct s3c2410fb_info *fbi = info->par; - dma_free_writecombine(fbi->dev, PAGE_ALIGN(info->fix.smem_len), - info->screen_base, info->fix.smem_start); + dma_free_wc(fbi->dev, PAGE_ALIGN(info->fix.smem_len), + info->screen_base, info->fix.smem_start); } static inline void modify_gpio(void __iomem *reg, diff --git a/drivers/video/fbdev/sa1100fb.c b/drivers/video/fbdev/sa1100fb.c index dcf774c..fc2aaa5 100644 --- a/drivers/video/fbdev/sa1100fb.c +++ b/drivers/video/fbdev/sa1100fb.c @@ -567,8 +567,8 @@ static int sa1100fb_mmap(struct fb_info *info, if (off < info->fix.smem_len) { vma->vm_pgoff += 1; /* skip over the palette */ - return dma_mmap_writecombine(fbi->dev, vma, 
fbi->map_cpu, - fbi->map_dma, fbi->map_size); + return dma_mmap_wc(fbi->dev, vma, fbi->map_cpu, fbi->map_dma, + fbi->map_size); } vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); @@ -1099,8 +1099,8 @@ static int sa1100fb_map_video_memory(struct sa1100fb_info *fbi) * of the framebuffer. */ fbi->map_size = PAGE_ALIGN(fbi->fb.fix.smem_len + PAGE_SIZE); - fbi->map_cpu = dma_alloc_writecombine(fbi->dev, fbi->map_size, - &fbi->map_dma, GFP_KERNEL); + fbi->map_cpu = dma_alloc_wc(fbi->dev, fbi->map_size, &fbi->map_dma, + GFP_KERNEL); if (fbi->map_cpu) { fbi->fb.screen_base = fbi->map_cpu + PAGE_SIZE; diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 75857cd..471e064 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -641,31 +641,40 @@ static inline void dmam_release_declared_memory(struct device *dev) } #endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ -static inline void *dma_alloc_writecombine(struct device *dev, size_t size, - dma_addr_t *dma_addr, gfp_t gfp) +static inline void *dma_alloc_wc(struct device *dev, size_t size, + dma_addr_t *dma_addr, gfp_t gfp) { DEFINE_DMA_ATTRS(attrs); dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs); return dma_alloc_attrs(dev, size, dma_addr, gfp, &attrs); } +#ifndef dma_alloc_writecombine +#define dma_alloc_writecombine dma_alloc_wc +#endif -static inline void dma_free_writecombine(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_addr) +static inline void dma_free_wc(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_addr) { DEFINE_DMA_ATTRS(attrs); dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs); return dma_free_attrs(dev, size, cpu_addr, dma_addr, &attrs); } +#ifndef dma_free_writecombine +#define dma_free_writecombine dma_free_wc +#endif -static inline int dma_mmap_writecombine(struct device *dev, - struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, - size_t size) +static inline int dma_mmap_wc(struct device *dev, + struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, + size_t size) { DEFINE_DMA_ATTRS(attrs); dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs); return dma_mmap_attrs(dev, vma, cpu_addr, dma_addr, size, &attrs); } +#ifndef dma_mmap_writecombine +#define dma_mmap_writecombine dma_mmap_wc +#endif #ifdef CONFIG_NEED_DMA_MAP_STATE #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME diff --git a/sound/arm/pxa2xx-pcm-lib.c b/sound/arm/pxa2xx-pcm-lib.c index e9b98af..e8da3b8 100644 --- a/sound/arm/pxa2xx-pcm-lib.c +++ b/sound/arm/pxa2xx-pcm-lib.c @@ -141,10 +141,8 @@ int pxa2xx_pcm_mmap(struct snd_pcm_substream *substream, struct vm_area_struct *vma) { struct snd_pcm_runtime *runtime = substream->runtime; - return dma_mmap_writecombine(substream->pcm->card->dev, vma, - runtime->dma_area, - runtime->dma_addr, - runtime->dma_bytes); + return dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area, + runtime->dma_addr, runtime->dma_bytes); } EXPORT_SYMBOL(pxa2xx_pcm_mmap); @@ -156,8 +154,7 @@ int pxa2xx_pcm_preallocate_dma_buffer(struct snd_pcm *pcm, int stream) buf->dev.type = SNDRV_DMA_TYPE_DEV; buf->dev.dev = pcm->card->dev; buf->private_data = NULL; - buf->area = dma_alloc_writecombine(pcm->card->dev, size, - &buf->addr, GFP_KERNEL); + buf->area = dma_alloc_wc(pcm->card->dev, size, &buf->addr, GFP_KERNEL); if (!buf->area) return -ENOMEM; buf->bytes = size; @@ -178,8 +175,7 @@ void pxa2xx_pcm_free_dma_buffers(struct snd_pcm *pcm) buf = &substream->dma_buffer; if (!buf->area) continue; - dma_free_writecombine(pcm->card->dev, 
buf->bytes, - buf->area, buf->addr); + dma_free_wc(pcm->card->dev, buf->bytes, buf->area, buf->addr); buf->area = NULL; } } diff --git a/sound/soc/fsl/imx-pcm-fiq.c b/sound/soc/fsl/imx-pcm-fiq.c index 49d7513..e63cd5e 100644 --- a/sound/soc/fsl/imx-pcm-fiq.c +++ b/sound/soc/fsl/imx-pcm-fiq.c @@ -217,8 +217,8 @@ static int snd_imx_pcm_mmap(struct snd_pcm_substream *substream, struct snd_pcm_runtime *runtime = substream->runtime; int ret; - ret = dma_mmap_writecombine(substream->pcm->card->dev, vma, - runtime->dma_area, runtime->dma_addr, runtime->dma_bytes); + ret = dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area, + runtime->dma_addr, runtime->dma_bytes); pr_debug("%s: ret: %d %p %pad 0x%08x\n", __func__, ret, runtime->dma_area, @@ -247,8 +247,7 @@ static int imx_pcm_preallocate_dma_buffer(struct snd_pcm *pcm, int stream) buf->dev.type = SNDRV_DMA_TYPE_DEV; buf->dev.dev = pcm->card->dev; buf->private_data = NULL; - buf->area = dma_alloc_writecombine(pcm->card->dev, size, - &buf->addr, GFP_KERNEL); + buf->area = dma_alloc_wc(pcm->card->dev, size, &buf->addr, GFP_KERNEL); if (!buf->area) return -ENOMEM; buf->bytes = size; @@ -330,8 +329,7 @@ static void imx_pcm_free(struct snd_pcm *pcm) if (!buf->area) continue; - dma_free_writecombine(pcm->card->dev, buf->bytes, - buf->area, buf->addr); + dma_free_wc(pcm->card->dev, buf->bytes, buf->area, buf->addr); buf->area = NULL; } } diff --git a/sound/soc/nuc900/nuc900-pcm.c b/sound/soc/nuc900/nuc900-pcm.c index e093261..2cca055 100644 --- a/sound/soc/nuc900/nuc900-pcm.c +++ b/sound/soc/nuc900/nuc900-pcm.c @@ -267,10 +267,8 @@ static int nuc900_dma_mmap(struct snd_pcm_substream *substream, { struct snd_pcm_runtime *runtime = substream->runtime; - return dma_mmap_writecombine(substream->pcm->card->dev, vma, - runtime->dma_area, - runtime->dma_addr, - runtime->dma_bytes); + return dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area, + runtime->dma_addr, runtime->dma_bytes); } static struct snd_pcm_ops nuc900_dma_ops = { diff --git a/sound/soc/omap/omap-pcm.c b/sound/soc/omap/omap-pcm.c index 6bb623a..99381a2 100644 --- a/sound/soc/omap/omap-pcm.c +++ b/sound/soc/omap/omap-pcm.c @@ -156,10 +156,8 @@ static int omap_pcm_mmap(struct snd_pcm_substream *substream, { struct snd_pcm_runtime *runtime = substream->runtime; - return dma_mmap_writecombine(substream->pcm->card->dev, vma, - runtime->dma_area, - runtime->dma_addr, - runtime->dma_bytes); + return dma_mmap_wc(substream->pcm->card->dev, vma, runtime->dma_area, + runtime->dma_addr, runtime->dma_bytes); } static struct snd_pcm_ops omap_pcm_ops = { @@ -183,8 +181,7 @@ static int omap_pcm_preallocate_dma_buffer(struct snd_pcm *pcm, buf->dev.type = SNDRV_DMA_TYPE_DEV; buf->dev.dev = pcm->card->dev; buf->private_data = NULL; - buf->area = dma_alloc_writecombine(pcm->card->dev, size, - &buf->addr, GFP_KERNEL); + buf->area = dma_alloc_wc(pcm->card->dev, size, &buf->addr, GFP_KERNEL); if (!buf->area) return -ENOMEM; @@ -207,8 +204,7 @@ static void omap_pcm_free_dma_buffers(struct snd_pcm *pcm) if (!buf->area) continue; - dma_free_writecombine(pcm->card->dev, buf->bytes, - buf->area, buf->addr); + dma_free_wc(pcm->card->dev, buf->bytes, buf->area, buf->addr); buf->area = NULL; } } -- cgit v0.10.2 From 36e5cd6b897e17d03008f81e075625d8e43e52d0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 8 Mar 2016 21:09:29 +0700 Subject: arm64: account for sparsemem section alignment when choosing vmemmap offset Commit dfd55ad85e4a ("arm64: vmemmap: use virtual projection of linear region") fixed 
an issue where the struct page array would overflow into the adjacent virtual memory region if system RAM was placed so high up in physical memory that its addresses were not representable in the build time configured virtual address size. However, the fix failed to take into account that the vmemmap region needs to be relatively aligned with respect to the sparsemem section size, so that a sequence of page structs corresponding with a sparsemem section in the linear region appears naturally aligned in the vmemmap region. So round up vmemmap to sparsemem section size. Since this essentially moves the projection of the linear region up in memory, also revert the reduction of the size of the vmemmap region. Cc: Fixes: dfd55ad85e4a ("arm64: vmemmap: use virtual projection of linear region") Tested-by: Mark Langsdorf Tested-by: David Daney Tested-by: Robert Richter Acked-by: Catalin Marinas Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index f506086..819aff5 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -40,7 +40,7 @@ * VMALLOC_END: extends to the available space below vmmemmap, PCI I/O space, * fixed mappings and modules */ -#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT - 1)) * sizeof(struct page), PUD_SIZE) +#define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE) #ifndef CONFIG_KASAN #define VMALLOC_START (VA_START) @@ -52,7 +52,8 @@ #define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K) #define VMEMMAP_START (VMALLOC_END + SZ_64K) -#define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) +#define vmemmap ((struct page *)VMEMMAP_START - \ + SECTION_ALIGN_DOWN(memstart_addr >> PAGE_SHIFT)) #define FIRST_USER_ADDRESS 0UL -- cgit v0.10.2 From ff7925848b50050732ac0401e0acf27e8b241d7b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 9 Mar 2016 15:22:55 +0000 Subject: arm64: hugetlb: partial revert of 66b3923a1a0f Commit 66b3923a1a0f ("arm64: hugetlb: add support for PTE contiguous bit") introduced support for huge pages using the contiguous bit in the PTE as opposed to block mappings, which may be slightly unwieldy (512M) in 64k page configurations. Unfortunately, this support has resulted in some late regressions when running the libhugetlbfs test suite with 64k pages and CONFIG_DEBUG_VM as a result of a BUG: | readback (2M: 64): ------------[ cut here ]------------ | kernel BUG at fs/hugetlbfs/inode.c:446! | Internal error: Oops - BUG: 0 [#1] SMP | Modules linked in: | CPU: 7 PID: 1448 Comm: readback Not tainted 4.5.0-rc7 #148 | Hardware name: linux,dummy-virt (DT) | task: fffffe0040964b00 ti: fffffe00c2668000 task.ti: fffffe00c2668000 | PC is at remove_inode_hugepages+0x44c/0x480 | LR is at remove_inode_hugepages+0x264/0x480 Rather than revert the entire patch, simply avoid advertising the contiguous huge page sizes for now while people are actively working on a fix. This patch can then be reverted once things have been sorted out. 
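To make the sizes in this partial revert concrete, the sketch below works out the huge page sizes a 64K-granule arm64 kernel distinguishes in setup_hugepagesz(). PAGE_SIZE and PMD_SIZE follow from the 64K granule; the CONT_PTES and CONT_PMDS values are assumptions standing in for the real arm64 header definitions, so treat the output as illustrative only.

    /* Illustrative userspace sketch, not kernel code. */
    #include <stdio.h>

    #define PAGE_SIZE   (64UL << 10)          /* 64K granule */
    #define PMD_SIZE    (8192UL * PAGE_SIZE)  /* 8192 PTEs per 64K table */
    #define CONT_PTES   32UL                  /* assumed value */
    #define CONT_PMDS   32UL                  /* assumed value */

    int main(void)
    {
        printf("PMD block mapping : %lu MiB\n", PMD_SIZE >> 20);
        printf("contiguous PTEs   : %lu MiB\n", (CONT_PTES * PAGE_SIZE) >> 20);
        printf("contiguous PMDs   : %lu GiB\n", (CONT_PMDS * PMD_SIZE) >> 30);
        return 0;
    }

The 2 MiB contiguous-PTE case is the "(2M: 64)" configuration in the libhugetlbfs failure quoted above, and the 512 MiB PMD block is the "slightly unwieldy" mapping the contiguous bit was introduced to avoid; with the revert, only the PMD and PUD sizes remain selectable via hugepagesz=.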
Cc: David Woods Reported-by: Steve Capper Signed-off-by: Will Deacon diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 82d607c..da30529 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -306,10 +306,6 @@ static __init int setup_hugepagesz(char *opt) hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); } else if (ps == PUD_SIZE) { hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); - } else if (ps == (PAGE_SIZE * CONT_PTES)) { - hugetlb_add_hstate(CONT_PTE_SHIFT); - } else if (ps == (PMD_SIZE * CONT_PMDS)) { - hugetlb_add_hstate((PMD_SHIFT + CONT_PMD_SHIFT) - PAGE_SHIFT); } else { pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); return 0; @@ -317,13 +313,3 @@ static __init int setup_hugepagesz(char *opt) return 1; } __setup("hugepagesz=", setup_hugepagesz); - -#ifdef CONFIG_ARM64_64K_PAGES -static __init int add_default_hugepagesz(void) -{ - if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL) - hugetlb_add_hstate(CONT_PMD_SHIFT); - return 0; -} -arch_initcall(add_default_hugepagesz); -#endif -- cgit v0.10.2 From dc17147de328a74bbdee67c1bf37d2f1992de756 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 9 Mar 2016 11:58:41 -0500 Subject: tracing: Fix check for cpu online when event is disabled Commit f37755490fe9b ("tracepoints: Do not trace when cpu is offline") added a check to make sure that tracepoints only get called when the cpu is online, as it uses rcu_read_lock_sched() for protection. Commit 3a630178fd5f3 ("tracing: generate RCU warnings even when tracepoints are disabled") added lockdep checks (including rcu checks) for events that are not enabled to catch possible RCU issues that would only be triggered if a trace event was enabled. Commit f37755490fe9b only stopped the warnings when the trace event was enabled but did not prevent warnings if the trace event was called when disabled. To fix this, the cpu online check is moved to where the condition is added to the trace event. This will place the cpu online check in all places that it may be used now and in the future. Cc: stable@vger.kernel.org # v3.18+ Fixes: f37755490fe9b ("tracepoints: Do not trace when cpu is offline") Fixes: 3a630178fd5f3 ("tracing: generate RCU warnings even when tracepoints are disabled") Reported-by: Sudeep Holla Tested-by: Sudeep Holla Signed-off-by: Steven Rostedt diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index acfdbf3..be586c6 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -134,9 +134,6 @@ extern void syscall_unregfunc(void); void *it_func; \ void *__data; \ \ - if (!cpu_online(raw_smp_processor_id())) \ - return; \ - \ if (!(cond)) \ return; \ prercu; \ @@ -343,15 +340,19 @@ extern void syscall_unregfunc(void); * "void *__data, proto" as the callback prototype. 
*/ #define DECLARE_TRACE_NOARGS(name) \ - __DECLARE_TRACE(name, void, , 1, void *__data, __data) + __DECLARE_TRACE(name, void, , \ + cpu_online(raw_smp_processor_id()), \ + void *__data, __data) #define DECLARE_TRACE(name, proto, args) \ - __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), 1, \ - PARAMS(void *__data, proto), \ - PARAMS(__data, args)) + __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ + cpu_online(raw_smp_processor_id()), \ + PARAMS(void *__data, proto), \ + PARAMS(__data, args)) #define DECLARE_TRACE_CONDITION(name, proto, args, cond) \ - __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), PARAMS(cond), \ + __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ + cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \ PARAMS(void *__data, proto), \ PARAMS(__data, args)) -- cgit v0.10.2 From 910154d520c97cd0095a889e6b878041c91111a6 Mon Sep 17 00:00:00 2001 From: Geoffrey Thomas Date: Wed, 9 Mar 2016 14:08:04 -0800 Subject: mm/hugetlb: hugetlb_no_page: rate-limit warning message The warning message "killed due to inadequate hugepage pool" simply indicates that SIGBUS was sent, not that the process was forcibly killed. If the process has a signal handler installed does not fix the problem, this message can rapidly spam the kernel log. On my amd64 dev machine that does not have hugepages configured, I can reproduce the repeated warnings easily by setting vm.nr_hugepages=2 (i.e., 4 megabytes of huge pages) and running something that sets a signal handler and forks, like #include #include #include #include sig_atomic_t counter = 10; void handler(int signal) { if (counter-- == 0) exit(0); } int main(void) { int status; char *addr = mmap(NULL, 4 * 1048576, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); if (addr == MAP_FAILED) {perror("mmap"); return 1;} *addr = 'x'; switch (fork()) { case -1: perror("fork"); return 1; case 0: signal(SIGBUS, handler); *addr = 'x'; break; default: *addr = 'x'; wait(&status); if (WIFSIGNALED(status)) { psignal(WTERMSIG(status), "child"); } break; } } Signed-off-by: Geoffrey Thomas Cc: Naoya Horiguchi Cc: Hillf Danton Cc: "Kirill A. Shutemov" Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 01f2b48..0e27a9d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3502,7 +3502,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, * COW. Warn that such a situation has occurred as it may not be obvious */ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { - pr_warning("PID %d killed due to inadequate hugepage pool\n", + pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", current->pid); return ret; } -- cgit v0.10.2 From 06b241f32c711d7ca868a0351dd97fe91fd8817b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 9 Mar 2016 14:08:07 -0800 Subject: mm: __delete_from_page_cache show Bad page if mapped Commit e1534ae95004 ("mm: differentiate page_mapped() from page_mapcount() for compound pages") changed the famous BUG_ON(page_mapped(page)) in __delete_from_page_cache() to VM_BUG_ON_PAGE(page_mapped(page)): which gives us more info when CONFIG_DEBUG_VM=y, but nothing at all when not. 
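For reference, here is the SIGBUS reproducer from the hugetlb rate-limiting patch above laid out as a standalone program. The body follows the quoted code; the feature-test macro and the header list are assumptions inferred from the calls the program makes rather than part of the original message.

    #define _GNU_SOURCE        /* assumed: MAP_ANONYMOUS, MAP_HUGETLB, psignal() */
    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <sys/wait.h>
    #include <unistd.h>

    sig_atomic_t counter = 10;

    void handler(int signal)
    {
        if (counter-- == 0)
            exit(0);
    }

    int main(void)
    {
        int status;
        char *addr = mmap(NULL, 4 * 1048576, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

        if (addr == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        *addr = 'x';

        switch (fork()) {
        case -1:
            perror("fork");
            return 1;
        case 0:
            signal(SIGBUS, handler);
            *addr = 'x';
            break;
        default:
            *addr = 'x';
            wait(&status);
            if (WIFSIGNALED(status))
                psignal(WTERMSIG(status), "child");
            break;
        }
    }

With vm.nr_hugepages=2 as described above, the child takes repeated SIGBUS faults in its handler loop, and before the fix each fault logged another "killed due to inadequate hugepage pool" line.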
Although it has not usually been very helpul, being hit long after the error in question, we do need to know if it actually happens on users' systems; but reinstating a crash there is likely to be opposed :) In the non-debug case, pr_alert("BUG: Bad page cache") plus dump_page(), dump_stack(), add_taint() - I don't really believe LOCKDEP_NOW_UNRELIABLE, but that seems to be the standard procedure now. Move that, or the VM_BUG_ON_PAGE(), up before the deletion from tree: so that the unNULLified page->mapping gives a little more information. If the inode is being evicted (rather than truncated), it won't have any vmas left, so it's safe(ish) to assume that the raised mapcount is erroneous, and we can discount it from page_count to avoid leaking the page (I'm less worried by leaking the occasional 4kB, than losing a potential 2MB page with each 4kB page leaked). Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: Joonsoo Kim Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/filemap.c b/mm/filemap.c index 3461d97..da7a35d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -195,6 +195,30 @@ void __delete_from_page_cache(struct page *page, void *shadow, else cleancache_invalidate_page(mapping, page); + VM_BUG_ON_PAGE(page_mapped(page), page); + if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) { + int mapcount; + + pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", + current->comm, page_to_pfn(page)); + dump_page(page, "still mapped when deleted"); + dump_stack(); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + mapcount = page_mapcount(page); + if (mapping_exiting(mapping) && + page_count(page) >= mapcount + 2) { + /* + * All vmas have already been torn down, so it's + * a good bet that actually the page is unmapped, + * and we'd prefer not to leak it: if we're wrong, + * some other bad page check should catch it later. + */ + page_mapcount_reset(page); + atomic_sub(mapcount, &page->_count); + } + } + page_cache_tree_delete(mapping, page, shadow); page->mapping = NULL; @@ -205,7 +229,6 @@ void __delete_from_page_cache(struct page *page, void *shadow, __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) __dec_zone_page_state(page, NR_SHMEM); - VM_BUG_ON_PAGE(page_mapped(page), page); /* * At this point page must be either written or cleaned by truncate. -- cgit v0.10.2 From d77a117e6871ff78a06def46583d23752593de60 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 9 Mar 2016 14:08:10 -0800 Subject: list: kill list_force_poison() Given we have uninitialized list_heads being passed to list_add() it will always be the case that those uninitialized values randomly trigger the poison value. Especially since a list_add() operation will seed the stack with the poison value for later stack allocations to trip over. For example, see these two false positive reports: list_add attempted on force-poisoned entry WARNING: at lib/list_debug.c:34 [..] NIP [c00000000043c390] __list_add+0xb0/0x150 LR [c00000000043c38c] __list_add+0xac/0x150 Call Trace: __list_add+0xac/0x150 (unreliable) __down+0x4c/0xf8 down+0x68/0x70 xfs_buf_lock+0x4c/0x150 [xfs] list_add attempted on force-poisoned entry(0000000000000500), new->next == d0000000059ecdb0, new->prev == 0000000000000500 WARNING: at lib/list_debug.c:33 [..] 
NIP [c00000000042db78] __list_add+0xa8/0x140 LR [c00000000042db74] __list_add+0xa4/0x140 Call Trace: __list_add+0xa4/0x140 (unreliable) rwsem_down_read_failed+0x6c/0x1a0 down_read+0x58/0x60 xfs_log_commit_cil+0x7c/0x600 [xfs] Fixes: commit 5c2c2587b132 ("mm, dax, pmem: introduce {get|put}_dev_pagemap() for dax-gup") Signed-off-by: Dan Williams Reported-by: Eryu Guan Tested-by: Eryu Guan Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/list.h b/include/linux/list.h index 30cf420..5356f4d 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -113,17 +113,6 @@ extern void __list_del_entry(struct list_head *entry); extern void list_del(struct list_head *entry); #endif -#ifdef CONFIG_DEBUG_LIST -/* - * See devm_memremap_pages() which wants DEBUG_LIST=y to assert if one - * of the pages it allocates is ever passed to list_add() - */ -extern void list_force_poison(struct list_head *entry); -#else -/* fallback to the less strict LIST_POISON* definitions */ -#define list_force_poison list_del -#endif - /** * list_replace - replace old entry by new one * @old : the element to be replaced diff --git a/kernel/memremap.c b/kernel/memremap.c index b981a7b..778191e 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -351,8 +351,13 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, for_each_device_pfn(pfn, page_map) { struct page *page = pfn_to_page(pfn); - /* ZONE_DEVICE pages must never appear on a slab lru */ - list_force_poison(&page->lru); + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back + * pointer. It is a bug if a ZONE_DEVICE page is ever + * freed or placed on a driver-private list. Seed the + * storage with LIST_POISON* values. + */ + list_del(&page->lru); page->pgmap = pgmap; } devres_add(dev, page_map); diff --git a/lib/list_debug.c b/lib/list_debug.c index 3345a08..3859bf6 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -12,13 +12,6 @@ #include #include -static struct list_head force_poison; -void list_force_poison(struct list_head *entry) -{ - entry->next = &force_poison; - entry->prev = &force_poison; -} - /* * Insert a new entry between two known consecutive entries. * @@ -30,8 +23,6 @@ void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { - WARN(new->next == &force_poison || new->prev == &force_poison, - "list_add attempted on force-poisoned entry\n"); WARN(next->prev != prev, "list_add corruption. next->prev should be " "prev (%p), but was %p. (next=%p).\n", -- cgit v0.10.2 From 5f29a77cd95771edebbf41e5800fdd557c69302a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 9 Mar 2016 14:08:13 -0800 Subject: mm: fix mixed zone detection in devm_memremap_pages The check for whether we overlap "System RAM" needs to be done at section granularity. For example a system with the following mapping: 100000000-37bffffff : System RAM 37c000000-837ffffff : Persistent Memory ...is unable to use devm_memremap_pages() as it would result in two zones colliding within a given section. 
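The layout quoted above makes the problem easy to check by hand: with 128 MB sparsemem sections (an assumed value matching SECTION_SIZE_BITS = 27 on x86_64), the persistent memory start 0x37c000000 is not section-aligned, and rounding it down lands in a section that also holds the tail of System RAM. The userspace sketch below mirrors the alignment the fix now performs before calling region_intersects(); only the arithmetic is modeled.

    #include <stdio.h>

    #define SECTION_SIZE    (128ULL << 20)    /* assumed: 2^27 bytes */
    #define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))

    int main(void)
    {
        unsigned long long start = 0x37c000000ULL;  /* Persistent Memory */
        unsigned long long end   = 0x837ffffffULL;
        unsigned long long size  = end - start + 1;

        unsigned long long align_start = start & ~(SECTION_SIZE - 1);
        unsigned long long align_size  = ALIGN_UP(start + size, SECTION_SIZE)
                                         - align_start;

        /* align_start is 0x378000000, whose section still contains System
         * RAM (which runs up to 0x37bffffff), so a section-granular check
         * reports REGION_MIXED instead of a clean miss. */
        printf("check \"System RAM\" against %#llx + %#llx\n",
               align_start, align_size);
        return 0;
    }

The unaligned check saw no RAM inside the persistent memory range itself, so the later section-granular hotplug collided with the RAM zone; with the aligned check the mixed region is detected up front and the mapping attempt is refused.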
Signed-off-by: Dan Williams Cc: Ross Zwisler Reviewed-by: Toshi Kani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/memremap.c b/kernel/memremap.c index 778191e..60baf4d3 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -270,13 +270,16 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { - int is_ram = region_intersects(res->start, resource_size(res), - "System RAM"); resource_size_t key, align_start, align_size, align_end; struct dev_pagemap *pgmap; struct page_map *page_map; + int error, nid, is_ram; unsigned long pfn; - int error, nid; + + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) + - align_start; + is_ram = region_intersects(align_start, align_size, "System RAM"); if (is_ram == REGION_MIXED) { WARN_ONCE(1, "%s attempted on mixed region %pr\n", @@ -314,8 +317,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, mutex_lock(&pgmap_lock); error = 0; - align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(resource_size(res), SECTION_SIZE); align_end = align_start + align_size - 1; for (key = align_start; key <= align_end; key += SECTION_SIZE) { struct dev_pagemap *dup; -- cgit v0.10.2 From e3ae116339f9a0c77523abc95e338fa405946e07 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Mar 2016 14:08:15 -0800 Subject: kasan: add functions to clear stack poison Functions which the compiler has instrumented for ASAN place poison on the stack shadow upon entry and remove this poison prior to returning. In some cases (e.g. hotplug and idle), CPUs may exit the kernel a number of levels deep in C code. If there are any instrumented functions on this critical path, these will leave portions of the idle thread stack shadow poisoned. If a CPU returns to the kernel via a different path (e.g. a cold entry), then depending on stack frame layout subsequent calls to instrumented functions may use regions of the stack with stale poison, resulting in (spurious) KASAN splats to the console. Contemporary GCCs always add stack shadow poisoning when ASAN is enabled, even when asked to not instrument a function [1], so we can't simply annotate functions on the critical path to avoid poisoning. Instead, this series explicitly removes any stale poison before it can be hit. In the common hotplug case we clear the entire stack shadow in common code, before a CPU is brought online. On architectures which perform a cold return as part of cpu idle may retain an architecture-specific amount of stack contents. To retain the poison for this retained context, the arch code must call the core KASAN code, passing a "watermark" stack pointer value beyond which shadow will be cleared. Architectures which don't perform a cold return as part of idle do not need any additional code. This patch (of 3): Functions which the compiler has instrumented for KASAN place poison on the stack shadow upon entry and remove this poision prior to returning. In some cases (e.g. hotplug and idle), CPUs may exit the kernel a number of levels deep in C code. If there are any instrumented functions on this critical path, these will leave portions of the stack shadow poisoned. If a CPU returns to the kernel via a different path (e.g. 
a cold entry), then depending on stack frame layout subsequent calls to instrumented functions may use regions of the stack with stale poison, resulting in (spurious) KASAN splats to the console. To avoid this, we must clear stale poison from the stack prior to instrumented functions being called. This patch adds functions to the KASAN core for removing poison from (portions of) a task's stack. These will be used by subsequent patches to avoid problems with hotplug and idle. Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Lorenzo Pieralisi Cc: Will Deacon Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 4b9f85c..0fdc798 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -1,6 +1,7 @@ #ifndef _LINUX_KASAN_H #define _LINUX_KASAN_H +#include #include struct kmem_cache; @@ -13,7 +14,6 @@ struct vm_struct; #include #include -#include extern unsigned char kasan_zero_page[PAGE_SIZE]; extern pte_t kasan_zero_pte[PTRS_PER_PTE]; @@ -43,6 +43,8 @@ static inline void kasan_disable_current(void) void kasan_unpoison_shadow(const void *address, size_t size); +void kasan_unpoison_task_stack(struct task_struct *task); + void kasan_alloc_pages(struct page *page, unsigned int order); void kasan_free_pages(struct page *page, unsigned int order); @@ -66,6 +68,8 @@ void kasan_free_shadow(const struct vm_struct *vm); static inline void kasan_unpoison_shadow(const void *address, size_t size) {} +static inline void kasan_unpoison_task_stack(struct task_struct *task) {} + static inline void kasan_enable_current(void) {} static inline void kasan_disable_current(void) {} diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index bc0a8d8..1ad20ad 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -60,6 +61,25 @@ void kasan_unpoison_shadow(const void *address, size_t size) } } +static void __kasan_unpoison_stack(struct task_struct *task, void *sp) +{ + void *base = task_stack_page(task); + size_t size = sp - base; + + kasan_unpoison_shadow(base, size); +} + +/* Unpoison the entire stack for a task. */ +void kasan_unpoison_task_stack(struct task_struct *task) +{ + __kasan_unpoison_stack(task, task_stack_page(task) + THREAD_SIZE); +} + +/* Unpoison the stack for the current task beyond a watermark sp value. */ +asmlinkage void kasan_unpoison_remaining_stack(void *sp) +{ + __kasan_unpoison_stack(current, sp); +} /* * All functions below always inlined so compiler could -- cgit v0.10.2 From e1b77c92981a522223bd1ac118fdcade6b7ad086 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Mar 2016 14:08:18 -0800 Subject: sched/kasan: remove stale KASAN poison after hotplug Functions which the compiler has instrumented for KASAN place poison on the stack shadow upon entry and remove this poision prior to returning. In the case of CPU hotplug, CPUs exit the kernel a number of levels deep in C code. Any instrumented functions on this critical path will leave portions of the stack shadow poisoned. When a CPU is subsequently brought back into the kernel via a different path, depending on stackframe, layout calls to instrumented functions may hit this stale poison, resulting in (spurious) KASAN splats to the console. To avoid this, clear any stale poison from the idle thread for a CPU prior to bringing a CPU online. 
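As a mental model for what the new unpoisoning helpers do, the self-contained sketch below clears the shadow bytes covering a task's stack up to a watermark stack pointer. The 1:8 shadow scaling and the zero "fully accessible" shadow value are assumptions matching generic KASAN; the real kasan_unpoison_shadow() additionally handles a partially covered last granule, which is omitted here, and the stack base and size in main() are made-up example values.

    #include <string.h>

    #define SHADOW_SCALE_SHIFT  3   /* assumed: one shadow byte per 8 bytes */

    struct stack_model {
        unsigned char *shadow;      /* shadow bytes for the stack area */
        unsigned long base;         /* lowest stack address */
        unsigned long size;         /* THREAD_SIZE in the kernel */
    };

    /* Clear shadow for [base, sp): the unused part of the stack below the
     * watermark, which is where stale poison can linger. */
    static void model_unpoison_to(struct stack_model *s, unsigned long sp)
    {
        memset(s->shadow, 0, (sp - s->base) >> SHADOW_SCALE_SHIFT);
    }

    /* kasan_unpoison_task_stack() corresponds to sp = base + size;
     * kasan_unpoison_remaining_stack(sp) uses the caller's current sp. */
    static void model_unpoison_whole(struct stack_model *s)
    {
        model_unpoison_to(s, s->base + s->size);
    }

    int main(void)
    {
        static unsigned char shadow[(16 * 1024) >> SHADOW_SCALE_SHIFT];
        struct stack_model idle = { shadow, 0x10000000UL, 16 * 1024 };

        model_unpoison_whole(&idle);  /* what init_idle() now does for the idle task */
        return 0;
    }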
Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Reviewed-by: Andrey Ryabinin Reviewed-by: Ingo Molnar Cc: Alexander Potapenko Cc: Lorenzo Pieralisi Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9503d59..41f6b22 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,6 +26,7 @@ * Thomas Gleixner, Mike Kravetz */ +#include #include #include #include @@ -5096,6 +5097,8 @@ void init_idle(struct task_struct *idle, int cpu) idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); + kasan_unpoison_task_stack(idle); + #ifdef CONFIG_SMP /* * Its possible that init_idle() gets called multiple times on a task, -- cgit v0.10.2 From 0d97e6d8024c71cc838b292c01d5bd951e080eba Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Mar 2016 14:08:21 -0800 Subject: arm64: kasan: clear stale stack poison Functions which the compiler has instrumented for KASAN place poison on the stack shadow upon entry and remove this poison prior to returning. In the case of cpuidle, CPUs exit the kernel a number of levels deep in C code. Any instrumented functions on this critical path will leave portions of the stack shadow poisoned. If CPUs lose context and return to the kernel via a cold path, we restore a prior context saved in __cpu_suspend_enter are forgotten, and we never remove the poison they placed in the stack shadow area by functions calls between this and the actual exit of the kernel. Thus, (depending on stackframe layout) subsequent calls to instrumented functions may hit this stale poison, resulting in (spurious) KASAN splats to the console. To avoid this, clear any stale poison from the idle thread for a CPU prior to bringing a CPU online. Signed-off-by: Mark Rutland Acked-by: Catalin Marinas Reviewed-by: Andrey Ryabinin Reviewed-by: Lorenzo Pieralisi Cc: Alexander Potapenko Cc: Catalin Marinas Cc: Will Deacon Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index e33fe33..fd10eb6 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -145,6 +145,10 @@ ENTRY(cpu_resume_mmu) ENDPROC(cpu_resume_mmu) .popsection cpu_resume_after_mmu: +#ifdef CONFIG_KASAN + mov x0, sp + bl kasan_unpoison_remaining_stack +#endif mov x0, #0 // return zero on success ldp x19, x20, [sp, #16] ldp x21, x22, [sp, #32] -- cgit v0.10.2 From 566e8dfd88d9e0d12873ca69c26e82c0af8479d8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 9 Mar 2016 14:08:24 -0800 Subject: ocfs2: fix return value from ocfs2_page_mkwrite() ocfs2_page_mkwrite() could mistakenly return error code instead of mkwrite status value. Fix it. 
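The dax patch that follows makes the same kind of fix, so the shared pattern is worth spelling out once: a fault or mkwrite handler must translate internal error codes into VM_FAULT_* status bits before returning. The helper below is a made-up illustration of that translation, covering only the two errors these patches handle.

    #include <linux/errno.h>
    #include <linux/mm.h>       /* VM_FAULT_OOM, VM_FAULT_SIGBUS */

    /* Illustrative helper, not an existing kernel API. */
    static int mkwrite_errno_to_fault(int error)
    {
        if (error == -ENOMEM)
            return VM_FAULT_OOM;
        if (error < 0)
            return VM_FAULT_SIGBUS;
        return 0;   /* success: caller returns its own status, e.g. VM_FAULT_NOPAGE */
    }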
Signed-off-by: Jan Kara Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 9581d190..77ebc2b 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -147,6 +147,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret < 0) { mlog_errno(ret); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_SIGBUS; goto out; } -- cgit v0.10.2 From 30f471fd88e0304bee2c17ef1a4651e705870817 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 9 Mar 2016 14:08:27 -0800 Subject: dax: check return value of dax_radix_entry() dax_pfn_mkwrite() previously wasn't checking the return value of the call to dax_radix_entry(), which was a mistake. Instead, capture this return value and return the appropriate VM_FAULT_ value. Signed-off-by: Ross Zwisler Cc: Dan Williams Cc: Matthew Wilcox Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/dax.c b/fs/dax.c index 7111724..bbb2ad7 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1056,6 +1056,7 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault); int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; + int error; /* * We pass NO_SECTOR to dax_radix_entry() because we expect that a @@ -1065,7 +1066,13 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) * saves us from having to make a call to get_block() here to look * up the sector. */ - dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true); + error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, + true); + + if (error == -ENOMEM) + return VM_FAULT_OOM; + if (error) + return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); -- cgit v0.10.2 From 0a2e280b6d8ea4afef07c749070705d6af403b7f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 9 Mar 2016 14:08:30 -0800 Subject: mm, thp: fix migration of PTE-mapped transparent huge pages We don't have native support of THP migration, so we have to split huge page into small pages in order to migrate it to different node. This includes PTE-mapped huge pages. I made mistake in refcounting patchset: we don't actually split PTE-mapped huge page in queue_pages_pte_range(), if we step on head page. The result is that the head page is queued for migration, but none of tail pages: putting head page on queue takes pin on the page and any subsequent attempts of split_huge_pages() would fail and we skip queuing tail pages. unmap_and_move_huge_page() will eventually split the huge pages, but only one of 512 pages would get migrated. Let's fix the situation. Fixes: 248db92da13f2507 ("migrate_pages: try to split pages on queuing") Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4c4187c..9a3f6b9 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -532,7 +532,7 @@ retry: nid = page_to_nid(page); if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) continue; - if (PageTail(page) && PageAnon(page)) { + if (PageTransCompound(page) && PageAnon(page)) { get_page(page); pte_unmap_unlock(pte, ptl); lock_page(page); -- cgit v0.10.2 From ac343e882a8377caef5fa75d9093cb77e9d4bf6d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 9 Mar 2016 14:08:32 -0800 Subject: memremap: check pfn validity before passing to pfn_to_page() In memremap's helper function try_ram_remap(), we dereference a struct page pointer that was derived from a PFN that is known to be covered by a 'System RAM' iomem region, and is thus assumed to be a 'valid' PFN, i.e., a PFN that has a struct page associated with it and is covered by the kernel direct mapping. However, the assumption that there is a 1:1 relation between the System RAM iomem region and the kernel direct mapping is not universally valid on all architectures, and on ARM and arm64, 'System RAM' may include regions for which pfn_valid() returns false. Generally speaking, both __va() and pfn_to_page() should only ever be called on PFNs/physical addresses for which pfn_valid() returns true, so add that check to try_ram_remap(). Signed-off-by: Ard Biesheuvel Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/memremap.c b/kernel/memremap.c index 60baf4d3..6cf5461 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -29,10 +29,10 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) static void *try_ram_remap(resource_size_t offset, size_t size) { - struct page *page = pfn_to_page(offset >> PAGE_SHIFT); + unsigned long pfn = PHYS_PFN(offset); /* In the simple case just return the existing linear address */ - if (!PageHighMem(page)) + if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn))) return __va(offset); return NULL; /* fallback to ioremap_cache */ } -- cgit v0.10.2 From 86613628b3d367743f71b945c203774c522404f4 Mon Sep 17 00:00:00 2001 From: Jan Stancek Date: Wed, 9 Mar 2016 14:08:35 -0800 Subject: mm/hugetlb: use EOPNOTSUPP in hugetlb sysctl handlers Replace ENOTSUPP with EOPNOTSUPP. If hugepages are not supported, this value is propagated to userspace. EOPNOTSUPP is part of uapi and is widely supported by libc libraries. It gives nicer message to user, rather than: # cat /proc/sys/vm/nr_hugepages cat: /proc/sys/vm/nr_hugepages: Unknown error 524 And also LTP's proc01 test was failing because this ret code (524) was unexpected: proc01 1 TFAIL : proc01.c:396: read failed: /proc/sys/vm/nr_hugepages: errno=???(524): Unknown error 524 proc01 2 TFAIL : proc01.c:396: read failed: /proc/sys/vm/nr_hugepages_mempolicy: errno=???(524): Unknown error 524 proc01 3 TFAIL : proc01.c:396: read failed: /proc/sys/vm/nr_overcommit_hugepages: errno=???(524): Unknown error 524 Signed-off-by: Jan Stancek Acked-by: Naoya Horiguchi Acked-by: Kirill A. 
Shutemov Acked-by: David Rientjes Acked-by: Hillf Danton Cc: Mike Kravetz Cc: Dave Hansen Cc: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0e27a9d..aefba5a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2751,7 +2751,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, int ret; if (!hugepages_supported()) - return -ENOTSUPP; + return -EOPNOTSUPP; table->data = &tmp; table->maxlen = sizeof(unsigned long); @@ -2792,7 +2792,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, int ret; if (!hugepages_supported()) - return -ENOTSUPP; + return -EOPNOTSUPP; tmp = h->nr_overcommit_huge_pages; -- cgit v0.10.2 From d6b7eaeb03421139e32800324ef04ab50bba886d Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 9 Mar 2016 14:08:38 -0800 Subject: dma-mapping: avoid oops when parameter cpu_addr is null To keep consistent with kfree, which tolerate ptr is NULL. We do this because sometimes we may use goto statement, so that success and failure case can share parts of the code. But unfortunately, dma_free_coherent called with parameter cpu_addr is null will cause oops, such as showed below: Unable to handle kernel paging request at virtual address ffffffc020d3b2b8 pgd = ffffffc083a61000 [ffffffc020d3b2b8] *pgd=0000000000000000, *pud=0000000000000000 CPU: 4 PID: 1489 Comm: malloc_dma_1 Tainted: G O 4.1.12 #1 Hardware name: ARM64 (DT) PC is at __dma_free_coherent.isra.10+0x74/0xc8 LR is at __dma_free+0x9c/0xb0 Process malloc_dma_1 (pid: 1489, stack limit = 0xffffffc0837fc020) [...] Call trace: __dma_free_coherent.isra.10+0x74/0xc8 __dma_free+0x9c/0xb0 malloc_dma+0x104/0x158 [dma_alloc_coherent_mtmalloc] kthread+0xec/0xfc Signed-off-by: Zhen Lei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 75857cd..728ef07 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -386,7 +386,7 @@ static inline void dma_free_attrs(struct device *dev, size_t size, if (dma_release_from_coherent(dev, get_order(size), cpu_addr)) return; - if (!ops->free) + if (!ops->free || !cpu_addr) return; debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); -- cgit v0.10.2 From 6ffe77bad545f4a7c8edd2a4ee797ccfcd894ab4 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Sun, 21 Feb 2016 18:38:44 -0500 Subject: ext4: iterate over buffer heads correctly in move_extent_per_page() In commit bcff24887d00 ("ext4: don't read blocks from disk after extents being swapped") bh is not updated correctly in the for loop and wrong data has been written to disk. generic/324 catches this on sub-page block size ext4. Fixes: bcff24887d00 ("ext4: don't read blocks from disk after extentsbeing swapped") Signed-off-by: Eryu Guan Signed-off-by: Theodore Ts'o diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index e032a04..4098acc 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -390,6 +390,7 @@ data_copy: *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); if (*err < 0) break; + bh = bh->b_this_page; } if (!*err) *err = block_commit_write(pagep[0], from, from + replaced_size); -- cgit v0.10.2 From 3446c13b268af86391d06611327006b059b8bab1 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 15 Feb 2016 14:46:49 +0100 Subject: s390/mm: four page table levels vs. 
fork MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fork of a process with four page table levels is broken since git commit 6252d702c5311ce9 "[S390] dynamic page tables." All new mm contexts are created with three page table levels and an asce limit of 4TB. If the parent has four levels dup_mmap will add vmas to the new context which are outside of the asce limit. The subsequent call to copy_page_range will walk the three level page table structure of the new process with non-zero pgd and pud indexes. This leads to memory clobbers as the pgd_index *and* the pud_index is added to the mm->pgd pointer without a pgd_deref in between. The init_new_context() function is selecting the number of page table levels for a new context. The function is used by mm_init() which in turn is called by dup_mm() and mm_alloc(). These two are used by fork() and exec(). The init_new_context() function can distinguish the two cases by looking at mm->context.asce_limit, for fork() the mm struct has been copied and the number of page table levels may not change. For exec() the mm_alloc() function set the new mm structure to zero, in this case a three-level page table is created as the temporary stack space is located at STACK_TOP_MAX = 4TB. This fixes CVE-2016-2143. Reported-by: Marcin Kościelnicki Reviewed-by: Heiko Carstens Cc: stable@vger.kernel.org Signed-off-by: Martin Schwidefsky diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index fb1b93e..e485817 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -15,17 +15,25 @@ static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + spin_lock_init(&mm->context.list_lock); + INIT_LIST_HEAD(&mm->context.pgtable_list); + INIT_LIST_HEAD(&mm->context.gmap_list); cpumask_clear(&mm->context.cpu_attach_mask); atomic_set(&mm->context.attach_count, 0); mm->context.flush_mm = 0; - mm->context.asce_bits = _ASCE_TABLE_LENGTH | _ASCE_USER_BITS; - mm->context.asce_bits |= _ASCE_TYPE_REGION3; #ifdef CONFIG_PGSTE mm->context.alloc_pgste = page_table_allocate_pgste; mm->context.has_pgste = 0; mm->context.use_skey = 0; #endif - mm->context.asce_limit = STACK_TOP_MAX; + if (mm->context.asce_limit == 0) { + /* context created by exec, set asce limit to 4TB */ + mm->context.asce_bits = _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION3; + mm->context.asce_limit = STACK_TOP_MAX; + } else if (mm->context.asce_limit == (1UL << 31)) { + mm_inc_nr_pmds(mm); + } crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); return 0; } @@ -111,8 +119,6 @@ static inline void activate_mm(struct mm_struct *prev, static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { - if (oldmm->context.asce_limit < mm->context.asce_limit) - crst_table_downgrade(mm, oldmm->context.asce_limit); } static inline void arch_exit_mmap(struct mm_struct *mm) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 7b7858f..d7cc79f 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -100,12 +100,26 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - spin_lock_init(&mm->context.list_lock); - INIT_LIST_HEAD(&mm->context.pgtable_list); - INIT_LIST_HEAD(&mm->context.gmap_list); - return (pgd_t *) crst_table_alloc(mm); + unsigned long *table = crst_table_alloc(mm); + + if (!table) + return 
NULL; + if (mm->context.asce_limit == (1UL << 31)) { + /* Forking a compat process with 2 page table levels */ + if (!pgtable_pmd_page_ctor(virt_to_page(table))) { + crst_table_free(mm, table); + return NULL; + } + } + return (pgd_t *) table; +} + +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + if (mm->context.asce_limit == (1UL << 31)) + pgtable_pmd_page_dtor(virt_to_page(pgd)); + crst_table_free(mm, (unsigned long *) pgd); } -#define pgd_free(mm, pgd) crst_table_free(mm, (unsigned long *) pgd) static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte) -- cgit v0.10.2 From a318beea224d255c38652f88a7ae0d30ac4c23c3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:24 -0800 Subject: selftests/x86: In syscall_nt, test NT|TF as well Setting TF prevents fastpath returns in most cases, which causes the test to fail on 32-bit kernels because 32-bit kernels do not, in fact, handle NT correctly on SYSENTER entries. The next patch will fix 32-bit kernels. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/bd4bb48af6b10c0dc84aec6dbcf487ed25683495.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/syscall_nt.c b/tools/testing/selftests/x86/syscall_nt.c index 60c06af4..43fcab36 100644 --- a/tools/testing/selftests/x86/syscall_nt.c +++ b/tools/testing/selftests/x86/syscall_nt.c @@ -17,6 +17,9 @@ #include #include +#include +#include +#include #include #include @@ -26,6 +29,8 @@ # define WIDTH "l" #endif +static unsigned int nerrs; + static unsigned long get_eflags(void) { unsigned long eflags; @@ -39,16 +44,52 @@ static void set_eflags(unsigned long eflags) : : "rm" (eflags) : "flags"); } -int main() +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) { - printf("[RUN]\tSet NT and issue a syscall\n"); - set_eflags(get_eflags() | X86_EFLAGS_NT); + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static void sigtrap(int sig, siginfo_t *si, void *ctx_void) +{ +} + +static void do_it(unsigned long extraflags) +{ + unsigned long flags; + + set_eflags(get_eflags() | extraflags); syscall(SYS_getpid); - if (get_eflags() & X86_EFLAGS_NT) { - printf("[OK]\tThe syscall worked and NT is still set\n"); - return 0; + flags = get_eflags(); + if ((flags & extraflags) == extraflags) { + printf("[OK]\tThe syscall worked and flags are still set\n"); } else { - printf("[FAIL]\tThe syscall worked but NT was cleared\n"); - return 1; + printf("[FAIL]\tThe syscall worked but flags were cleared (flags = 0x%lx but expected 0x%lx set)\n", + flags, extraflags); + nerrs++; } } + +int main(void) +{ + printf("[RUN]\tSet NT and issue a syscall\n"); + do_it(X86_EFLAGS_NT); + + /* + * Now try it again with TF set -- TF forces returns via IRET in all + * cases except non-ptregs-using 64-bit full fast path syscalls. + */ + + sethandler(SIGTRAP, sigtrap, 0); + + printf("[RUN]\tSet NT|TF and issue a syscall\n"); + do_it(X86_EFLAGS_NT | X86_EFLAGS_TF); + + return nerrs == 0 ? 
0 : 1; +} -- cgit v0.10.2 From e786041153df6343169373177248abfab5c5ac1b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:25 -0800 Subject: x86/entry/compat: In SYSENTER, sink AC clearing below the existing FLAGS test CLAC is slow, and the SYSENTER code already has an unlikely path that runs if unusual flags are set. Drop the CLAC and instead rely on the unlikely path to clear AC. This seems to save ~24 cycles on my Skylake laptop. (Hey, Intel, make this faster please!) Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/90d6db2189f9add83bc7bddd75a0c19ebbd676b2.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 3c990ee..da5c194 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -66,8 +66,6 @@ ENTRY(entry_SYSENTER_compat) */ pushfq /* pt_regs->flags (except IF = 0) */ orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ - ASM_CLAC /* Clear AC after saving FLAGS */ - pushq $__USER32_CS /* pt_regs->cs */ xorq %r8,%r8 pushq %r8 /* pt_regs->ip = 0 (placeholder) */ @@ -90,9 +88,9 @@ ENTRY(entry_SYSENTER_compat) cld /* - * Sysenter doesn't filter flags, so we need to clear NT + * SYSENTER doesn't filter flags, so we need to clear NT and AC * ourselves. To save a few cycles, we can check whether - * NT was set instead of doing an unconditional popfq. + * either was set instead of doing an unconditional popfq. * This needs to happen before enabling interrupts so that * we don't get preempted with NT set. * @@ -102,7 +100,7 @@ ENTRY(entry_SYSENTER_compat) * we're keeping that code behind a branch which will predict as * not-taken and therefore its instructions won't be fetched. */ - testl $X86_EFLAGS_NT, EFLAGS(%rsp) + testl $X86_EFLAGS_NT|X86_EFLAGS_AC, EFLAGS(%rsp) jnz .Lsysenter_fix_flags .Lsysenter_flags_fixed: -- cgit v0.10.2 From 67f590e8d4d718d9bd377b39223f7f69678d6a10 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:26 -0800 Subject: x86/entry/32: Filter NT and speed up AC filtering in SYSENTER This makes the 32-bit code work just like the 64-bit code. It should speed up syscalls on 32-bit kernels on Skylake by something like 20 cycles (by analogy to the 64-bit compat case). It also cleans up NT just like we do for the 64-bit case. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/07daef3d44bd1ed62a2c866e143e8df64edb40ee.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index e130272..8daa812 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -294,7 +294,6 @@ sysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ pushl %ebp /* pt_regs->sp (stashed in bp) */ pushfl /* pt_regs->flags (except IF = 0) */ - ASM_CLAC /* Clear AC after saving FLAGS */ orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ pushl $__USER_CS /* pt_regs->cs */ pushl $0 /* pt_regs->ip = 0 (placeholder) */ @@ -302,6 +301,23 @@ sysenter_past_esp: SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ /* + * SYSENTER doesn't filter flags, so we need to clear NT and AC + * ourselves. 
To save a few cycles, we can check whether + * either was set instead of doing an unconditional popfq. + * This needs to happen before enabling interrupts so that + * we don't get preempted with NT set. + * + * NB.: .Lsysenter_fix_flags is a label with the code under it moved + * out-of-line as an optimization: NT is unlikely to be set in the + * majority of the cases and instead of polluting the I$ unnecessarily, + * we're keeping that code behind a branch which will predict as + * not-taken and therefore its instructions won't be fetched. + */ + testl $X86_EFLAGS_NT|X86_EFLAGS_AC, PT_EFLAGS(%esp) + jnz .Lsysenter_fix_flags +.Lsysenter_flags_fixed: + + /* * User mode is traced as though IRQs are on, and SYSENTER * turned them off. */ @@ -339,6 +355,11 @@ sysenter_past_esp: .popsection _ASM_EXTABLE(1b, 2b) PTGS_TO_GS_EX + +.Lsysenter_fix_flags: + pushl $X86_EFLAGS_FIXED + popfl + jmp .Lsysenter_flags_fixed ENDPROC(entry_SYSENTER_32) # system call handler stub -- cgit v0.10.2 From c2c9b52fab0d0cf993476ed4c34f24da5a1205ae Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:27 -0800 Subject: x86/entry/32: Restore FLAGS on SYSEXIT We weren't restoring FLAGS at all on SYSEXIT. Apparently no one cared. With this patch applied, native kernels should always honor task_pt_regs()->flags, which opens the door for some sys_iopl() cleanups. I'll do those as a separate series, though, since getting it right will involve tweaking some paravirt ops. ( The short version is that, before this patch, sys_iopl(), invoked via SYSENTER, wasn't guaranteed to ever transfer the updated regs->flags, so sys_iopl() had to change the hardware flags register as well. ) Reported-by: Brian Gerst Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3f98b207472dc9784838eb5ca2b89dcc845ce269.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 8daa812..7610906 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -343,6 +343,15 @@ sysenter_past_esp: popl %eax /* pt_regs->ax */ /* + * Restore all flags except IF. (We restore IF separately because + * STI gives a one-instruction window in which we won't be interrupted, + * whereas POPF does not.) + */ + addl $PT_EFLAGS-PT_DS, %esp /* point esp at pt_regs->flags */ + btr $X86_EFLAGS_IF_BIT, (%esp) + popfl + + /* * Return back to the vDSO, which will pop ecx and edx. * Don't bother with DS and ES (they already contain __USER_DS). */ -- cgit v0.10.2 From 81edd9f69a6fd214fdbe66b43de6aa1610c84c63 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:28 -0800 Subject: x86/entry/traps: Clear TIF_BLOCKSTEP on all debug exceptions The SDM says that debug exceptions clear BTF, and we need to keep TIF_BLOCKSTEP in sync with BTF. Clear it unconditionally and improve the comment. I suspect that the fact that kmemcheck could cause TIF_BLOCKSTEP not to be cleared was just an oversight. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/fa86e55d196e6dde5b38839595bde2a292c52fdc.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 410e8e2..1af5621 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -598,6 +598,13 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) dr6 &= ~DR6_RESERVED; /* + * The SDM says "The processor clears the BTF flag when it + * generates a debug exception." Clear TIF_BLOCKSTEP to keep + * TIF_BLOCKSTEP in sync with the hardware BTF flag. + */ + clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); + + /* * If dr6 has no reason to give us about the origin of this trap, * then it's very likely the result of an icebp/int01 trap. * User wants a sigtrap for that. @@ -612,11 +619,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) /* DR6 may or may not be cleared by the CPU */ set_debugreg(0, 6); - /* - * The processor cleared BTF, so don't mark that we need it set. - */ - clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); - /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; -- cgit v0.10.2 From 8bb5643686d2898bec181c407651cf84e77d413f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:29 -0800 Subject: x86/entry/traps: Clear DR6 early in do_debug() and improve the comment Leaving any bits set in DR6 on return from a debug exception is asking for trouble. Prevent it by writing zero right away and clarify the comment. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3857676e1be8fb27db4b89bbb1e2052b7f435ff4.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1af5621..7394be8 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -593,6 +593,18 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) ist_enter(regs); get_debugreg(dr6, 6); + /* + * The Intel SDM says: + * + * Certain debug exceptions may clear bits 0-3. The remaining + * contents of the DR6 register are never cleared by the + * processor. To avoid confusion in identifying debug + * exceptions, debug handlers should clear the register before + * returning to the interrupted task. + * + * Keep it simple: clear DR6 immediately. + */ + set_debugreg(0, 6); /* Filter out all the reserved bits which are preset to 1 */ dr6 &= ~DR6_RESERVED; @@ -616,9 +628,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) goto exit; - /* DR6 may or may not be cleared by the CPU */ - set_debugreg(0, 6); - /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; -- cgit v0.10.2 From f2b375756c839ff655a3e0b45e339f8fbf973093 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:30 -0800 Subject: x86/entry: Vastly simplify SYSENTER TF (single-step) handling Due to a blatant design error, SYSENTER doesn't clear TF (single-step). As a result, if a user does SYSENTER with TF set, we will single-step through the kernel until something clears TF. There is absolutely nothing we can do to prevent this short of turning off SYSENTER [1]. Simplify the handling considerably with two changes: 1. 
We already sanitize EFLAGS in SYSENTER to clear NT and AC. We can add TF to that list of flags to sanitize with no overhead whatsoever. 2. Teach do_debug() to ignore single-step traps in the SYSENTER prologue. That's all we need to do. Don't get too excited -- our handling is still buggy on 32-bit kernels. There's nothing wrong with the SYSENTER code itself, but the #DB prologue has a clever fixup for traps on the very first instruction of entry_SYSENTER_32, and the fixup doesn't work quite correctly. The next two patches will fix that. [1] We could probably prevent it by forcing BTF on at all times and making sure we clear TF before any branches in the SYSENTER code. Needless to say, this is a bad idea. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a30d2ea06fe4b621fe6a9ef911b02c0f38feb6f2.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 7610906..b2e1d44 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -287,7 +287,26 @@ need_resched: END(resume_kernel) #endif - # SYSENTER call handler stub +GLOBAL(__begin_SYSENTER_singlestep_region) +/* + * All code from here through __end_SYSENTER_singlestep_region is subject + * to being single-stepped if a user program sets TF and executes SYSENTER. + * There is absolutely nothing that we can do to prevent this from happening + * (thanks Intel!). To keep our handling of this situation as simple as + * possible, we handle TF just like AC and NT, except that our #DB handler + * will ignore all of the single-step traps generated in this range. + */ + +#ifdef CONFIG_XEN +/* + * Xen doesn't set %esp to be precisely what the normal SYSENTER + * entry point expects, so fix it up before using the normal path. + */ +ENTRY(xen_sysenter_target) + addl $5*4, %esp /* remove xen-provided frame */ + jmp sysenter_past_esp +#endif + ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp), %esp sysenter_past_esp: @@ -301,19 +320,25 @@ sysenter_past_esp: SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ /* - * SYSENTER doesn't filter flags, so we need to clear NT and AC - * ourselves. To save a few cycles, we can check whether + * SYSENTER doesn't filter flags, so we need to clear NT, AC + * and TF ourselves. To save a few cycles, we can check whether * either was set instead of doing an unconditional popfq. * This needs to happen before enabling interrupts so that * we don't get preempted with NT set. * + * If TF is set, we will single-step all the way to here -- do_debug + * will ignore all the traps. (Yes, this is slow, but so is + * single-stepping in general. This allows us to avoid having + * a more complicated code to handle the case where a user program + * forces us to single-step through the SYSENTER entry code.) + * * NB.: .Lsysenter_fix_flags is a label with the code under it moved * out-of-line as an optimization: NT is unlikely to be set in the * majority of the cases and instead of polluting the I$ unnecessarily, * we're keeping that code behind a branch which will predict as * not-taken and therefore its instructions won't be fetched. 
*/ - testl $X86_EFLAGS_NT|X86_EFLAGS_AC, PT_EFLAGS(%esp) + testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, PT_EFLAGS(%esp) jnz .Lsysenter_fix_flags .Lsysenter_flags_fixed: @@ -369,6 +394,7 @@ sysenter_past_esp: pushl $X86_EFLAGS_FIXED popfl jmp .Lsysenter_flags_fixed +GLOBAL(__end_SYSENTER_singlestep_region) ENDPROC(entry_SYSENTER_32) # system call handler stub @@ -651,14 +677,6 @@ ENTRY(spurious_interrupt_bug) END(spurious_interrupt_bug) #ifdef CONFIG_XEN -/* - * Xen doesn't set %esp to be precisely what the normal SYSENTER - * entry point expects, so fix it up before using the normal path. - */ -ENTRY(xen_sysenter_target) - addl $5*4, %esp /* remove xen-provided frame */ - jmp sysenter_past_esp - ENTRY(xen_hypervisor_callback) pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index da5c194..cbb5e69 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -94,13 +94,19 @@ ENTRY(entry_SYSENTER_compat) * This needs to happen before enabling interrupts so that * we don't get preempted with NT set. * + * If TF is set, we will single-step all the way to here -- do_debug + * will ignore all the traps. (Yes, this is slow, but so is + * single-stepping in general. This allows us to avoid having + * a more complicated code to handle the case where a user program + * forces us to single-step through the SYSENTER entry code.) + * * NB.: .Lsysenter_fix_flags is a label with the code under it moved * out-of-line as an optimization: NT is unlikely to be set in the * majority of the cases and instead of polluting the I$ unnecessarily, * we're keeping that code behind a branch which will predict as * not-taken and therefore its instructions won't be fetched. */ - testl $X86_EFLAGS_NT|X86_EFLAGS_AC, EFLAGS(%rsp) + testl $X86_EFLAGS_NT|X86_EFLAGS_AC|X86_EFLAGS_TF, EFLAGS(%rsp) jnz .Lsysenter_fix_flags .Lsysenter_flags_fixed: @@ -121,6 +127,7 @@ ENTRY(entry_SYSENTER_compat) pushq $X86_EFLAGS_FIXED popfq jmp .Lsysenter_flags_fixed +GLOBAL(__end_entry_SYSENTER_compat) ENDPROC(entry_SYSENTER_compat) /* diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index a4a7728..9b9b30b 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -7,12 +7,23 @@ void syscall_init(void); +#ifdef CONFIG_X86_64 void entry_SYSCALL_64(void); -void entry_SYSCALL_compat(void); +#endif + +#ifdef CONFIG_X86_32 void entry_INT80_32(void); -void entry_INT80_compat(void); void entry_SYSENTER_32(void); +void __begin_SYSENTER_singlestep_region(void); +void __end_SYSENTER_singlestep_region(void); +#endif + +#ifdef CONFIG_IA32_EMULATION void entry_SYSENTER_compat(void); +void __end_entry_SYSENTER_compat(void); +void entry_SYSCALL_compat(void); +void entry_INT80_compat(void); +#endif void x86_configure_nx(void); void x86_report_nx(void); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7394be8..b0ddb819 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -559,6 +559,29 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) NOKPROBE_SYMBOL(fixup_bad_iret); #endif +static bool is_sysenter_singlestep(struct pt_regs *regs) +{ + /* + * We don't try for precision here. If we're anywhere in the region of + * code that can be single-stepped in the SYSENTER entry path, then + * assume that this is a useless single-step trap due to SYSENTER + * being invoked with TF set. 
(We don't know in advance exactly + * which instructions will be hit because BTF could plausibly + * be set.) + */ +#ifdef CONFIG_X86_32 + return (regs->ip - (unsigned long)__begin_SYSENTER_singlestep_region) < + (unsigned long)__end_SYSENTER_singlestep_region - + (unsigned long)__begin_SYSENTER_singlestep_region; +#elif defined(CONFIG_IA32_EMULATION) + return (regs->ip - (unsigned long)entry_SYSENTER_compat) < + (unsigned long)__end_entry_SYSENTER_compat - + (unsigned long)entry_SYSENTER_compat; +#else + return false; +#endif +} + /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. Therefore @@ -616,6 +639,18 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) */ clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); + if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) && + is_sysenter_singlestep(regs))) { + dr6 &= ~DR_STEP; + if (!dr6) + goto exit; + /* + * else we might have gotten a single-step trap and hit a + * watchpoint at the same time, in which case we should fall + * through and handle the watchpoint. + */ + } + /* * If dr6 has no reason to give us about the origin of this trap, * then it's very likely the result of an icebp/int01 trap. @@ -624,7 +659,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) if (!dr6 && user_mode(regs)) user_icebp = 1; - /* Catch kmemcheck conditions first of all! */ + /* Catch kmemcheck conditions! */ if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) goto exit; @@ -659,14 +694,13 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) goto exit; } - /* - * Single-stepping through system calls: ignore any exceptions in - * kernel space, but re-enable TF when returning to user mode. - * - * We already checked v86 mode above, so we can check for kernel mode - * by just checking the CPL of CS. - */ - if ((dr6 & DR_STEP) && !user_mode(regs)) { + if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) { + /* + * Historical junk that used to handle SYSENTER single-stepping. + * This should be unreachable now. If we survive for a while + * without anyone hitting this warning, we'll turn this into + * an oops. + */ tsk->thread.debugreg6 &= ~DR_STEP; set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; -- cgit v0.10.2 From 6dcc94149d605908a7c0c4cf2085340637aac86d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:31 -0800 Subject: x86/entry: Only allocate space for tss_struct::SYSENTER_stack if needed The SYSENTER stack is only used on 32-bit kernels. Remove it on 64-bit kernels. ( We may end up using it down the road on 64-bit kernels. If so, we'll re-enable it for CONFIG_IA32_EMULATION. ) Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/9dbd18429f9ff61a76b6eda97a9ea20510b9f6ba.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ecb4103..7cd01b7 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -297,10 +297,12 @@ struct tss_struct { */ unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; +#ifdef CONFIG_X86_32 /* * Space for the temporary SYSENTER stack: */ unsigned long SYSENTER_stack[64]; +#endif } ____cacheline_aligned; -- cgit v0.10.2 From 7536656f08d0c1a3b4c487d00785c5186ec6f533 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:32 -0800 Subject: x86/entry/32: Simplify and fix up the SYSENTER stack #DB/NMI fixup Right after SYSENTER, we can get a #DB or NMI. On x86_32, there's no IST, so the exception handler is invoked on the temporary SYSENTER stack. Because the SYSENTER stack is very small, we have a fixup to switch off the stack quickly when this happens. The old fixup had several issues: 1. It checked the interrupt frame's CS and EIP. This wasn't obviously correct on Xen or if vm86 mode was in use [1]. 2. In the NMI handler, it did some frightening digging into the stack frame. I'm not convinced this digging was correct. 3. The fixup didn't switch stacks and then switch back. Instead, it synthesized a brand new stack frame that would redirect the IRET back to the SYSENTER code. That frame was highly questionable. For one thing, if NMI nested inside #DB, we would effectively abort the #DB prologue, which was probably safe but was frightening. For another, the code used PUSHFL to write the FLAGS portion of the frame, which was simply bogus -- by the time PUSHFL was called, at least TF, NT, VM, and all of the arithmetic flags were clobbered. Simplify this considerably. Instead of looking at the saved frame to see where we came from, check the hardware ESP register against the SYSENTER stack directly. Malicious user code cannot spoof the kernel ESP register, and by moving the check after SAVE_ALL, we can use normal PER_CPU accesses to find all the relevant addresses. With this patch applied, the improved syscall_nt_32 test finally passes on 32-bit kernels. [1] It isn't obviously correct, but it is nonetheless safe from vm86 shenanigans as far as I can tell. A user can't point EIP at entry_SYSENTER_32 while in vm86 mode because entry_SYSENTER_32, like all kernel addresses, is greater than 0xffff and would thus violate the CS segment limit. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b2cdbc037031c07ecf2c40a96069318aec0e7971.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index b2e1d44..7b3ec24 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -976,51 +976,48 @@ error_code: jmp ret_from_exception END(page_fault) -/* - * Debug traps and NMI can happen at the one SYSENTER instruction - * that sets up the real kernel stack. Check here, since we can't - * allow the wrong stack to be used. - * - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip. 
- * - * We just load the right stack, and push the three (known) values - * by hand onto the new stack - while updating the return eip past - * the instruction that would have done it for sysenter. - */ -.macro FIX_STACK offset ok label - cmpw $__KERNEL_CS, 4(%esp) - jne \ok -\label: - movl TSS_sysenter_sp0 + \offset(%esp), %esp - pushfl - pushl $__KERNEL_CS - pushl $sysenter_past_esp -.endm - ENTRY(debug) + /* + * #DB can happen at the first instruction of + * entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this + * happens, then we will be running on a very small stack. We + * need to detect this condition and switch to the thread + * stack before calling any C code at all. + * + * If you edit this code, keep in mind that NMIs can happen in here. + */ ASM_CLAC - cmpl $entry_SYSENTER_32, (%esp) - jne debug_stack_correct - FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn -debug_stack_correct: pushl $-1 # mark this as an int SAVE_ALL - TRACE_IRQS_OFF xorl %edx, %edx # error code 0 movl %esp, %eax # pt_regs pointer + + /* Are we currently on the SYSENTER stack? */ + PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Ldebug_from_sysenter_stack + + TRACE_IRQS_OFF + call do_debug + jmp ret_from_exception + +.Ldebug_from_sysenter_stack: + /* We're on the SYSENTER stack. Switch off. */ + movl %esp, %ebp + movl PER_CPU_VAR(cpu_current_top_of_stack), %esp + TRACE_IRQS_OFF call do_debug + movl %ebp, %esp jmp ret_from_exception END(debug) /* - * NMI is doubly nasty. It can happen _while_ we're handling - * a debug fault, and the debug fault hasn't yet been able to - * clear up the stack. So we first check whether we got an - * NMI on the sysenter entry path, but after that we need to - * check whether we got an NMI on the debug path where the debug - * fault happened on the sysenter path. + * NMI is doubly nasty. It can happen on the first instruction of + * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning + * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32 + * switched stacks. We handle both conditions by simply checking whether we + * interrupted kernel code running on the SYSENTER stack. */ ENTRY(nmi) ASM_CLAC @@ -1031,41 +1028,32 @@ ENTRY(nmi) popl %eax je nmi_espfix_stack #endif - cmpl $entry_SYSENTER_32, (%esp) - je nmi_stack_fixup - pushl %eax - movl %esp, %eax - /* - * Do not access memory above the end of our stack page, - * it might not exist. - */ - andl $(THREAD_SIZE-1), %eax - cmpl $(THREAD_SIZE-20), %eax - popl %eax - jae nmi_stack_correct - cmpl $entry_SYSENTER_32, 12(%esp) - je nmi_debug_stack_check -nmi_stack_correct: - pushl %eax + + pushl %eax # pt_regs->orig_ax SAVE_ALL xorl %edx, %edx # zero error code movl %esp, %eax # pt_regs pointer + + /* Are we currently on the SYSENTER stack? */ + PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) + subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ + cmpl $SIZEOF_SYSENTER_stack, %ecx + jb .Lnmi_from_sysenter_stack + + /* Not on SYSENTER stack. 
*/ call do_nmi jmp restore_all_notrace -nmi_stack_fixup: - FIX_STACK 12, nmi_stack_correct, 1 - jmp nmi_stack_correct - -nmi_debug_stack_check: - cmpw $__KERNEL_CS, 16(%esp) - jne nmi_stack_correct - cmpl $debug, (%esp) - jb nmi_stack_correct - cmpl $debug_esp_fix_insn, (%esp) - ja nmi_stack_correct - FIX_STACK 24, nmi_stack_correct, 1 - jmp nmi_stack_correct +.Lnmi_from_sysenter_stack: + /* + * We're on the SYSENTER stack. Switch off. No one (not even debug) + * is using the thread stack right now, so it's safe for us to use it. + */ + movl %esp, %ebp + movl PER_CPU_VAR(cpu_current_top_of_stack), %esp + call do_nmi + movl %ebp, %esp + jmp restore_all_notrace #ifdef CONFIG_X86_ESPFIX32 nmi_espfix_stack: diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index fdeb0ce..ecdc1d2 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -52,6 +52,11 @@ void foo(void) DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - offsetofend(struct tss_struct, SYSENTER_stack)); + /* Offset from cpu_tss to SYSENTER_stack */ + OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); + /* Size of SYSENTER_stack */ + DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); + #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); -- cgit v0.10.2 From 2a41aa4feb25af3ead60b740c43df80c576efea2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:33 -0800 Subject: x86/entry/32: Add and check a stack canary for the SYSENTER stack The first instruction of the SYSENTER entry runs on its own tiny stack. That stack can be used if a #DB or NMI is delivered before the SYSENTER prologue switches to a real stack. We have code in place to prevent us from overflowing the tiny stack. For added paranoia, add a canary to the stack and check it in do_debug() -- that way, if something goes wrong with the #DB logic, we'll eventually notice. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/6ff9a806f39098b166dc2c41c1db744df5272f29.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7cd01b7..50a6dc8 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -299,8 +299,9 @@ struct tss_struct { #ifdef CONFIG_X86_32 /* - * Space for the temporary SYSENTER stack: + * Space for the temporary SYSENTER stack. */ + unsigned long SYSENTER_stack_canary; unsigned long SYSENTER_stack[64]; #endif diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9f7c21c..ee9a979 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -57,6 +57,9 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { */ .io_bitmap = { [0 ... 
IO_BITMAP_LONGS] = ~0 }, #endif +#ifdef CONFIG_X86_32 + .SYSENTER_stack_canary = STACK_END_MAGIC, +#endif }; EXPORT_PER_CPU_SYMBOL(cpu_tss); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b0ddb819..49e2e77 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -713,6 +713,14 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_dec(); exit: +#if defined(CONFIG_X86_32) + /* + * This is the most likely code path that involves non-trivial use + * of the SYSENTER stack. Check that we haven't overrun it. + */ + WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC, + "Overran or corrupted SYSENTER stack\n"); +#endif ist_exit(regs); } NOKPROBE_SYMBOL(do_debug); -- cgit v0.10.2 From 392a62549fbb80da48811d04391615a25c39d091 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:34 -0800 Subject: x86/entry: Remove TIF_SINGLESTEP entry work Now that SYSENTER with TF set puts X86_EFLAGS_TF directly into regs->flags, we don't need a TIF_SINGLESTEP fixup in the syscall entry code. Remove it. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2d15f24da52dafc9d2f0b8d76f55544f4779c517.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 1a000f5..209ba33 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -172,16 +172,6 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) BUG_ON(regs != task_pt_regs(current)); - /* - * If we stepped into a sysenter/syscall insn, it trapped in - * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. - * If user-mode had set TF itself, then it's still clear from - * do_debug() and we need to set it again to restore the user - * state. If we entered on the slow path, TF was already set. - */ - if (work & _TIF_SINGLESTEP) - regs->flags |= X86_EFLAGS_TF; - #ifdef CONFIG_SECCOMP /* * Call seccomp_phase2 before running the other hooks so that diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index c0778fc..f2e2302 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -137,7 +137,7 @@ struct thread_info { /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ - _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \ _TIF_NOHZ) /* work to do on any return to user space */ -- cgit v0.10.2 From fda57b2267e12de014069b1596a5438cf76fc7c6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 19:00:35 -0800 Subject: x86/entry: Improve system call entry comments Ingo suggested that the comments should explain when the various entries are used. This adds these explanations and improves other parts of the comments. Signed-off-by: Andy Lutomirski Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/9524ecef7a295347294300045d08354d6a57c6e7.1457578375.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 7b3ec24..286efa3 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -307,6 +307,38 @@ ENTRY(xen_sysenter_target) jmp sysenter_past_esp #endif +/* + * 32-bit SYSENTER entry. + * + * 32-bit system calls through the vDSO's __kernel_vsyscall enter here + * if X86_FEATURE_SEP is available. This is the preferred system call + * entry on 32-bit systems. + * + * The SYSENTER instruction, in principle, should *only* occur in the + * vDSO. In practice, a small number of Android devices were shipped + * with a copy of Bionic that inlined a SYSENTER instruction. This + * never happened in any of Google's Bionic versions -- it only happened + * in a narrow range of Intel-provided versions. + * + * SYSENTER loads SS, ESP, CS, and EIP from previously programmed MSRs. + * IF and VM in RFLAGS are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old EIP (!!!), ESP, or EFLAGS. + * + * To avoid losing track of EFLAGS.VM (and thus potentially corrupting + * user and/or vm86 state), we explicitly disable the SYSENTER + * instruction in vm86 mode by reprogramming the MSRs. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + */ ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp), %esp sysenter_past_esp: @@ -397,7 +429,34 @@ sysenter_past_esp: GLOBAL(__end_SYSENTER_singlestep_region) ENDPROC(entry_SYSENTER_32) - # system call handler stub +/* + * 32-bit legacy system call entry. + * + * 32-bit x86 Linux system calls traditionally used the INT $0x80 + * instruction. INT $0x80 lands here. + * + * This entry point can be used by any 32-bit perform system calls. + * Instances of INT $0x80 can be found inline in various programs and + * libraries. It is also used by the vDSO's __kernel_vsyscall + * fallback for hardware that doesn't support a faster entry method. + * Restarted 32-bit system calls also fall back to INT $0x80 + * regardless of what instruction was originally used to do the system + * call. (64-bit programs can use INT $0x80 as well, but they can + * only run on 64-bit kernels and therefore land in + * entry_INT80_compat.) + * + * This is considered a slow path. It is not used by most libc + * implementations on modern hardware except during process startup. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 + */ ENTRY(entry_INT80_32) ASM_CLAC pushl %eax /* pt_regs->orig_ax */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 70eadb0..858b555 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -103,6 +103,16 @@ ENDPROC(native_usergs_sysret64) /* * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers. * + * This is the only entry point used for 64-bit system calls. The + * hardware interface is reasonably well designed and the register to + * argument mapping Linux uses fits well with the registers that are + * available when SYSCALL is used. + * + * SYSCALL instructions can be found inlined in libc implementations as + * well as some other programs and libraries. 
There are also a handful + * of SYSCALL instructions in the vDSO used, for example, as a + * clock_gettimeofday fallback. + * * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, * then loads new ss, cs, and rip from previously programmed MSRs. * rflags gets masked by a value from another MSR (so CLD and CLAC diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index cbb5e69..efe0fe3 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -19,12 +19,21 @@ .section .entry.text, "ax" /* - * 32-bit SYSENTER instruction entry. + * 32-bit SYSENTER entry. * - * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. - * IF and VM in rflags are cleared (IOW: interrupts are off). + * 32-bit system calls through the vDSO's __kernel_vsyscall enter here + * on 64-bit kernels running on Intel CPUs. + * + * The SYSENTER instruction, in principle, should *only* occur in the + * vDSO. In practice, a small number of Android devices were shipped + * with a copy of Bionic that inlined a SYSENTER instruction. This + * never happened in any of Google's Bionic versions -- it only happened + * in a narrow range of Intel-provided versions. + * + * SYSENTER loads SS, RSP, CS, and RIP from previously programmed MSRs. + * IF and VM in RFLAGS are cleared (IOW: interrupts are off). * SYSENTER does not save anything on the stack, - * and does not save old rip (!!!) and rflags. + * and does not save old RIP (!!!), RSP, or RFLAGS. * * Arguments: * eax system call number @@ -35,10 +44,6 @@ * edi arg5 * ebp user stack * 0(%ebp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. */ ENTRY(entry_SYSENTER_compat) /* Interrupts are off on entry. */ @@ -131,17 +136,38 @@ GLOBAL(__end_entry_SYSENTER_compat) ENDPROC(entry_SYSENTER_compat) /* - * 32-bit SYSCALL instruction entry. + * 32-bit SYSCALL entry. + * + * 32-bit system calls through the vDSO's __kernel_vsyscall enter here + * on 64-bit kernels running on AMD CPUs. + * + * The SYSCALL instruction, in principle, should *only* occur in the + * vDSO. In practice, it appears that this really is the case. + * As evidence: + * + * - The calling convention for SYSCALL has changed several times without + * anyone noticing. + * + * - Prior to the in-kernel X86_BUG_SYSRET_SS_ATTRS fixup, anything + * user task that did SYSCALL without immediately reloading SS + * would randomly crash. * - * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, - * then loads new ss, cs, and rip from previously programmed MSRs. - * rflags gets masked by a value from another MSR (so CLD and CLAC - * are not needed). SYSCALL does not save anything on the stack - * and does not change rsp. + * - Most programmers do not directly target AMD CPUs, and the 32-bit + * SYSCALL instruction does not exist on Intel CPUs. Even on AMD + * CPUs, Linux disables the SYSCALL instruction on 32-bit kernels + * because the SYSCALL instruction in legacy/native 32-bit mode (as + * opposed to compat mode) is sufficiently poorly designed as to be + * essentially unusable. * - * Note: rflags saving+masking-with-MSR happens only in Long mode + * 32-bit SYSCALL saves RIP to RCX, clears RFLAGS.RF, then saves + * RFLAGS to R11, then loads new SS, CS, and RIP from previously + * programmed MSRs. RFLAGS gets masked by a value from another MSR + * (so CLD and CLAC are not needed). 
SYSCALL does not save anything on + * the stack and does not change RSP. + * + * Note: RFLAGS saving+masking-with-MSR happens only in Long mode * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). - * Don't get confused: rflags saving+masking depends on Long Mode Active bit + * Don't get confused: RFLAGS saving+masking depends on Long Mode Active bit * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). * @@ -241,7 +267,21 @@ sysret32_from_system_call: END(entry_SYSCALL_compat) /* - * Emulated IA32 system calls via int 0x80. + * 32-bit legacy system call entry. + * + * 32-bit x86 Linux system calls traditionally used the INT $0x80 + * instruction. INT $0x80 lands here. + * + * This entry point can be used by 32-bit and 64-bit programs to perform + * 32-bit system calls. Instances of INT $0x80 can be found inline in + * various programs and libraries. It is also used by the vDSO's + * __kernel_vsyscall fallback for hardware that doesn't support a faster + * entry method. Restarted 32-bit system calls also fall back to INT + * $0x80 regardless of what instruction was originally used to do the + * system call. + * + * This is considered a slow path. It is not used by most libc + * implementations on modern hardware except during process startup. * * Arguments: * eax system call number @@ -250,17 +290,8 @@ END(entry_SYSCALL_compat) * edx arg3 * esi arg4 * edi arg5 - * ebp arg6 (note: not saved in the stack frame, should not be touched) - * - * Notes: - * Uses the same stack frame as the x86-64 version. - * All registers except eax must be saved (but ptrace may violate that). - * Arguments are zero extended. For system calls that want sign extension and - * take long arguments a wrapper is needed. Most calls can just be called - * directly. - * Assumes it is only called from user space and entered with interrupts off. + * ebp arg6 */ - ENTRY(entry_INT80_compat) /* * Interrupts are off on entry. -- cgit v0.10.2 From a65050c6f17e52442716138d48d0a47301a8344b Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Wed, 9 Mar 2016 16:28:54 -0800 Subject: x86/fpu: Revert ("x86/fpu: Disable AVX when eagerfpu is off") Leonid Shatz noticed that the SDM interpretation of the following recent commit: 394db20ca240741 ("x86/fpu: Disable AVX when eagerfpu is off") ... is incorrect and that the original behavior of the FPU code was correct. Because AVX is not stated in CR0 TS bit description, it was mistakenly believed to be not supported for lazy context switch. This turns out to be false: Intel Software Developer's Manual Vol. 3A, Sec. 2.5 Control Registers: 'TS Task Switched bit (bit 3 of CR0) -- Allows the saving of the x87 FPU/ MMX/SSE/SSE2/SSE3/SSSE3/SSE4 context on a task switch to be delayed until an x87 FPU/MMX/SSE/SSE2/SSE3/SSSE3/SSE4 instruction is actually executed by the new task.' Intel Software Developer's Manual Vol. 2A, Sec. 2.4 Instruction Exception Specification: 'AVX instructions refer to exceptions by classes that include #NM "Device Not Available" exception for lazy context switch.' So revert the commit. Reported-by: Leonid Shatz Signed-off-by: Yu-cheng Yu Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Fenghua Yu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ravi V. 
Shankar Cc: Sai Praneeth Prakhya Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1457569734-3785-1-git-send-email-yu-cheng.yu@intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index af30fde..f23cd8c 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -20,16 +20,15 @@ /* Supported features which support lazy state saving */ #define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \ - XFEATURE_MASK_SSE) - -/* Supported features which require eager state saving */ -#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | \ - XFEATURE_MASK_BNDCSR | \ + XFEATURE_MASK_SSE | \ XFEATURE_MASK_YMM | \ XFEATURE_MASK_OPMASK | \ XFEATURE_MASK_ZMM_Hi256 | \ XFEATURE_MASK_Hi16_ZMM) +/* Supported features which require eager state saving */ +#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR) + /* All currently supported features */ #define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index d53ab3d..9ee7e30 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -302,12 +302,6 @@ u64 __init fpu__get_supported_xfeatures_mask(void) static void __init fpu__clear_eager_fpu_features(void) { setup_clear_cpu_cap(X86_FEATURE_MPX); - setup_clear_cpu_cap(X86_FEATURE_AVX); - setup_clear_cpu_cap(X86_FEATURE_AVX2); - setup_clear_cpu_cap(X86_FEATURE_AVX512F); - setup_clear_cpu_cap(X86_FEATURE_AVX512PF); - setup_clear_cpu_cap(X86_FEATURE_AVX512ER); - setup_clear_cpu_cap(X86_FEATURE_AVX512CD); } /* -- cgit v0.10.2 From 90d1098478fb08a1ef166fe91622d8046869e17b Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 9 Mar 2016 17:55:35 -0800 Subject: locking/csd_lock: Explicitly inline csd_lock*() helpers While the compiler tends to already to it for us (except for csd_unlock), make it explicit. These helpers mainly deal with the ->flags, are short-lived and can be called, for example, from smp_call_function_many(). Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Link: http://lkml.kernel.org/r/1457574936-19065-2-git-send-email-dbueso@suse.de Signed-off-by: Ingo Molnar diff --git a/kernel/smp.c b/kernel/smp.c index d903c02..5099db1 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -105,13 +105,13 @@ void __init call_function_init(void) * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. 
*/ -static void csd_lock_wait(struct call_single_data *csd) +static __always_inline void csd_lock_wait(struct call_single_data *csd) { while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK) cpu_relax(); } -static void csd_lock(struct call_single_data *csd) +static __always_inline void csd_lock(struct call_single_data *csd) { csd_lock_wait(csd); csd->flags |= CSD_FLAG_LOCK; @@ -124,7 +124,7 @@ static void csd_lock(struct call_single_data *csd) smp_wmb(); } -static void csd_unlock(struct call_single_data *csd) +static __always_inline void csd_unlock(struct call_single_data *csd) { WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); -- cgit v0.10.2 From 38460a2178d225b39ade5ac66586c3733391cf86 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 9 Mar 2016 17:55:36 -0800 Subject: locking/csd_lock: Use smp_cond_acquire() in csd_lock_wait() We can micro-optimize this call and mildly relax the barrier requirements by relying on ctrl + rmb, keeping the acquire semantics. In addition, this is pretty much the now standard for busy-waiting under such restraints. Signed-off-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dave@stgolabs.net Link: http://lkml.kernel.org/r/1457574936-19065-3-git-send-email-dbueso@suse.de Signed-off-by: Ingo Molnar diff --git a/kernel/smp.c b/kernel/smp.c index 5099db1..300d293 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -107,8 +107,7 @@ void __init call_function_init(void) */ static __always_inline void csd_lock_wait(struct call_single_data *csd) { - while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK) - cpu_relax(); + smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK)); } static __always_inline void csd_lock(struct call_single_data *csd) -- cgit v0.10.2 From a798f091113ef4999277dbe0483d37d04fa35b2e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 13:24:32 -0800 Subject: x86/entry/32: Change INT80 to be an interrupt gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want all of the syscall entries to run with interrupts off so that we can efficiently run context tracking before enabling interrupts. This will regress int $0x80 performance on 32-bit kernels by a couple of cycles. This shouldn't matter much -- int $0x80 is not a fast path. This effectively reverts: 657c1eea0019 ("x86/entry/32: Fix entry_INT80_32() to expect interrupts to be on") ... and fixes the same issue differently. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frédéric Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/59b4f90c9ebfccd8c937305dbbbca680bc74b905.1457558566.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 209ba33..d69d1b6 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -371,14 +371,7 @@ __visible void do_syscall_64(struct pt_regs *regs) * in workloads that use it, and it's usually called from * do_fast_syscall_32, so forcibly inline it to improve performance. */ -#ifdef CONFIG_X86_32 -/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */ -__visible -#else -/* 64-bit kernels use do_syscall_32_irqs_off() instead. 
*/ -static -#endif -__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) +static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) { struct thread_info *ti = pt_regs_to_thread_info(regs); unsigned int nr = (unsigned int)regs->orig_ax; @@ -413,14 +406,12 @@ __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) syscall_return_slowpath(regs); } -#ifdef CONFIG_X86_64 -/* Handles INT80 on 64-bit kernels */ -__visible void do_syscall_32_irqs_off(struct pt_regs *regs) +/* Handles int $0x80 */ +__visible void do_int80_syscall_32(struct pt_regs *regs) { local_irq_enable(); do_syscall_32_irqs_on(regs); } -#endif /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ __visible long do_fast_syscall_32(struct pt_regs *regs) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 286efa3..10868aa 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -463,13 +463,13 @@ ENTRY(entry_INT80_32) SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ /* - * User mode is traced as though IRQs are on. Unlike the 64-bit - * case, INT80 is a trap gate on 32-bit kernels, so interrupts - * are already on (unless user code is messing around with iopl). + * User mode is traced as though IRQs are on, and the interrupt gate + * turned them off. */ + TRACE_IRQS_OFF movl %esp, %eax - call do_syscall_32_irqs_on + call do_int80_syscall_32 .Lsyscall_32_done: restore_all: diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index efe0fe3..847f2f0 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -336,7 +336,7 @@ ENTRY(entry_INT80_compat) TRACE_IRQS_OFF movq %rsp, %rdi - call do_syscall_32_irqs_off + call do_int80_syscall_32 .Lsyscall_32_done: /* Go back to user mode. */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 49e2e77..5fae2d8 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -912,7 +912,7 @@ void __init trap_init(void) #endif #ifdef CONFIG_X86_32 - set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); + set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32); set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif -- cgit v0.10.2 From 9999c8c01f34c918a57d6e5ba2f5d8b79aa04801 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 9 Mar 2016 13:24:33 -0800 Subject: x86/entry: Call enter_from_user_mode() with IRQs off MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that slow-path syscalls always enter C before enabling interrupts, it's straightforward to call enter_from_user_mode() before enabling interrupts rather than doing it as part of entry tracing. With this change, we should finally be able to retire exception_enter(). This will also enable optimizations based on knowing that we never change context tracking state with interrupts on. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frédéric Weisbecker Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/bc376ecf87921a495e874ff98139b1ca2f5c5dd7.1457558566.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index d69d1b6..e79d93d 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -45,6 +45,8 @@ __visible void enter_from_user_mode(void) CT_WARN_ON(ct_state() != CONTEXT_USER); user_exit(); } +#else +static inline void enter_from_user_mode(void) {} #endif static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) @@ -85,17 +87,6 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; -#ifdef CONFIG_CONTEXT_TRACKING - /* - * If TIF_NOHZ is set, we are required to call user_exit() before - * doing anything that could touch RCU. - */ - if (work & _TIF_NOHZ) { - enter_from_user_mode(); - work &= ~_TIF_NOHZ; - } -#endif - #ifdef CONFIG_SECCOMP /* * Do seccomp first -- it should minimize exposure of other @@ -344,6 +335,7 @@ __visible void do_syscall_64(struct pt_regs *regs) struct thread_info *ti = pt_regs_to_thread_info(regs); unsigned long nr = regs->orig_ax; + enter_from_user_mode(); local_irq_enable(); if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) @@ -366,9 +358,9 @@ __visible void do_syscall_64(struct pt_regs *regs) #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) /* - * Does a 32-bit syscall. Called with IRQs on and does all entry and - * exit work and returns with IRQs off. This function is extremely hot - * in workloads that use it, and it's usually called from + * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does + * all entry and exit work and returns with IRQs off. This function is + * extremely hot in workloads that use it, and it's usually called from * do_fast_syscall_32, so forcibly inline it to improve performance. */ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) @@ -409,6 +401,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) /* Handles int $0x80 */ __visible void do_int80_syscall_32(struct pt_regs *regs) { + enter_from_user_mode(); local_irq_enable(); do_syscall_32_irqs_on(regs); } @@ -431,12 +424,11 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) */ regs->ip = landing_pad; - /* - * Fetch EBP from where the vDSO stashed it. - * - * WARNING: We are in CONTEXT_USER and RCU isn't paying attention! - */ + enter_from_user_mode(); + local_irq_enable(); + + /* Fetch EBP from where the vDSO stashed it. */ if ( #ifdef CONFIG_X86_64 /* @@ -454,9 +446,6 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) /* User code screwed up. */ local_irq_disable(); regs->ax = -EFAULT; -#ifdef CONFIG_CONTEXT_TRACKING - enter_from_user_mode(); -#endif prepare_exit_to_usermode(regs); return 0; /* Keep it simple: use IRET. */ } diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index f2e2302..8286669 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -134,7 +134,10 @@ struct thread_info { #define _TIF_ADDR32 (1 << TIF_ADDR32) #define _TIF_X32 (1 << TIF_X32) -/* work to do in syscall_trace_enter() */ +/* + * work to do in syscall_trace_enter(). 
Also includes TIF_NOHZ for + * enter_from_user_mode() + */ #define _TIF_WORK_SYSCALL_ENTRY \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \ -- cgit v0.10.2 From 844a5fe219cf472060315971e15cbf97674a3324 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 8 Mar 2016 12:13:39 +0100 Subject: KVM: MMU: fix ept=0/pte.u=1/pte.w=0/CR0.WP=0/CR4.SMEP=1/EFER.NX=0 combo Yes, all of these are needed. :) This is admittedly a bit odd, but kvm-unit-tests access.flat tests this if you run it with "-cpu host" and of course ept=0. KVM runs the guest with CR0.WP=1, so it must handle supervisor writes specially when pte.u=1/pte.w=0/CR0.WP=0. Such writes cause a fault when U=1 and W=0 in the SPTE, but they must succeed because CR0.WP=0. When KVM gets the fault, it sets U=0 and W=1 in the shadow PTE and restarts execution. This will still cause a user write to fault, while supervisor writes will succeed. User reads will fault spuriously now, and KVM will then flip U and W again in the SPTE (U=1, W=0). User reads will be enabled and supervisor writes disabled, going back to the originary situation where supervisor writes fault spuriously. When SMEP is in effect, however, U=0 will enable kernel execution of this page. To avoid this, KVM also sets NX=1 in the shadow PTE together with U=0. If the guest has not enabled NX, the result is a continuous stream of page faults due to the NX bit being reserved. The fix is to force EFER.NX=1 even if the CPU is taking care of the EFER switch. (All machines with SMEP have the CPU_LOAD_IA32_EFER vm-entry control, so they do not use user-return notifiers for EFER---if they did, EFER.NX would be forced to the same value as the host). There is another bug in the reserved bit check, which I've split to a separate patch for easier application to stable kernels. Cc: stable@vger.kernel.org Cc: Andy Lutomirski Reviewed-by: Xiao Guangrong Fixes: f6577a5fa15d82217ca73c74cd2dcbc0f6c781dd Signed-off-by: Paolo Bonzini diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt index daf9c0f..c817310 100644 --- a/Documentation/virtual/kvm/mmu.txt +++ b/Documentation/virtual/kvm/mmu.txt @@ -358,7 +358,8 @@ In the first case there are two additional complications: - if CR4.SMEP is enabled: since we've turned the page into a kernel page, the kernel may now execute it. We handle this by also setting spte.nx. If we get a user fetch or read fault, we'll change spte.u=1 and - spte.nx=gpte.nx back. + spte.nx=gpte.nx back. For this to work, KVM forces EFER.NX to 1 when + shadow paging is in use. - if CR4.SMAP is disabled: since the page has been changed to a kernel page, it can not be reused when CR4.SMAP is enabled. We set CR4.SMAP && !CR0.WP into shadow page's role to avoid this case. Note, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6e51493..9bd8f44 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1857,26 +1857,31 @@ static void reload_tss(void) static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) { - u64 guest_efer; - u64 ignore_bits; + u64 guest_efer = vmx->vcpu.arch.efer; + u64 ignore_bits = 0; - guest_efer = vmx->vcpu.arch.efer; + if (!enable_ept) { + /* + * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing + * host CPUID is more efficient than testing guest CPUID + * or CR4. Host SMEP is anyway a requirement for guest SMEP. 
+ */ + if (boot_cpu_has(X86_FEATURE_SMEP)) + guest_efer |= EFER_NX; + else if (!(guest_efer & EFER_NX)) + ignore_bits |= EFER_NX; + } /* - * NX is emulated; LMA and LME handled by hardware; SCE meaningless - * outside long mode + * LMA and LME handled by hardware; SCE meaningless outside long mode. */ - ignore_bits = EFER_NX | EFER_SCE; + ignore_bits |= EFER_SCE; #ifdef CONFIG_X86_64 ignore_bits |= EFER_LMA | EFER_LME; /* SCE is meaningful only in long mode on Intel */ if (guest_efer & EFER_LMA) ignore_bits &= ~(u64)EFER_SCE; #endif - guest_efer &= ~ignore_bits; - guest_efer |= host_efer & ignore_bits; - vmx->guest_msrs[efer_offset].data = guest_efer; - vmx->guest_msrs[efer_offset].mask = ~ignore_bits; clear_atomic_switch_msr(vmx, MSR_EFER); @@ -1887,16 +1892,21 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) */ if (cpu_has_load_ia32_efer || (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { - guest_efer = vmx->vcpu.arch.efer; if (!(guest_efer & EFER_LMA)) guest_efer &= ~EFER_LME; if (guest_efer != host_efer) add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer); return false; - } + } else { + guest_efer &= ~ignore_bits; + guest_efer |= host_efer & ignore_bits; - return true; + vmx->guest_msrs[efer_offset].data = guest_efer; + vmx->guest_msrs[efer_offset].mask = ~ignore_bits; + + return true; + } } static unsigned long segment_base(u16 selector) -- cgit v0.10.2 From 5f0b819995e172f48fdcd91335a2126ba7d9deae Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 9 Mar 2016 14:28:02 +0100 Subject: KVM: MMU: fix reserved bit check for ept=0/CR0.WP=0/CR4.SMEP=1/EFER.NX=0 KVM has special logic to handle pages with pte.u=1 and pte.w=0 when CR0.WP=1. These pages' SPTEs flip continuously between two states: U=1/W=0 (user and supervisor reads allowed, supervisor writes not allowed) and U=0/W=1 (supervisor reads and writes allowed, user writes not allowed). When SMEP is in effect, however, U=0 will enable kernel execution of this page. To avoid this, KVM also sets NX=1 in the shadow PTE together with U=0, making the two states U=1/W=0/NX=gpte.NX and U=0/W=1/NX=1. When guest EFER has the NX bit cleared, the reserved bit check thinks that the latter state is invalid; teach it that the smep_andnot_wp case will also use the NX bit of SPTEs. Cc: stable@vger.kernel.org Reviewed-by: Xiao Guangrong Fixes: c258b62b264fdc469b6d3610a907708068145e3b Signed-off-by: Paolo Bonzini diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 95a955d..1e7a49b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3721,13 +3721,15 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { + bool uses_nx = context->nx || context->base_role.smep_andnot_wp; + /* * Passing "true" to the last argument is okay; it adds a check * on bit 8 of the SPTEs which KVM doesn't use anyway. */ __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, boot_cpu_data.x86_phys_bits, - context->shadow_root_level, context->nx, + context->shadow_root_level, uses_nx, guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), true); } -- cgit v0.10.2 From 84477336ec03f8061ffd6908da341e063e5d6d1f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 9 Mar 2016 21:56:22 +0100 Subject: x86/delay: Avoid preemptible context checks in delay_mwaitx() We do use this_cpu_ptr(&cpu_tss) as a cacheline-aligned, seldomly accessed per-cpu var as the MONITORX target in delay_mwaitx(). 
However, when called in preemptible context, this_cpu_ptr -> smp_processor_id() -> debug_smp_processor_id() fires: BUG: using smp_processor_id() in preemptible [00000000] code: udevd/312 caller is delay_mwaitx+0x40/0xa0 But we don't care about that check - we only need cpu_tss as a MONITORX target and it doesn't really matter which CPU's var we're touching as we're going idle anyway. Fix that. Suggested-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Huang Rui Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: spg_linux_kernel@amd.com Link: http://lkml.kernel.org/r/20160309205622.GG6564@pd.tnic Signed-off-by: Ingo Molnar diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index e912b2f..2f07c29 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -102,7 +102,7 @@ static void delay_mwaitx(unsigned long __loops) * Use cpu_tss as a cacheline-aligned, seldomly * accessed per-cpu variable as the monitor target. */ - __monitorx(this_cpu_ptr(&cpu_tss), 0, 0); + __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0); /* * AMD, like Intel, supports the EAX hint and EAX=0xf -- cgit v0.10.2 From 25c5e9626ca4d40928dc9c44f009ce2ed0a739e7 Mon Sep 17 00:00:00 2001 From: Ludovic Desroches Date: Thu, 10 Mar 2016 10:17:55 +0100 Subject: dmaengine: at_xdmac: fix residue computation When computing the residue we need two pieces of information: the current descriptor and the remaining data of the current descriptor. To get that information, we need to read consecutively two registers but we can't do it in an atomic way. For that reason, we have to check manually that current descriptor has not changed. Signed-off-by: Ludovic Desroches Suggested-by: Cyrille Pitchen Reported-by: David Engraf Tested-by: David Engraf Fixes: e1f7c9eee707 ("dmaengine: at_xdmac: creation of the atmel eXtended DMA Controller driver") Cc: stable@vger.kernel.org #4.1 and later Signed-off-by: Vinod Koul diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index 64f5d1b..8e304b1 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -176,6 +176,7 @@ #define AT_XDMAC_MAX_CHAN 0x20 #define AT_XDMAC_MAX_CSIZE 16 /* 16 data */ #define AT_XDMAC_MAX_DWIDTH 8 /* 64 bits */ +#define AT_XDMAC_RESIDUE_MAX_RETRIES 5 #define AT_XDMAC_DMA_BUSWIDTHS\ (BIT(DMA_SLAVE_BUSWIDTH_UNDEFINED) |\ @@ -1395,8 +1396,8 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, struct at_xdmac_desc *desc, *_desc; struct list_head *descs_list; enum dma_status ret; - int residue; - u32 cur_nda, mask, value; + int residue, retry; + u32 cur_nda, check_nda, cur_ubc, mask, value; u8 dwidth = 0; unsigned long flags; @@ -1433,7 +1434,42 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, cpu_relax(); } + /* + * When processing the residue, we need to read two registers but we + * can't do it in an atomic way. AT_XDMAC_CNDA is used to find where + * we stand in the descriptor list and AT_XDMAC_CUBC is used + * to know how many data are remaining for the current descriptor. + * Since the dma channel is not paused to not loose data, between the + * AT_XDMAC_CNDA and AT_XDMAC_CUBC read, we may have change of + * descriptor. + * For that reason, after reading AT_XDMAC_CUBC, we check if we are + * still using the same descriptor by reading a second time + * AT_XDMAC_CNDA. If AT_XDMAC_CNDA has changed, it means we have to + * read again AT_XDMAC_CUBC. + * Memory barriers are used to ensure the read order of the registers. 
+ * A max number of retries is set because unlikely it can never ends if + * we are transferring a lot of data with small buffers. + */ cur_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc; + rmb(); + cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC); + for (retry = 0; retry < AT_XDMAC_RESIDUE_MAX_RETRIES; retry++) { + rmb(); + check_nda = at_xdmac_chan_read(atchan, AT_XDMAC_CNDA) & 0xfffffffc; + + if (likely(cur_nda == check_nda)) + break; + + cur_nda = check_nda; + rmb(); + cur_ubc = at_xdmac_chan_read(atchan, AT_XDMAC_CUBC); + } + + if (unlikely(retry >= AT_XDMAC_RESIDUE_MAX_RETRIES)) { + ret = DMA_ERROR; + goto spin_unlock; + } + /* * Remove size of all microblocks already transferred and the current * one. Then add the remaining size to transfer of the current @@ -1446,7 +1482,7 @@ at_xdmac_tx_status(struct dma_chan *chan, dma_cookie_t cookie, if ((desc->lld.mbr_nda & 0xfffffffc) == cur_nda) break; } - residue += at_xdmac_chan_read(atchan, AT_XDMAC_CUBC) << dwidth; + residue += cur_ubc << dwidth; dma_set_residue(txstate, residue); -- cgit v0.10.2 From 52b2a05fa7c8cfceebb59117a95decd68cf7e465 Mon Sep 17 00:00:00 2001 From: Quan Nguyen Date: Thu, 3 Mar 2016 21:56:52 +0700 Subject: genirq: Export IRQ functions for module use Export irq_chip_*_parent(), irq_domain_create_hierarchy(), irq_domain_set_hwirq_and_chip(), irq_domain_reset_irq_data(), irq_domain_alloc/free_irqs_parent() So gpio drivers can be built as modules. First user: gpio-xgene-sb Signed-off-by: Quan Nguyen Acked-by: Linus Walleij Cc: Phong Vo Cc: Marc Zyngier Cc: patches@apm.com Cc: Loc Ho Cc: Keyur Chudgar Cc: Jiang Liu Link: https://lists.01.org/pipermail/kbuild-all/2016-February/017914.html Link: http://lkml.kernel.org/r/1457017012-10628-1-git-send-email-qnguyen@apm.com Signed-off-by: Thomas Gleixner diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 5797909..2f9f2b0 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -961,6 +961,7 @@ void irq_chip_mask_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_mask(data); } +EXPORT_SYMBOL_GPL(irq_chip_mask_parent); /** * irq_chip_unmask_parent - Unmask the parent interrupt @@ -971,6 +972,7 @@ void irq_chip_unmask_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_unmask(data); } +EXPORT_SYMBOL_GPL(irq_chip_unmask_parent); /** * irq_chip_eoi_parent - Invoke EOI on the parent interrupt @@ -981,6 +983,7 @@ void irq_chip_eoi_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_eoi(data); } +EXPORT_SYMBOL_GPL(irq_chip_eoi_parent); /** * irq_chip_set_affinity_parent - Set affinity on the parent interrupt @@ -1016,6 +1019,7 @@ int irq_chip_set_type_parent(struct irq_data *data, unsigned int type) return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_chip_set_type_parent); /** * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8681154..3a519a0 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -893,6 +893,7 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, return domain; } +EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy); static void irq_domain_insert_irq(int virq) { @@ -1043,6 +1044,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, return 0; } +EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip); /** * irq_domain_set_info - Set the complete data for a @virq in @domain @@ -1076,6 +1078,7 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data) 
irq_data->chip = &no_irq_chip; irq_data->chip_data = NULL; } +EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); /** * irq_domain_free_irqs_common - Clear irq_data and free the parent @@ -1273,6 +1276,7 @@ int irq_domain_alloc_irqs_parent(struct irq_domain *domain, nr_irqs, arg); return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); /** * irq_domain_free_irqs_parent - Free interrupts from parent domain @@ -1290,6 +1294,7 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain, irq_domain_free_irqs_recursive(domain->parent, irq_base, nr_irqs); } +EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); /** * irq_domain_activate_irq - Call domain_ops->activate recursively to activate -- cgit v0.10.2 From cfe199afefe6201e998ddc07102fc1fdb55f196c Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Wed, 9 Mar 2016 03:21:29 +0200 Subject: irqchip/sunxi-nmi: Fix error check of of_io_request_and_map() The of_io_request_and_map() returns a valid pointer in iomem region or ERR_PTR(), check for NULL always fails and may cause a NULL pointer dereference on error path. Fixes: 0e841b04c829 ("irqchip/sunxi-nmi: Switch to of_io_request_and_map() from of_iomap()") Signed-off-by: Vladimir Zapolskiy Cc: Jason Cooper Cc: Marc Zyngier Cc: Chen-Yu Tsai Cc: Maxime Ripard Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1457486489-10189-1-git-send-email-vz@mleia.com Signed-off-by: Thomas Gleixner diff --git a/drivers/irqchip/irq-sunxi-nmi.c b/drivers/irqchip/irq-sunxi-nmi.c index 0820f67..668730c 100644 --- a/drivers/irqchip/irq-sunxi-nmi.c +++ b/drivers/irqchip/irq-sunxi-nmi.c @@ -160,9 +160,9 @@ static int __init sunxi_sc_nmi_irq_init(struct device_node *node, gc = irq_get_domain_generic_chip(domain, 0); gc->reg_base = of_io_request_and_map(node, 0, of_node_full_name(node)); - if (!gc->reg_base) { + if (IS_ERR(gc->reg_base)) { pr_err("unable to map resource\n"); - ret = -ENOMEM; + ret = PTR_ERR(gc->reg_base); goto fail_irqd_remove; } -- cgit v0.10.2 From edf8fcdc6b254236be005851af35ea5e826e7e09 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Wed, 9 Mar 2016 03:21:40 +0200 Subject: irqchip/mxs: Fix error check of of_io_request_and_map() The of_io_request_and_map() returns a valid pointer in iomem region or ERR_PTR(), check for NULL always fails and may cause a NULL pointer dereference on error path. Fixes: 25e34b44313b ("irqchip/mxs: Prepare driver for hardware with different offsets") Signed-off-by: Vladimir Zapolskiy Cc: Jason Cooper Cc: Marc Zyngier Cc: Oleksij Rempel Cc: Sascha Hauer Cc: Shawn Guo Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/1457486500-10237-1-git-send-email-vz@mleia.com Signed-off-by: Thomas Gleixner diff --git a/drivers/irqchip/irq-mxs.c b/drivers/irqchip/irq-mxs.c index efe5084..1730470 100644 --- a/drivers/irqchip/irq-mxs.c +++ b/drivers/irqchip/irq-mxs.c @@ -183,7 +183,7 @@ static void __iomem * __init icoll_init_iobase(struct device_node *np) void __iomem *icoll_base; icoll_base = of_io_request_and_map(np, 0, np->name); - if (!icoll_base) + if (IS_ERR(icoll_base)) panic("%s: unable to map resource", np->full_name); return icoll_base; } -- cgit v0.10.2 From eb1af3b71f9d83e45f2fd2fd649356e98e1c582c Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Wed, 9 Mar 2016 16:40:48 -0800 Subject: EDAC/sb_edac: Fix computation of channel address Large memory Haswell-EX systems with multiple DIMMs per channel were sometimes reporting the wrong DIMM. 
Found three problems: 1) Debug printouts for socket and channel interleave were not interpreting the register fields correctly. The socket interleave field is a 2^X value (0=1, 1=2, 2=4, 3=8). The channel interleave is X+1 (0=1, 1=2, 2=3, 3=4). 2) Actual use of the socket interleave value didn't interpret it as 2^X. 3) Conversion of address to channel address was complicated, and wrong. Signed-off-by: Tony Luck Acked-by: Aristeu Rozanski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-edac@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index e438ee5..34e54a3 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -1839,8 +1839,8 @@ static void get_memory_layout(const struct mem_ctl_info *mci) edac_dbg(0, "TAD#%d: up to %u.%03u GB (0x%016Lx), socket interleave %d, memory interleave %d, TGT: %d, %d, %d, %d, reg=0x%08x\n", n_tads, gb, (mb*1000)/1024, ((u64)tmp_mb) << 20L, - (u32)TAD_SOCK(reg), - (u32)TAD_CH(reg), + (u32)(1 << TAD_SOCK(reg)), + (u32)TAD_CH(reg) + 1, (u32)TAD_TGT0(reg), (u32)TAD_TGT1(reg), (u32)TAD_TGT2(reg), @@ -2118,7 +2118,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci, } ch_way = TAD_CH(reg) + 1; - sck_way = TAD_SOCK(reg) + 1; + sck_way = 1 << TAD_SOCK(reg); if (ch_way == 3) idx = addr >> 6; @@ -2175,7 +2175,7 @@ static int get_memory_error_data(struct mem_ctl_info *mci, n_tads, addr, limit, - (u32)TAD_SOCK(reg), + sck_way, ch_way, offset, idx, @@ -2190,18 +2190,12 @@ static int get_memory_error_data(struct mem_ctl_info *mci, offset, addr); return -EINVAL; } - addr -= offset; - /* Store the low bits [0:6] of the addr */ - ch_addr = addr & 0x7f; - /* Remove socket wayness and remove 6 bits */ - addr >>= 6; - addr = div_u64(addr, sck_xch); -#if 0 - /* Divide by channel way */ - addr = addr / ch_way; -#endif - /* Recover the last 6 bits */ - ch_addr |= addr << 6; + + ch_addr = addr - offset; + ch_addr >>= (6 + shiftup); + ch_addr /= ch_way * sck_way; + ch_addr <<= (6 + shiftup); + ch_addr |= addr & ((1 << (6 + shiftup)) - 1); /* * Step 3) Decode rank -- cgit v0.10.2 From 10ee73865e9e4705ba8b3f4d6149e8e68d902bb7 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Thu, 10 Mar 2016 20:19:58 +0800 Subject: x86/entry/traps: Show unhandled signal for i386 in do_trap() Commit abd4f7505baf ("x86: i386-show-unhandled-signals-v3") turned on the show-unhandled-signals behaviour for i386 for some exception handlers, but left do_trap() out for no clear reason (my naive guess is that turning it on for do_trap() would be too noisy, since do_trap() is shared by several exceptions). The same commit also made "show_unhandled_signals" a debug tunable (in /proc/sys/debug/exception-trace), which x86 turns on by default. So it is confusing for i386 users who turn it on manually and expect to see the unhandled-signal output in the log, but get nothing. This patch turns it on for i386 in do_trap() as well. Signed-off-by: Jianyu Zhan Reviewed-by: Jan Beulich Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@suse.de Cc: dave.hansen@linux.intel.com Cc: heukelum@fastmail.fm Cc: jbeulich@novell.com Cc: jdike@addtoit.com Cc: joe@perches.com Cc: luto@kernel.org Link: http://lkml.kernel.org/r/1457612398-4568-1-git-send-email-nasa4836@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 5fae2d8..1baf081 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -248,7 +248,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; -#ifdef CONFIG_X86_64 if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) { pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", @@ -257,7 +256,6 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, print_vma_addr(" in ", regs->ip); pr_cont("\n"); } -#endif force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk); } -- cgit v0.10.2 From b2cd27448b33de9069d580d8f229efef434b64e6 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sat, 5 Mar 2016 07:13:39 -0300 Subject: [media] media-device: map new functions into old types for legacy API The legacy media controller userspace API exposes entity types that carry both type and function information. The new API replaces the type with a function. It preserves backward compatibility by defining legacy functions for the existing types and using them in drivers. This works fine, as long as newer entity functions won't be added. Unfortunately, some tools, like media-ctl with --print-dot argument rely on the now legacy MEDIA_ENT_T_V4L2_SUBDEV and MEDIA_ENT_T_DEVNODE numeric ranges to identify what entities will be shown. Also, if the entity doesn't match those ranges, it will ignore the major/minor information on devnodes, and won't be getting the devnode name via udev or sysfs. As we're now adding devices outside the old range, the legacy ioctl needs to map the new entity functions into a type at the old range, or otherwise we'll have a regression. Detected on all released media-ctl versions (e. g. versions <= 1.10). Fix this by deriving the type from the function to emulate the legacy API if the function isn't in the legacy functions range. Reported-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab diff --git a/drivers/media/media-device.c b/drivers/media/media-device.c index 7dae0ac..e9219f5 100644 --- a/drivers/media/media-device.c +++ b/drivers/media/media-device.c @@ -20,6 +20,9 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +/* We need to access legacy defines from linux/media.h */ +#define __NEED_MEDIA_LEGACY_API + #include #include #include @@ -115,6 +118,26 @@ static long media_device_enum_entities(struct media_device *mdev, u_ent.group_id = 0; /* Unused */ u_ent.pads = ent->num_pads; u_ent.links = ent->num_links - ent->num_backlinks; + + /* + * Workaround for a bug at media-ctl <= v1.10 that makes it to + * do the wrong thing if the entity function doesn't belong to + * either MEDIA_ENT_F_OLD_BASE or MEDIA_ENT_F_OLD_SUBDEV_BASE + * Ranges. + * + * Non-subdevices are expected to be at the MEDIA_ENT_F_OLD_BASE, + * or, otherwise, will be silently ignored by media-ctl when + * printing the graphviz diagram. So, map them into the devnode + * old range. 
+ */ + if (ent->function < MEDIA_ENT_F_OLD_BASE || + ent->function > MEDIA_ENT_T_DEVNODE_UNKNOWN) { + if (is_media_entity_v4l2_subdev(ent)) + u_ent.type = MEDIA_ENT_F_V4L2_SUBDEV_UNKNOWN; + else if (ent->function != MEDIA_ENT_F_IO_V4L) + u_ent.type = MEDIA_ENT_T_DEVNODE_UNKNOWN; + } + memcpy(&u_ent.raw, &ent->info, sizeof(ent->info)); if (copy_to_user(uent, &u_ent, sizeof(u_ent))) return -EFAULT; diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h index 625b38f..a8e3a8c 100644 --- a/include/uapi/linux/media.h +++ b/include/uapi/linux/media.h @@ -120,7 +120,7 @@ struct media_device_info { #define MEDIA_ENT_F_V4L2_SUBDEV_UNKNOWN MEDIA_ENT_F_OLD_SUBDEV_BASE -#ifndef __KERNEL__ +#if !defined(__KERNEL__) || defined(__NEED_MEDIA_LEGACY_API) /* * Legacy symbols used to avoid userspace compilation breakages @@ -133,6 +133,10 @@ struct media_device_info { #define MEDIA_ENT_TYPE_MASK 0x00ff0000 #define MEDIA_ENT_SUBTYPE_MASK 0x0000ffff +/* End of the old subdev reserved numberspace */ +#define MEDIA_ENT_T_DEVNODE_UNKNOWN (MEDIA_ENT_T_DEVNODE | \ + MEDIA_ENT_SUBTYPE_MASK) + #define MEDIA_ENT_T_DEVNODE MEDIA_ENT_F_OLD_BASE #define MEDIA_ENT_T_DEVNODE_V4L MEDIA_ENT_F_IO_V4L #define MEDIA_ENT_T_DEVNODE_FB (MEDIA_ENT_T_DEVNODE + 2) -- cgit v0.10.2 From 9df4f913eb39046fa57c4217bb3429a63e164000 Mon Sep 17 00:00:00 2001 From: Boris BREZILLON Date: Wed, 9 Mar 2016 21:57:24 +0100 Subject: MAINTAINERS: add a maintainer for the NAND subsystem Add myself as the maintainer of the NAND subsystem. Signed-off-by: Boris Brezillon Acked-by: Richard Weinberger Acked-by: Brian Norris Acked-by: Artem Bityutskiy Signed-off-by: Brian Norris diff --git a/MAINTAINERS b/MAINTAINERS index 0db5eb9..d2424ea 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7399,6 +7399,17 @@ W: https://www.myricom.com/support/downloads/myri10ge.html S: Supported F: drivers/net/ethernet/myricom/myri10ge/ +NAND FLASH SUBSYSTEM +M: Boris Brezillon +R: Richard Weinberger +L: linux-mtd@lists.infradead.org +W: http://www.linux-mtd.infradead.org/ +Q: http://patchwork.ozlabs.org/project/linux-mtd/list/ +T: git git://github.com/linux-nand/linux.git +S: Maintained +F: drivers/mtd/nand/ +F: include/linux/mtd/nand*.h + NATSEMI ETHERNET DRIVER (DP8381x) S: Orphan F: drivers/net/ethernet/natsemi/natsemi.c -- cgit v0.10.2 From 9eb42dee2b11635174c74a7996934b6ca18f2179 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 26 Feb 2016 18:13:28 -0500 Subject: tools lib traceevent: Add '~' operation within arg_num_eval() When evaluating values for print flags, if the value included a '~' operator, the parsing would fail. 
This broke kmalloc's parsing of: __print_flags(REC->gfp_flags, "|", {(unsigned long)((((((( gfp_t)(0x400000u|0x2000000u)) | (( gfp_t)0x40u) | (( gfp_t)0x80u) | (( gfp_t)0x20000u)) | (( gfp_t)0x02u)) | (( gfp_t)0x08u)) | (( gfp_t)0x4000u) | (( gfp_t)0x10000u) | (( gfp_t)0x1000u) | (( gfp_t)0x200u)) & ~(( gfp_t)0x2000000u)) ^ | here Signed-off-by: Steven Rostedt Reported-by: Arnaldo Carvalho de Melo Tested-by: David Ahern Cc: Andi Kleen Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/20160226181328.22f47129@gandalf.local.home Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index 865dea5..190cc88 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -2398,6 +2398,12 @@ static int arg_num_eval(struct print_arg *arg, long long *val) break; *val = left + right; break; + case '~': + ret = arg_num_eval(arg->op.right, &right); + if (!ret) + break; + *val = ~right; + break; default: do_warning("unknown op '%s'", arg->op.op); ret = 0; -- cgit v0.10.2 From e12b202f8fb9b62a3997cad8e93401f85293390c Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 10 Mar 2016 17:41:13 +0100 Subject: perf jitdump: Build only on supported archs Build jitdump only on architectures defined in util/genelf.h file, to avoid breaking the build on such arches. Signed-off-by: Jiri Olsa Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Borislav Petkov Cc: Colin Ian King Cc: David Ahern Cc: Davidlohr Bueso Cc: He Kuang Cc: Mel Gorman Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Steven Rostedt Cc: Wang Nan Link: http://lkml.kernel.org/r/20160310164113.GA11357@krava.redhat.com Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/arch/arm/Makefile b/tools/perf/arch/arm/Makefile index 7fbca17..18b1351 100644 --- a/tools/perf/arch/arm/Makefile +++ b/tools/perf/arch/arm/Makefile @@ -1,3 +1,4 @@ ifndef NO_DWARF PERF_HAVE_DWARF_REGS := 1 endif +PERF_HAVE_JITDUMP := 1 diff --git a/tools/perf/arch/arm64/Makefile b/tools/perf/arch/arm64/Makefile index 7fbca17..18b1351 100644 --- a/tools/perf/arch/arm64/Makefile +++ b/tools/perf/arch/arm64/Makefile @@ -1,3 +1,4 @@ ifndef NO_DWARF PERF_HAVE_DWARF_REGS := 1 endif +PERF_HAVE_JITDUMP := 1 diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile index 9f9cea3..56e05f1 100644 --- a/tools/perf/arch/powerpc/Makefile +++ b/tools/perf/arch/powerpc/Makefile @@ -3,3 +3,4 @@ PERF_HAVE_DWARF_REGS := 1 endif HAVE_KVM_STAT_SUPPORT := 1 +PERF_HAVE_JITDUMP := 1 diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile index 09ba923..269af21 100644 --- a/tools/perf/arch/x86/Makefile +++ b/tools/perf/arch/x86/Makefile @@ -3,3 +3,4 @@ PERF_HAVE_DWARF_REGS := 1 endif HAVE_KVM_STAT_SUPPORT := 1 PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1 +PERF_HAVE_JITDUMP := 1 diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index e219ed4..7fa6866 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -73,7 +73,7 @@ static int perf_event__repipe_oe_synth(struct perf_tool *tool, return perf_event__repipe_synth(tool, event); } -#if defined(HAVE_LIBELF_SUPPORT) && defined(HAVE_DWARF_SUPPORT) +#ifdef HAVE_JITDUMP static int perf_event__drop_oe(struct perf_tool *tool __maybe_unused, union perf_event *event __maybe_unused, struct ordered_events *oe __maybe_unused) @@ -245,7 +245,7 @@ static int perf_event__repipe_mmap(struct perf_tool 
*tool, return err; } -#if defined(HAVE_LIBELF_SUPPORT) && defined(HAVE_DWARF_SUPPORT) +#ifdef HAVE_JITDUMP static int perf_event__jit_repipe_mmap(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, @@ -283,7 +283,7 @@ static int perf_event__repipe_mmap2(struct perf_tool *tool, return err; } -#if defined(HAVE_LIBELF_SUPPORT) && defined(HAVE_DWARF_SUPPORT) +#ifdef HAVE_JITDUMP static int perf_event__jit_repipe_mmap2(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, @@ -778,7 +778,9 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused) OPT_BOOLEAN('s', "sched-stat", &inject.sched_stat, "Merge sched-stat and sched-switch for getting events " "where and how long tasks slept"), +#ifdef HAVE_JITDUMP OPT_BOOLEAN('j', "jit", &inject.jit_mode, "merge jitdump files into perf.data file"), +#endif OPT_INCR('v', "verbose", &verbose, "be more verbose (show build ids, etc)"), OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, "file", @@ -795,7 +797,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused) "perf inject []", NULL }; -#if !defined(HAVE_LIBELF_SUPPORT) || !defined(HAVE_DWARF_SUPPORT) +#ifndef HAVE_JITDUMP set_option_nobuild(options, 'j', "jit", "NO_LIBELF=1", true); #endif argc = parse_options(argc, argv, options, inject_usage, 0); @@ -833,7 +835,7 @@ int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused) inject.tool.ordered_events = true; inject.tool.ordering_requires_timestamps = true; } -#if defined(HAVE_LIBELF_SUPPORT) && defined(HAVE_DWARF_SUPPORT) +#ifdef HAVE_JITDUMP if (inject.jit_mode) { inject.tool.mmap2 = perf_event__jit_repipe_mmap2; inject.tool.mmap = perf_event__jit_repipe_mmap; diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index f7aeaf3..eca6a91 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -328,6 +328,13 @@ ifndef NO_LIBELF endif # NO_LIBBPF endif # NO_LIBELF +ifdef PERF_HAVE_JITDUMP + ifndef NO_DWARF + $(call detected,CONFIG_JITDUMP) + CFLAGS += -DHAVE_JITDUMP + endif +endif + ifeq ($(ARCH),powerpc) ifndef NO_DWARF CFLAGS += -DHAVE_SKIP_CALLCHAIN_IDX diff --git a/tools/perf/util/Build b/tools/perf/util/Build index f130ce2..eea25e2 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -108,7 +108,7 @@ libperf-$(CONFIG_ZLIB) += zlib.o libperf-$(CONFIG_LZMA) += lzma.o libperf-y += demangle-java.o -ifdef CONFIG_DWARF +ifdef CONFIG_JITDUMP libperf-$(CONFIG_LIBELF) += jitdump.o libperf-$(CONFIG_LIBELF) += genelf.o libperf-$(CONFIG_LIBELF) += genelf_debug.o -- cgit v0.10.2 From f4954cfb1cda4cf0abf36d23213c702e94666c3f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 9 Mar 2016 22:46:56 +0900 Subject: perf tools: Fix hist_entry__filter() for hierarchy When hierarchy mode is enabled each output format is in a separate hpp list. So when applying a filter it should check all formats in the list. Currently it only checks a single ->fmt field which was not set properly. 
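The rule the fix implements can be seen in isolation: walk every format attached to the entry, skip the formats that cannot filter on the requested type, and report -1 only when no format applied at all, otherwise OR the individual results together. Below is a minimal standalone sketch of that aggregation, with hypothetical callbacks standing in for perf's se_filter methods; it is not the perf code itself.

#include <stdio.h>

/* Return -1 if this format cannot filter on @type, 0 to keep, 1 to filter out. */
typedef int (*filter_fn)(int type, const void *arg);

static int by_thread(int type, const void *arg)
{
	if (type != 0)			/* only handles a thread-style filter */
		return -1;
	return *(const int *)arg == 42 ? 0 : 1;
}

static int by_dso(int type, const void *arg)
{
	(void)arg;
	return type == 1 ? 0 : -1;	/* keeps everything it understands */
}

/* Mirrors the loop added to hist_entry__filter(): -1 only if nothing applied. */
static int entry_filter(filter_fn *fmts, int nr, int type, const void *arg)
{
	int ret = -1;

	for (int i = 0; i < nr; i++) {
		int r = fmts[i](type, arg);

		if (r < 0)
			continue;	/* this sort key does not apply, skip it */
		if (ret < 0)
			ret = 0;	/* at least one key applied */
		ret |= r;		/* filtered out if any applicable key says so */
	}
	return ret;
}

int main(void)
{
	filter_fn fmts[] = { by_thread, by_dso };
	int tid = 42;

	printf("thread filter -> %d\n", entry_filter(fmts, 2, 0, &tid));	/* 0: kept */
	printf("dso filter    -> %d\n", entry_filter(fmts, 2, 1, NULL));	/* 0: kept */
	printf("symbol filter -> %d\n", entry_filter(fmts, 2, 2, NULL));	/* -1: no key applies */
	return 0;
}

Compiled on its own, the sketch keeps an entry as long as every applicable key keeps it, and reports "no opinion" only when none of the keys can filter on the requested type.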
Signed-off-by: Namhyung Kim Tested-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1457531222-18130-2-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 59a101e..8a49a07 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1602,16 +1602,30 @@ int hist_entry__filter(struct hist_entry *he, int type, const void *arg) { struct perf_hpp_fmt *fmt; struct hpp_sort_entry *hse; + int ret = -1; + int r; - fmt = he->fmt; - if (fmt == NULL || !perf_hpp__is_sort_entry(fmt)) - return -1; + perf_hpp_list__for_each_format(he->hpp_list, fmt) { + if (!perf_hpp__is_sort_entry(fmt)) + continue; - hse = container_of(fmt, struct hpp_sort_entry, hpp); - if (hse->se->se_filter == NULL) - return -1; + hse = container_of(fmt, struct hpp_sort_entry, hpp); + if (hse->se->se_filter == NULL) + continue; - return hse->se->se_filter(he, type, arg); + /* + * hist entry is filtered if any of sort key in the hpp list + * is applied. But it should skip non-matched filter types. + */ + r = hse->se->se_filter(he, type, arg); + if (r >= 0) { + if (ret < 0) + ret = 0; + ret |= r; + } + } + + return ret; } static int __sort_dimension__add_hpp_sort(struct sort_dimension *sd, -- cgit v0.10.2 From 4945cf2aa1ed61994c158f22f26ea6101059a8d4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 9 Mar 2016 22:46:57 +0900 Subject: perf tools: Add more sort entry check functions Those functions are for checkinf if a given perf_hpp_fmt is a filter-related sort entry. With hierarchy mode, it needs to check filters on the hist entries with its own hpp format list. Signed-off-by: Namhyung Kim Tested-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1457531222-18130-3-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 2cb017f..6870a1b 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -318,6 +318,10 @@ bool perf_hpp__defined_dynamic_entry(struct perf_hpp_fmt *fmt, struct hists *his bool perf_hpp__is_trace_entry(struct perf_hpp_fmt *fmt); bool perf_hpp__is_srcline_entry(struct perf_hpp_fmt *fmt); bool perf_hpp__is_srcfile_entry(struct perf_hpp_fmt *fmt); +bool perf_hpp__is_thread_entry(struct perf_hpp_fmt *fmt); +bool perf_hpp__is_comm_entry(struct perf_hpp_fmt *fmt); +bool perf_hpp__is_dso_entry(struct perf_hpp_fmt *fmt); +bool perf_hpp__is_sym_entry(struct perf_hpp_fmt *fmt); struct perf_hpp_fmt *perf_hpp_fmt__dup(struct perf_hpp_fmt *fmt); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 8a49a07..61c7402 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1488,38 +1488,26 @@ bool perf_hpp__is_sort_entry(struct perf_hpp_fmt *format) return format->header == __sort__hpp_header; } -bool perf_hpp__is_trace_entry(struct perf_hpp_fmt *fmt) -{ - struct hpp_sort_entry *hse; +#define MK_SORT_ENTRY_CHK(key) \ +bool perf_hpp__is_ ## key ## _entry(struct perf_hpp_fmt *fmt) \ +{ \ + struct hpp_sort_entry *hse; \ + \ + if (!perf_hpp__is_sort_entry(fmt)) \ + return false; \ + \ + hse = container_of(fmt, struct hpp_sort_entry, hpp); \ + return hse->se == &sort_ ## key ; \ +} + +MK_SORT_ENTRY_CHK(trace) +MK_SORT_ENTRY_CHK(srcline) +MK_SORT_ENTRY_CHK(srcfile) +MK_SORT_ENTRY_CHK(thread) +MK_SORT_ENTRY_CHK(comm) +MK_SORT_ENTRY_CHK(dso) +MK_SORT_ENTRY_CHK(sym) 
- if (!perf_hpp__is_sort_entry(fmt)) - return false; - - hse = container_of(fmt, struct hpp_sort_entry, hpp); - return hse->se == &sort_trace; -} - -bool perf_hpp__is_srcline_entry(struct perf_hpp_fmt *fmt) -{ - struct hpp_sort_entry *hse; - - if (!perf_hpp__is_sort_entry(fmt)) - return false; - - hse = container_of(fmt, struct hpp_sort_entry, hpp); - return hse->se == &sort_srcline; -} - -bool perf_hpp__is_srcfile_entry(struct perf_hpp_fmt *fmt) -{ - struct hpp_sort_entry *hse; - - if (!perf_hpp__is_sort_entry(fmt)) - return false; - - hse = container_of(fmt, struct hpp_sort_entry, hpp); - return hse->se == &sort_srcfile; -} static bool __sort__hpp_equal(struct perf_hpp_fmt *a, struct perf_hpp_fmt *b) { -- cgit v0.10.2 From aec13a7ec78d9322a348fb26940097b0bdfef1bd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 9 Mar 2016 22:46:58 +0900 Subject: perf tools: Fix command line filters in hierarchy mode When a command-line filter is applied in hierarchy mode, output is broken especially when filtering on lower level. The higher level entries doesn't show up so it's hard to see the results. Also it needs to handle multi sort keys in a single hierarchy level. Before: $ perf report --hierarchy -s 'cpu,{dso,comm}' --comms swapper --stdio ... # Overhead CPU / Shared Object+Command # ........... ........................... # 13.79% [kernel.vmlinux] swapper 31.71% 000 13.80% [kernel.vmlinux] swapper 0.43% [e1000e] swapper 11.89% [kernel.vmlinux] swapper 9.18% [kernel.vmlinux] swapper After: # Overhead CPU / Shared Object+Command # ........... ............................... # 33.09% 003 13.79% [kernel.vmlinux] swapper 31.71% 000 13.80% [kernel.vmlinux] swapper 0.43% [e1000e] swapper 21.90% 002 11.89% [kernel.vmlinux] swapper 13.30% 001 9.18% [kernel.vmlinux] swapper Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Tested-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1457531222-18130-4-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 29da9e0..a98f934 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1087,10 +1087,103 @@ int hist_entry__snprintf_alignment(struct hist_entry *he, struct perf_hpp *hpp, */ static void hists__apply_filters(struct hists *hists, struct hist_entry *he); +static void hists__remove_entry_filter(struct hists *hists, struct hist_entry *he, + enum hist_filter type); + +typedef bool (*fmt_chk_fn)(struct perf_hpp_fmt *fmt); + +static bool check_thread_entry(struct perf_hpp_fmt *fmt) +{ + return perf_hpp__is_thread_entry(fmt) || perf_hpp__is_comm_entry(fmt); +} + +static void hist_entry__check_and_remove_filter(struct hist_entry *he, + enum hist_filter type, + fmt_chk_fn check) +{ + struct perf_hpp_fmt *fmt; + bool type_match = false; + struct hist_entry *parent = he->parent_he; + + switch (type) { + case HIST_FILTER__THREAD: + if (symbol_conf.comm_list == NULL && + symbol_conf.pid_list == NULL && + symbol_conf.tid_list == NULL) + return; + break; + case HIST_FILTER__DSO: + if (symbol_conf.dso_list == NULL) + return; + break; + case HIST_FILTER__SYMBOL: + if (symbol_conf.sym_list == NULL) + return; + break; + case HIST_FILTER__PARENT: + case HIST_FILTER__GUEST: + case HIST_FILTER__HOST: + case HIST_FILTER__SOCKET: + default: + return; + } + + /* if it's filtered by own fmt, it has to have filter bits */ + perf_hpp_list__for_each_format(he->hpp_list, fmt) { + if 
(check(fmt)) { + type_match = true; + break; + } + } + + if (type_match) { + /* + * If the filter is for current level entry, propagate + * filter marker to parents. The marker bit was + * already set by default so it only needs to clear + * non-filtered entries. + */ + if (!(he->filtered & (1 << type))) { + while (parent) { + parent->filtered &= ~(1 << type); + parent = parent->parent_he; + } + } + } else { + /* + * If current entry doesn't have matching formats, set + * filter marker for upper level entries. it will be + * cleared if its lower level entries is not filtered. + * + * For lower-level entries, it inherits parent's + * filter bit so that lower level entries of a + * non-filtered entry won't set the filter marker. + */ + if (parent == NULL) + he->filtered |= (1 << type); + else + he->filtered |= (parent->filtered & (1 << type)); + } +} + +static void hist_entry__apply_hierarchy_filters(struct hist_entry *he) +{ + hist_entry__check_and_remove_filter(he, HIST_FILTER__THREAD, + check_thread_entry); + + hist_entry__check_and_remove_filter(he, HIST_FILTER__DSO, + perf_hpp__is_dso_entry); + + hist_entry__check_and_remove_filter(he, HIST_FILTER__SYMBOL, + perf_hpp__is_sym_entry); + + hists__apply_filters(he->hists, he); +} static struct hist_entry *hierarchy_insert_entry(struct hists *hists, struct rb_root *root, struct hist_entry *he, + struct hist_entry *parent_he, struct perf_hpp_list *hpp_list) { struct rb_node **p = &root->rb_node; @@ -1125,11 +1218,13 @@ static struct hist_entry *hierarchy_insert_entry(struct hists *hists, if (new == NULL) return NULL; - hists__apply_filters(hists, new); hists->nr_entries++; /* save related format list for output */ new->hpp_list = hpp_list; + new->parent_he = parent_he; + + hist_entry__apply_hierarchy_filters(new); /* some fields are now passed to 'new' */ perf_hpp_list__for_each_sort_list(hpp_list, fmt) { @@ -1170,14 +1265,13 @@ static int hists__hierarchy_insert_entry(struct hists *hists, continue; /* insert copy of 'he' for each fmt into the hierarchy */ - new_he = hierarchy_insert_entry(hists, root, he, &node->hpp); + new_he = hierarchy_insert_entry(hists, root, he, parent, &node->hpp); if (new_he == NULL) { ret = -1; break; } root = &new_he->hroot_in; - new_he->parent_he = parent; new_he->depth = depth++; parent = new_he; } -- cgit v0.10.2 From a515d8ff7085d5e9fde867f2048b8da36b95dc51 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 9 Mar 2016 22:46:59 +0900 Subject: perf tools: Remove hist_entry->fmt field It's not used anymore and the output format is accessed by the hpp_list pointer instead when hierarchy is enabled. Let's get rid of it. 
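The propagation rule added to hists.c above is easier to see on a toy tree: a level that actually evaluates the filter and survives clears the marker on all of its ancestors, while a level with no matching sort key simply inherits its parent's verdict. The following is a simplified sketch with a single filter bit and plain parent pointers, processed top-down; the names are assumptions for illustration, not the perf data structures.

#include <stdio.h>
#include <stdbool.h>

struct entry {
	struct entry *parent;
	bool has_matching_key;	/* does this level carry the filtered sort key? */
	bool survives_filter;	/* filter result at this level, if it applies */
	bool filtered;		/* marker bit, set pessimistically up front */
};

static void apply_level_filter(struct entry *he)
{
	struct entry *parent = he->parent;

	he->filtered = true;			/* default: assume filtered */

	if (he->has_matching_key) {
		if (he->survives_filter) {
			he->filtered = false;
			/* a surviving child un-filters the whole ancestor chain */
			while (parent) {
				parent->filtered = false;
				parent = parent->parent;
			}
		}
	} else {
		/* no key at this level: inherit the parent's verdict */
		he->filtered = parent ? parent->filtered : true;
	}
}

int main(void)
{
	struct entry top  = { .has_matching_key = false };
	struct entry mid  = { .parent = &top, .has_matching_key = true, .survives_filter = true };
	struct entry leaf = { .parent = &mid, .has_matching_key = false };

	apply_level_filter(&top);
	apply_level_filter(&mid);
	apply_level_filter(&leaf);

	/* prints top=0 mid=0 leaf=0: the matching middle level rescued the chain */
	printf("top=%d mid=%d leaf=%d\n", top.filtered, mid.filtered, leaf.filtered);
	return 0;
}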
Signed-off-by: Namhyung Kim Tested-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1457531222-18130-5-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index ea1f722..151afc1 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -129,7 +129,6 @@ struct hist_entry { void *raw_data; u32 raw_size; void *trace_output; - struct perf_hpp_fmt *fmt; struct perf_hpp_list *hpp_list; struct hist_entry *parent_he; union { -- cgit v0.10.2 From 325a62834e81452d2a6e253444022cf493bbabfc Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 9 Mar 2016 22:47:00 +0900 Subject: perf hists browser: Cleanup hist_browser__fprintf_hierarchy_entry() The hist_browser__fprintf_hierarchy_entry() is to dump the current output into a file, so it needs to be kept in sync with the corresponding function hist_browser__show_hierarchy_entry(). So use hists->nr_hpp_node for the indent width and use the first fmt_node to print the overhead columns instead of checking whether it's a sort entry (or dynamic entry). Signed-off-by: Namhyung Kim Tested-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1457531222-18130-6-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index e0e217e..aed9c8f 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1928,8 +1928,7 @@ static int hist_browser__fprintf_entry(struct hist_browser *browser, static int hist_browser__fprintf_hierarchy_entry(struct hist_browser *browser, struct hist_entry *he, - FILE *fp, int level, - int nr_sort_keys) + FILE *fp, int level) { char s[8192]; int printed = 0; @@ -1939,23 +1938,20 @@ static int hist_browser__fprintf_hierarchy_entry(struct hist_browser *browser, .size = sizeof(s), }; struct perf_hpp_fmt *fmt; + struct perf_hpp_list_node *fmt_node; bool first = true; int ret; - int hierarchy_indent = nr_sort_keys * HIERARCHY_INDENT; + int hierarchy_indent = (he->hists->nr_hpp_node - 2) * HIERARCHY_INDENT; printed = fprintf(fp, "%*s", level * HIERARCHY_INDENT, ""); folded_sign = hist_entry__folded(he); printed += fprintf(fp, "%c", folded_sign); - hists__for_each_format(he->hists, fmt) { - if (perf_hpp__should_skip(fmt, he->hists)) - continue; - - if (perf_hpp__is_sort_entry(fmt) || - perf_hpp__is_dynamic_entry(fmt)) - break; - + /* the first hpp_list_node is for overhead columns */ + fmt_node = list_first_entry(&he->hists->hpp_formats, + struct perf_hpp_list_node, list); + perf_hpp_list__for_each_format(&fmt_node->hpp, fmt) { if (!first) { ret = scnprintf(hpp.buf, hpp.size, " "); advance_hpp(&hpp, ret); @@ -1992,7 +1988,6 @@ static int hist_browser__fprintf(struct hist_browser *browser, FILE *fp) struct rb_node *nd = hists__filter_entries(rb_first(browser->b.entries), browser->min_pcnt); int printed = 0; - int nr_sort = browser->hists->nr_sort_keys; while (nd) { struct hist_entry *h = rb_entry(nd, struct hist_entry, rb_node); @@ -2000,8 +1995,7 @@ static int hist_browser__fprintf(struct hist_browser *browser, FILE *fp) if (symbol_conf.report_hierarchy) { printed += hist_browser__fprintf_hierarchy_entry(browser, h, fp, - h->depth, - nr_sort); + h->depth); } else { printed += hist_browser__fprintf_entry(browser, h, fp); } -- cgit v0.10.2 From 86e3ee5224c17b7967aac39aa15539393c144de7 Mon Sep 17 00:00:00
2001 From: Namhyung Kim Date: Wed, 9 Mar 2016 22:47:01 +0900 Subject: perf tools: Remove nr_sort_keys field The nr_sort_keys field is to carry the number of sort entries in a hpp_list or hists to determine the depth of indentation of a hist entry. As it's only used in hierarchy mode and now we have used nr_hpp_node for this reason, there's no need to keep it anymore. Let's get rid of it. Signed-off-by: Namhyung Kim Tested-by: Jiri Olsa Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1457531222-18130-7-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/hist.c b/tools/perf/ui/hist.c index f03c4f7..3baeaa6 100644 --- a/tools/perf/ui/hist.c +++ b/tools/perf/ui/hist.c @@ -515,9 +515,6 @@ void perf_hpp_list__column_register(struct perf_hpp_list *list, void perf_hpp_list__register_sort_field(struct perf_hpp_list *list, struct perf_hpp_fmt *format) { - if (perf_hpp__is_sort_entry(format) || perf_hpp__is_dynamic_entry(format)) - list->nr_sort_keys++; - list_add_tail(&format->sort_list, &list->sorts); } diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 6870a1b..ead18c8 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -79,7 +79,6 @@ struct hists { int socket_filter; struct perf_hpp_list *hpp_list; struct list_head hpp_formats; - int nr_sort_keys; int nr_hpp_node; }; @@ -241,7 +240,6 @@ struct perf_hpp_fmt { struct perf_hpp_list { struct list_head fields; struct list_head sorts; - int nr_sort_keys; }; extern struct perf_hpp_list perf_hpp_list; diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 61c7402..ced849e 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2703,29 +2703,6 @@ out: return ret; } -static void evlist__set_hists_nr_sort_keys(struct perf_evlist *evlist) -{ - struct perf_evsel *evsel; - - evlist__for_each(evlist, evsel) { - struct perf_hpp_fmt *fmt; - struct hists *hists = evsel__hists(evsel); - - hists->nr_sort_keys = perf_hpp_list.nr_sort_keys; - - /* - * If dynamic entries were used, it might add multiple - * entries to each evsel for a single field name. Set - * actual number of sort keys for each hists. - */ - perf_hpp_list__for_each_sort_list(&perf_hpp_list, fmt) { - if (perf_hpp__is_dynamic_entry(fmt) && - !perf_hpp__defined_dynamic_entry(fmt, hists)) - hists->nr_sort_keys--; - } - } -} - int setup_sorting(struct perf_evlist *evlist) { int err; @@ -2740,9 +2717,6 @@ int setup_sorting(struct perf_evlist *evlist) return err; } - if (evlist != NULL) - evlist__set_hists_nr_sort_keys(evlist); - reset_dimensions(); /* -- cgit v0.10.2 From f7fb538afea55383a9383dac5c56887c601af5f4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 9 Mar 2016 22:47:02 +0900 Subject: perf tools: Recalc total periods using top-level entries in hierarchy When hierarchy mode is enabled, each entry in a hierarchy level shares the period. IOW an upper level entry's period is the sum of lower level entries. Thus perf uses only one of them to calculate the total period of hists. It was lowest-level (leaf) entries but it has a problem when it comes to filters. If a filter is applied, entries in the same level will be filtered or not. But upper level entries still have period of their sum including filtered one. So total sum of upper level entries will not be same as sum of lower level entries. This resulted in entries having more than 100% of overhead and it can be produced using perf top with filter(s). 
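The arithmetic behind this is worth spelling out: overhead is an entry's period divided by the hists total, so the total must be taken from the same level the entries are reported at. A toy calculation with made-up numbers (not the perf structures) shows how summing only the surviving leaves, while a top-level entry keeps the unfiltered sum, pushes its percentage past 100%:

#include <stdio.h>

struct node {
	long period;
	int filtered;
};

int main(void)
{
	/* one top-level entry whose period is the sum of its three children */
	struct node top = { .period = 300 };
	struct node leaves[] = {
		{ .period = 100, .filtered = 0 },
		{ .period = 100, .filtered = 1 },	/* dropped by a filter */
		{ .period = 100, .filtered = 0 },
	};
	long leaf_total = 0, top_total = top.period;
	int i;

	for (i = 0; i < 3; i++)
		if (!leaves[i].filtered)
			leaf_total += leaves[i].period;

	/* wrong: total from the filtered leaves, entry from the top level */
	printf("top-level overhead vs leaf total: %.1f%%\n",
	       100.0 * top.period / leaf_total);		/* 150.0% */

	/* right: recalculate the total from top-level entries only */
	printf("top-level overhead vs top total:  %.1f%%\n",
	       100.0 * top.period / top_total);			/* 100.0% */
	return 0;
}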
Reported-and-Tested-by: Jiri Olsa Signed-off-by: Namhyung Kim Cc: Andi Kleen Cc: David Ahern Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Wang Nan Link: http://lkml.kernel.org/r/1457531222-18130-8-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index a98f934..290b3cb 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -1453,6 +1453,31 @@ void hists__inc_stats(struct hists *hists, struct hist_entry *h) hists->stats.total_period += h->stat.period; } +static void hierarchy_recalc_total_periods(struct hists *hists) +{ + struct rb_node *node; + struct hist_entry *he; + + node = rb_first(&hists->entries); + + hists->stats.total_period = 0; + hists->stats.total_non_filtered_period = 0; + + /* + * recalculate total period using top-level entries only + * since lower level entries only see non-filtered entries + * but upper level entries have sum of both entries. + */ + while (node) { + he = rb_entry(node, struct hist_entry, rb_node); + node = rb_next(node); + + hists->stats.total_period += he->stat.period; + if (!he->filtered) + hists->stats.total_non_filtered_period += he->stat.period; + } +} + static void hierarchy_insert_output_entry(struct rb_root *root, struct hist_entry *he) { @@ -1518,11 +1543,6 @@ static void hists__hierarchy_output_resort(struct hists *hists, continue; } - /* only update stat for leaf entries to avoid duplication */ - hists__inc_stats(hists, he); - if (!he->filtered) - hists__calc_col_len(hists, he); - if (!use_callchain) continue; @@ -1602,11 +1622,13 @@ static void output_resort(struct hists *hists, struct ui_progress *prog, hists__reset_col_len(hists); if (symbol_conf.report_hierarchy) { - return hists__hierarchy_output_resort(hists, prog, - &hists->entries_collapsed, - &hists->entries, - min_callchain_hits, - use_callchain); + hists__hierarchy_output_resort(hists, prog, + &hists->entries_collapsed, + &hists->entries, + min_callchain_hits, + use_callchain); + hierarchy_recalc_total_periods(hists); + return; } if (sort__need_collapse) @@ -1927,6 +1949,8 @@ static void hists__filter_hierarchy(struct hists *hists, int type, const void *a } } + hierarchy_recalc_total_periods(hists); + /* * resort output after applying a new filter since filter in a lower * hierarchy can change periods in a upper hierarchy. -- cgit v0.10.2 From 078b8d4a406fa8ce4a3c9d5145c27be1ed2b1dfd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 9 Mar 2016 23:20:51 +0900 Subject: perf tools: Add sort__has_comm variable The sort__has_comm variable is to check whether the comm sort key is given. This is necessary to support thread filtering in the TUI hists browser later. 
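The pattern introduced here (and relied on by the next two patches) is small enough to sketch on its own: record which sort keys the user asked for while the sort order is parsed, then consult those flags before offering thread zooming or printing a pid that may not be meaningful. A condensed, self-contained illustration follows, using an ad-hoc strstr() check in place of perf's real sort-order parser.

#include <stdio.h>
#include <string.h>

/* capability flags filled in while the sort order is parsed */
static int has_thread, has_comm;

static void setup_sort_keys(const char *order)
{
	has_thread = strstr(order, "pid") != NULL;
	has_comm = strstr(order, "comm") != NULL;
}

/* zooming into a thread only makes sense if a thread-related key is in use */
static int can_zoom_thread(void)
{
	return has_thread || has_comm;
}

static void print_zoom_label(const char *comm, int tid)
{
	if (!can_zoom_thread()) {
		printf("(no thread zoom offered)\n");
		return;
	}
	if (has_thread)
		printf("Zoom into %s(%d) thread\n", comm, tid);
	else
		printf("Zoom into %s thread\n", comm);	/* pid would be misleading here */
}

int main(void)
{
	setup_sort_keys("comm,dso,symbol");
	print_zoom_label("firefox", 1234);

	setup_sort_keys("dso,symbol");
	print_zoom_label("firefox", 1234);
	return 0;
}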
Signed-off-by: Namhyung Kim Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1457533253-21419-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index ced849e..93fa136 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -27,6 +27,7 @@ int sort__has_sym = 0; int sort__has_dso = 0; int sort__has_socket = 0; int sort__has_thread = 0; +int sort__has_comm = 0; enum sort_mode sort__mode = SORT_MODE__NORMAL; /* @@ -2262,6 +2263,8 @@ static int sort_dimension__add(struct perf_hpp_list *list, const char *tok, sort__has_socket = 1; } else if (sd->entry == &sort_thread) { sort__has_thread = 1; + } else if (sd->entry == &sort_comm) { + sort__has_comm = 1; } return __sort_dimension__add(sd, list, level); diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 151afc1..3f4e359 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -37,6 +37,7 @@ extern int sort__has_parent; extern int sort__has_sym; extern int sort__has_socket; extern int sort__has_thread; +extern int sort__has_comm; extern enum sort_mode sort__mode; extern struct sort_entry sort_comm; extern struct sort_entry sort_dso; -- cgit v0.10.2 From 6962ccb37b50366014074aec6fd14497cf719642 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 10 Mar 2016 00:14:50 +0900 Subject: perf hists browser: Allow thread filtering for comm sort key The commit 2eafd410e669 ("perf hists browser: Only 'Zoom into thread' only when sort order has 'pid'") disabled thread filtering in hist browser for the default sort key. However the he->thread is still valid even if 'pid' sort key is not given. Only thing it should not use is the pid (or tid) of the thread. So allow to filter by thread when 'comm' sort key is given and show pid only if 'pid' sort key is given. Signed-off-by: Namhyung Kim Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1457536490-24084-1-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index aed9c8f..cb4191b 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -2136,11 +2136,18 @@ static int hists__browser_title(struct hists *hists, if (hists->uid_filter_str) printed += snprintf(bf + printed, size - printed, ", UID: %s", hists->uid_filter_str); - if (thread) - printed += scnprintf(bf + printed, size - printed, + if (thread) { + if (sort__has_thread) { + printed += scnprintf(bf + printed, size - printed, ", Thread: %s(%d)", (thread->comm_set ? thread__comm_str(thread) : ""), thread->tid); + } else { + printed += scnprintf(bf + printed, size - printed, + ", Thread: %s", + (thread->comm_set ? thread__comm_str(thread) : "")); + } + } if (dso) printed += scnprintf(bf + printed, size - printed, ", DSO: %s", dso->short_name); @@ -2321,9 +2328,15 @@ do_zoom_thread(struct hist_browser *browser, struct popup_action *act) thread__zput(browser->hists->thread_filter); ui_helpline__pop(); } else { - ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s(%d) thread\"", - thread->comm_set ? thread__comm_str(thread) : "", - thread->tid); + if (sort__has_thread) { + ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s(%d) thread\"", + thread->comm_set ? thread__comm_str(thread) : "", + thread->tid); + } else { + ui_helpline__fpush("To zoom out press ESC or ENTER + \"Zoom out of %s thread\"", + thread->comm_set ? 
thread__comm_str(thread) : ""); + } + browser->hists->thread_filter = thread__get(thread); perf_hpp__set_elide(HISTC_THREAD, false); pstack__push(browser->pstack, &browser->hists->thread_filter); @@ -2338,13 +2351,22 @@ static int add_thread_opt(struct hist_browser *browser, struct popup_action *act, char **optstr, struct thread *thread) { - if (!sort__has_thread || thread == NULL) + int ret; + + if ((!sort__has_thread && !sort__has_comm) || thread == NULL) return 0; - if (asprintf(optstr, "Zoom %s %s(%d) thread", - browser->hists->thread_filter ? "out of" : "into", - thread->comm_set ? thread__comm_str(thread) : "", - thread->tid) < 0) + if (sort__has_thread) { + ret = asprintf(optstr, "Zoom %s %s(%d) thread", + browser->hists->thread_filter ? "out of" : "into", + thread->comm_set ? thread__comm_str(thread) : "", + thread->tid); + } else { + ret = asprintf(optstr, "Zoom %s %s thread", + browser->hists->thread_filter ? "out of" : "into", + thread->comm_set ? thread__comm_str(thread) : ""); + } + if (ret < 0) return 0; act->thread = thread; -- cgit v0.10.2 From 599a2f38a989a79df99838f22cb607f5e2b5b56c Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 9 Mar 2016 23:20:53 +0900 Subject: perf hists browser: Check sort keys before hot key actions The context menu in TUI hists browser checks corresponding sort keys when creating the menu item. But hotkey actions lacks these checks so it can filter using incorrect info. For example, default sort key of 'perf top' doesn't contain 'comm' or 'pid' sort key so each hist entry's thread info is not reliable. Thus it should prohibit using thread filter on 't' key. Signed-off-by: Namhyung Kim Cc: David Ahern Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1457533253-21419-3-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index cb4191b..4b98165 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -2322,6 +2322,9 @@ do_zoom_thread(struct hist_browser *browser, struct popup_action *act) { struct thread *thread = act->thread; + if ((!sort__has_thread && !sort__has_comm) || thread == NULL) + return 0; + if (browser->hists->thread_filter) { pstack__remove(browser->pstack, &browser->hists->thread_filter); perf_hpp__set_elide(HISTC_THREAD, false); @@ -2379,6 +2382,9 @@ do_zoom_dso(struct hist_browser *browser, struct popup_action *act) { struct map *map = act->ms.map; + if (!sort__has_dso || map == NULL) + return 0; + if (browser->hists->dso_filter) { pstack__remove(browser->pstack, &browser->hists->dso_filter); perf_hpp__set_elide(HISTC_DSO, false); @@ -2530,6 +2536,9 @@ add_exit_opt(struct hist_browser *browser __maybe_unused, static int do_zoom_socket(struct hist_browser *browser, struct popup_action *act) { + if (!sort__has_socket || act->socket < 0) + return 0; + if (browser->hists->socket_filter > -1) { pstack__remove(browser->pstack, &browser->hists->socket_filter); browser->hists->socket_filter = -1; -- cgit v0.10.2 From 6b45f7b2a37b0e00693985fd0abfc8e0319f91ce Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 3 Mar 2016 15:57:35 -0800 Subject: perf stat: Document CSV format in manpage With all the recently added fields in the perf stat CSV output we should finally document them in the man page. Do this here. v2: Fix fields in documentation (Jiri) v3: fix order of fields again (Jiri) v4: Change order again. 
v5: Document more fields (Jiri) v6: Move time stamp first v7: More fixes (Jiri) Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/1457049458-28956-5-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 14d9e8f..8812d73 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -219,6 +219,29 @@ $ perf stat -- make -j Wall-clock time elapsed: 719.554352 msecs +CSV FORMAT +---------- + +With -x, perf stat is able to output a not-quite-CSV format output +Commas in the output are not put into "". To make it easy to parse +it is recommended to use a different character like -x \; + +The fields are in this order: + + - optional usec time stamp in fractions of second (with -I xxx) + - optional CPU, core, or socket identifier + - optional number of logical CPUs aggregated + - counter value + - unit of the counter value or empty + - event name + - run time of counter + - percentage of measurement time the counter was running + - optional variance if multiple values are collected with -r + - optional metric value + - optional unit of metric + +Additional metrics may be printed with all earlier fields being empty. + SEE ALSO -------- linkperf:perf-top[1], linkperf:perf-list[1] -- cgit v0.10.2 From 54b5091606c18f68a7fc8b4ab03ac4592c7d2922 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 3 Mar 2016 15:57:36 -0800 Subject: perf stat: Implement --metric-only mode Add a new mode to only print metrics. Sometimes we don't care about the raw values, just want the computed metrics. This allows more compact printing, so with -I each sample is only a single line. This also allows easier plotting and processing with other tools. The main target is with using --topdown, but it also works with -T and standard perf stat. A few metrics are not supported. To avoiding having to hardcode all the metrics in the code it uses a two pass approach: first compute dummy metrics and only print the headers in the print_metric callback. Then use the callback to print the actual values. There are some additional changes in the stat printout code to handle all metrics being on a single line. One issue is that the column code doesn't know in advance what events are not supported by the CPU, and it would be hard to find out as this could change based on dynamic conditions. That causes empty columns in some cases. The output can be fairly wide, often you may need more than 80 columns. Example: % perf stat -a -I 1000 --metric-only 1.001452803 frontend cycles idle insn per cycle stalled cycles per insn branch-misses of all branches 1.001452803 158.91% 0.66 2.39 2.92% 2.002192321 180.63% 0.76 2.08 2.96% 3.003088282 150.59% 0.62 2.57 2.84% 4.004369835 196.20% 0.98 1.62 3.79% 5.005227314 231.98% 0.84 1.90 4.71% v2: Lots of updates. v3: Use slightly narrower columns v4: Add comment Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/1457049458-28956-6-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 8812d73..82f0951 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -147,6 +147,10 @@ Print count deltas every N milliseconds (minimum: 10ms) The overhead percentage could be high in some cases, for instance with small, sub 100ms intervals. Use with caution. 
example: 'perf stat -I 1000 -e cycles -a sleep 5' +--metric-only:: +Only print computed metrics. Print them in a single line. +Don't show any raw values. Not supported with -A or --per-thread. + --per-socket:: Aggregate counts per processor socket for system-wide mode measurements. This is a useful mode to detect imbalance between sockets. To enable this mode, diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index baa8207..74508c9 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -122,6 +122,7 @@ static bool sync_run = false; static unsigned int initial_delay = 0; static unsigned int unit_width = 4; /* strlen("unit") */ static bool forever = false; +static bool metric_only = false; static struct timespec ref_time; static struct cpu_map *aggr_map; static aggr_get_id_t aggr_get_id; @@ -827,6 +828,99 @@ static void print_metric_csv(void *ctx, fprintf(out, "%s%s%s%s", csv_sep, vals, csv_sep, unit); } +#define METRIC_ONLY_LEN 20 + +/* Filter out some columns that don't work well in metrics only mode */ + +static bool valid_only_metric(const char *unit) +{ + if (!unit) + return false; + if (strstr(unit, "/sec") || + strstr(unit, "hz") || + strstr(unit, "Hz") || + strstr(unit, "CPUs utilized")) + return false; + return true; +} + +static const char *fixunit(char *buf, struct perf_evsel *evsel, + const char *unit) +{ + if (!strncmp(unit, "of all", 6)) { + snprintf(buf, 1024, "%s %s", perf_evsel__name(evsel), + unit); + return buf; + } + return unit; +} + +static void print_metric_only(void *ctx, const char *color, const char *fmt, + const char *unit, double val) +{ + struct outstate *os = ctx; + FILE *out = os->fh; + int n; + char buf[1024]; + unsigned mlen = METRIC_ONLY_LEN; + + if (!valid_only_metric(unit)) + return; + unit = fixunit(buf, os->evsel, unit); + if (color) + n = color_fprintf(out, color, fmt, val); + else + n = fprintf(out, fmt, val); + if (n > METRIC_ONLY_LEN) + n = METRIC_ONLY_LEN; + if (mlen < strlen(unit)) + mlen = strlen(unit) + 1; + fprintf(out, "%*s", mlen - n, ""); +} + +static void print_metric_only_csv(void *ctx, const char *color __maybe_unused, + const char *fmt, + const char *unit, double val) +{ + struct outstate *os = ctx; + FILE *out = os->fh; + char buf[64], *vals, *ends; + char tbuf[1024]; + + if (!valid_only_metric(unit)) + return; + unit = fixunit(tbuf, os->evsel, unit); + snprintf(buf, sizeof buf, fmt, val); + vals = buf; + while (isspace(*vals)) + vals++; + ends = vals; + while (isdigit(*ends) || *ends == '.') + ends++; + *ends = 0; + fprintf(out, "%s%s", vals, csv_sep); +} + +static void new_line_metric(void *ctx __maybe_unused) +{ +} + +static void print_metric_header(void *ctx, const char *color __maybe_unused, + const char *fmt __maybe_unused, + const char *unit, double val __maybe_unused) +{ + struct outstate *os = ctx; + char tbuf[1024]; + + if (!valid_only_metric(unit)) + return; + unit = fixunit(tbuf, os->evsel, unit); + if (csv_output) + fprintf(os->fh, "%s%s", unit, csv_sep); + else + fprintf(os->fh, "%-*s ", METRIC_ONLY_LEN, unit); +} + static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg) { FILE *output = stat_config.output; @@ -921,9 +1015,16 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval, print_metric_t pm = print_metric_std; void (*nl)(void *); - nl = new_line_std; + if (metric_only) { + nl = new_line_metric; + if (csv_output) + pm = print_metric_only_csv; + else + pm = print_metric_only; + } else + nl = new_line_std; - if (csv_output) { + if 
(csv_output && !metric_only) { static int aggr_fields[] = { [AGGR_GLOBAL] = 0, [AGGR_THREAD] = 1, @@ -940,6 +1041,10 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval, os.nfields++; } if (run == 0 || ena == 0 || counter->counts->scaled == -1) { + if (metric_only) { + pm(&os, NULL, "", "", 0); + return; + } aggr_printout(counter, id, nr); fprintf(stat_config.output, "%*s%s", @@ -968,7 +1073,9 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval, return; } - if (nsec_counter(counter)) + if (metric_only) + /* nothing */; + else if (nsec_counter(counter)) nsec_printout(id, nr, counter, uval); else abs_printout(id, nr, counter, uval); @@ -977,7 +1084,7 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval, out.new_line = nl; out.ctx = &os; - if (csv_output) { + if (csv_output && !metric_only) { print_noise(counter, noise); print_running(run, ena); } @@ -985,7 +1092,7 @@ static void printout(int id, int nr, struct perf_evsel *counter, double uval, perf_stat__print_shadow_stats(counter, uval, first_shadow_cpu(counter, id), &out); - if (!csv_output) { + if (!csv_output && !metric_only) { print_noise(counter, noise); print_running(run, ena); } @@ -1021,14 +1128,23 @@ static void print_aggr(char *prefix) int cpu, s, s2, id, nr; double uval; u64 ena, run, val; + bool first; if (!(aggr_map || aggr_get_id)) return; aggr_update_shadow(); + /* + * With metric_only everything is on a single line. + * Without each counter has its own line. + */ for (s = 0; s < aggr_map->nr; s++) { + if (prefix && metric_only) + fprintf(output, "%s", prefix); + id = aggr_map->map[s]; + first = true; evlist__for_each(evsel_list, counter) { val = ena = run = 0; nr = 0; @@ -1041,13 +1157,20 @@ static void print_aggr(char *prefix) run += perf_counts(counter->counts, cpu, 0)->run; nr++; } - if (prefix) + if (first && metric_only) { + first = false; + aggr_printout(counter, id, nr); + } + if (prefix && !metric_only) fprintf(output, "%s", prefix); uval = val * counter->scale; printout(id, nr, counter, uval, prefix, run, ena, 1.0); - fputc('\n', output); + if (!metric_only) + fputc('\n', output); } + if (metric_only) + fputc('\n', output); } } @@ -1092,12 +1215,13 @@ static void print_counter_aggr(struct perf_evsel *counter, char *prefix) avg_enabled = avg_stats(&ps->res_stats[1]); avg_running = avg_stats(&ps->res_stats[2]); - if (prefix) + if (prefix && !metric_only) fprintf(output, "%s", prefix); uval = avg * counter->scale; printout(-1, 0, counter, uval, prefix, avg_running, avg_enabled, avg); - fprintf(output, "\n"); + if (!metric_only) + fprintf(output, "\n"); } /* @@ -1126,6 +1250,43 @@ static void print_counter(struct perf_evsel *counter, char *prefix) } } +static int aggr_header_lens[] = { + [AGGR_CORE] = 18, + [AGGR_SOCKET] = 12, + [AGGR_NONE] = 15, + [AGGR_THREAD] = 24, + [AGGR_GLOBAL] = 0, +}; + +static void print_metric_headers(char *prefix) +{ + struct perf_stat_output_ctx out; + struct perf_evsel *counter; + struct outstate os = { + .fh = stat_config.output + }; + + if (prefix) + fprintf(stat_config.output, "%s", prefix); + + if (!csv_output) + fprintf(stat_config.output, "%*s", + aggr_header_lens[stat_config.aggr_mode], ""); + + /* Print metrics headers only */ + evlist__for_each(evsel_list, counter) { + os.evsel = counter; + out.ctx = &os; + out.print_metric = print_metric_header; + out.new_line = new_line_metric; + os.evsel = counter; + perf_stat__print_shadow_stats(counter, 0, + 0, + &out); + } + fputc('\n', stat_config.output); +} 
+ static void print_interval(char *prefix, struct timespec *ts) { FILE *output = stat_config.output; @@ -1133,7 +1294,7 @@ static void print_interval(char *prefix, struct timespec *ts) sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, csv_sep); - if (num_print_interval == 0 && !csv_output) { + if (num_print_interval == 0 && !csv_output && !metric_only) { switch (stat_config.aggr_mode) { case AGGR_SOCKET: fprintf(output, "# time socket cpus counts %*s events\n", unit_width, "unit"); @@ -1220,6 +1381,17 @@ static void print_counters(struct timespec *ts, int argc, const char **argv) else print_header(argc, argv); + if (metric_only) { + static int num_print_iv; + + if (num_print_iv == 0) + print_metric_headers(prefix); + if (num_print_iv++ == 25) + num_print_iv = 0; + if (stat_config.aggr_mode == AGGR_GLOBAL && prefix) + fprintf(stat_config.output, "%s", prefix); + } + switch (stat_config.aggr_mode) { case AGGR_CORE: case AGGR_SOCKET: @@ -1232,6 +1404,8 @@ static void print_counters(struct timespec *ts, int argc, const char **argv) case AGGR_GLOBAL: evlist__for_each(evsel_list, counter) print_counter_aggr(counter, prefix); + if (metric_only) + fputc('\n', stat_config.output); break; case AGGR_NONE: evlist__for_each(evsel_list, counter) @@ -1356,6 +1530,8 @@ static const struct option stat_options[] = { "aggregate counts per thread", AGGR_THREAD), OPT_UINTEGER('D', "delay", &initial_delay, "ms to wait before starting measurement after program start"), + OPT_BOOLEAN(0, "metric-only", &metric_only, + "Only print computed metrics. No raw values"), OPT_END() }; @@ -1997,6 +2173,21 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) goto out; } + if (metric_only && stat_config.aggr_mode == AGGR_THREAD) { + fprintf(stderr, "--metric-only is not supported with --per-thread\n"); + goto out; + } + + if (metric_only && stat_config.aggr_mode == AGGR_NONE) { + fprintf(stderr, "--metric-only is not supported with -A\n"); + goto out; + } + + if (metric_only && run_count > 1) { + fprintf(stderr, "--metric-only is not supported with -r\n"); + goto out; + } + if (output_fd < 0) { fprintf(stderr, "argument to --log-fd must be a > 0\n"); parse_options_usage(stat_usage, stat_options, "log-fd", 0); -- cgit v0.10.2 From 206cab651d07563d766c7f4cb73f858c5df3dec5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 3 Mar 2016 15:57:37 -0800 Subject: perf stat: Add --metric-only support for -A Add metric only support for -A too. This requires a new print function that prints the metrics in the right order. v2: Fix manpage v3: Simplify nrcpus computation Signed-off-by: Andi Kleen Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/1457049458-28956-7-git-send-email-andi@firstfloor.org Signed-off-by: Arnaldo Carvalho de Melo diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 82f0951..04f23b4 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -149,7 +149,7 @@ The overhead percentage could be high in some cases, for instance with small, su --metric-only:: Only print computed metrics. Print them in a single line. -Don't show any raw values. Not supported with -A or --per-thread. +Don't show any raw values. Not supported with --per-thread. --per-socket:: Aggregate counts per processor socket for system-wide mode measurements. 
This diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 74508c9..1f19f2f 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1250,10 +1250,40 @@ static void print_counter(struct perf_evsel *counter, char *prefix) } } +static void print_no_aggr_metric(char *prefix) +{ + int cpu; + int nrcpus = 0; + struct perf_evsel *counter; + u64 ena, run, val; + double uval; + + nrcpus = evsel_list->cpus->nr; + for (cpu = 0; cpu < nrcpus; cpu++) { + bool first = true; + + if (prefix) + fputs(prefix, stat_config.output); + evlist__for_each(evsel_list, counter) { + if (first) { + aggr_printout(counter, cpu, 0); + first = false; + } + val = perf_counts(counter->counts, cpu, 0)->val; + ena = perf_counts(counter->counts, cpu, 0)->ena; + run = perf_counts(counter->counts, cpu, 0)->run; + + uval = val * counter->scale; + printout(cpu, 0, counter, uval, prefix, run, ena, 1.0); + } + fputc('\n', stat_config.output); + } +} + static int aggr_header_lens[] = { [AGGR_CORE] = 18, [AGGR_SOCKET] = 12, - [AGGR_NONE] = 15, + [AGGR_NONE] = 6, [AGGR_THREAD] = 24, [AGGR_GLOBAL] = 0, }; @@ -1408,8 +1438,12 @@ static void print_counters(struct timespec *ts, int argc, const char **argv) fputc('\n', stat_config.output); break; case AGGR_NONE: - evlist__for_each(evsel_list, counter) - print_counter(counter, prefix); + if (metric_only) + print_no_aggr_metric(prefix); + else { + evlist__for_each(evsel_list, counter) + print_counter(counter, prefix); + } break; case AGGR_UNSET: default: @@ -2178,11 +2212,6 @@ int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused) goto out; } - if (metric_only && stat_config.aggr_mode == AGGR_NONE) { - fprintf(stderr, "--metric-only is not supported with -A\n"); - goto out; - } - if (metric_only && run_count > 1) { fprintf(stderr, "--metric-only is not supported with -r\n"); goto out; -- cgit v0.10.2 From 2a58c527bb566b7abad96289fa5b973040c33c82 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Mar 2016 20:42:08 +0100 Subject: cpu/hotplug: Fix smpboot thread ordering Commit 931ef163309e moved the smpboot thread park/unpark invocation to the state machine. The move of the unpark invocation was premature as it depends on work in progress patches. As a result cpu down can fail, because rcu synchronization in takedown_cpu() eventually requires a functional softirq thread. I never encountered the problem in testing, but 0day testing managed to provide a reliable reproducer. Remove the smpboot_threads_park() call from the state machine for now and put it back into the original place after the rcu synchronization. I'm embarrassed as I knew about the dependency and still managed to get it wrong. Hotplug induced brain melt seems to be the only sensible explanation for that. 
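The ordering constraint described above is easiest to see in plain source: the RCU grace period has to complete while the dying CPU's smpboot (softirq) threads are still functional, and only then may they be parked. A minimal sketch of the corrected sequence, reusing the helper names visible in the diff below (not the kernel's actual takedown_cpu()):

  static int takedown_cpu_sketch(unsigned int cpu)
  {
          /* May rely on a live ksoftirqd/RCU smpboot thread on @cpu: */
          synchronize_rcu();

          /* Only now is it safe to park the per-CPU smpboot threads: */
          smpboot_park_threads(cpu);

          /* ... rest of the teardown (irq migration, stop_machine(), ...) ... */
          return 0;
  }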
Fixes: 931ef163309e "cpu/hotplug: Unpark smpboot threads from the state machine" Reported-by: Fengguang Wu Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra diff --git a/kernel/cpu.c b/kernel/cpu.c index 373e831..bcee286 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -706,8 +706,9 @@ static int takedown_cpu(unsigned int cpu) else synchronize_rcu(); - /* Park the hotplug thread */ + /* Park the smpboot threads */ kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread); + smpboot_park_threads(cpu); /* * Prevent irq alloc/free while the dying cpu reorganizes the @@ -1206,7 +1207,7 @@ static struct cpuhp_step cpuhp_ap_states[] = { [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot:threads", .startup = smpboot_unpark_threads, - .teardown = smpboot_park_threads, + .teardown = NULL, }, [CPUHP_AP_NOTIFY_ONLINE] = { .name = "notify:online", -- cgit v0.10.2 From 0bbca274a31c2366414d8d3f95e03d1c4674d93f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 7 Mar 2016 17:56:57 +0200 Subject: drm/i915: Actually retry with bit-banging after GMBUS timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the GMBUS transfer times out, we set force_bit=1 and return -EAGAIN expecting the i2c core to call the .master_xfer hook again so that we will retry the same transfer via bit-banging. This is in case the gmbus hardware is somehow faulty. Unfortunately we left adapter->retries to 0, meaning the i2c core didn't actually do the retry. Let's tell the core we want one retry when we return -EAGAIN. Note that i2c-algo-bit also uses this retry count for some internal retries, so we'll end up increasing those a bit as well. Cc: Jani Nikula Cc: drm-intel-fixes@lists.freedesktop.org Fixes: bffce907d640 ("drm/i915: abstract i2c bit banging fallback in gmbus xfer") Signed-off-by: Ville Syrjälä Link: http://patchwork.freedesktop.org/patch/msgid/1457366220-29409-2-git-send-email-ville.syrjala@linux.intel.com Reviewed-by: Jani Nikula (cherry picked from commit 8b1f165a4a8f64c28cf42d10e1f4d3b451dedc51) Signed-off-by: Jani Nikula diff --git a/drivers/gpu/drm/i915/intel_i2c.c b/drivers/gpu/drm/i915/intel_i2c.c index deb8282..52fbe53 100644 --- a/drivers/gpu/drm/i915/intel_i2c.c +++ b/drivers/gpu/drm/i915/intel_i2c.c @@ -664,6 +664,12 @@ int intel_setup_gmbus(struct drm_device *dev) bus->adapter.algo = &gmbus_algorithm; + /* + * We wish to retry with bit banging + * after a timed out GMBUS attempt. + */ + bus->adapter.retries = 1; + /* By default use a conservative clock rate */ bus->reg0 = pin | GMBUS_RATE_100KHZ; -- cgit v0.10.2 From 8b8addf891de8a00e4d39fc32f93f7c5eb8feceb Mon Sep 17 00:00:00 2001 From: Hector Marco-Gisbert Date: Thu, 10 Mar 2016 20:51:00 +0100 Subject: x86/mm/32: Enable full randomization on i386 and X86_32 Currently on i386 and on X86_64 when emulating X86_32 in legacy mode, only the stack and the executable are randomized but not other mmapped files (libraries, vDSO, etc.). This patch enables randomization for the libraries, vDSO and mmap requests on i386 and in X86_32 in legacy mode. By default on i386 there are 8 bits for the randomization of the libraries, vDSO and mmaps which only uses 1MB of VA. This patch preserves the original randomness, using 1MB of VA out of 3GB or 4GB. We think that 1MB out of 3GB is not a big cost for having the ASLR. 
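The 1MB figure follows from the default 8 bits of mmap entropy on i386 multiplied by the 4KB page size: (1 << 8) * 4KB = 1MB. A minimal sketch of how such a randomized legacy base is derived (hypothetical names and constants; the real code obtains the entropy via arch_mmap_rnd() as in the diff below):

  #define SKETCH_MMAP_RND_BITS  8          /* default i386 mmap entropy */
  #define SKETCH_PAGE_SHIFT     12         /* 4 KiB pages */

  /* 8 bits of entropy scaled by the page size span exactly 1 MiB of VA. */
  static unsigned long sketch_mmap_legacy_base(unsigned long task_unmapped_base,
                                               unsigned long entropy)
  {
          unsigned long rnd = entropy & ((1UL << SKETCH_MMAP_RND_BITS) - 1);

          return task_unmapped_base + (rnd << SKETCH_PAGE_SHIFT);
  }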
The first obvious security benefit is that all objects are randomized (not only the stack and the executable) in legacy mode, which highly increases the ASLR effectiveness, otherwise the attackers may use these non-randomized areas. Sensitive setuid/setgid applications are also more secure because currently, attackers can disable the randomization of these applications by setting the ulimit stack to "unlimited". This is a very old and widely known trick to disable the ASLR in i386 which has been allowed for too long. Another trick used to disable the ASLR was to set the ADDR_NO_RANDOMIZE personality flag, but fortunately this doesn't work on setuid/setgid applications because there are security checks which clear security-relevant flags. This patch always randomizes the mmap_legacy_base address, removing the possibility of disabling the ASLR by setting the stack to "unlimited". Signed-off-by: Hector Marco-Gisbert Acked-by: Ismael Ripoll Ripoll Acked-by: Kees Cook Acked-by: Arjan van de Ven Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: kees Cook Link: http://lkml.kernel.org/r/1457639460-5242-1-git-send-email-hecmargi@upv.es Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 96bd1e2..389939f 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -94,18 +94,6 @@ static unsigned long mmap_base(unsigned long rnd) } /* - * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 - * does, but not when emulating X86_32 - */ -static unsigned long mmap_legacy_base(unsigned long rnd) -{ - if (mmap_is_ia32()) - return TASK_UNMAPPED_BASE; - else - return TASK_UNMAPPED_BASE + rnd; -} - -/* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ @@ -116,7 +104,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); - mm->mmap_legacy_base = mmap_legacy_base(random_factor); + mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor; if (mmap_is_legacy()) { mm->mmap_base = mm->mmap_legacy_base; -- cgit v0.10.2 From 143d36a33b4d59a56bb8e913a17a105578fd3237 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 11 Mar 2016 11:14:43 +0300 Subject: irqchip/irq-alpine-msi: Release the correct domain on error The "msi_domain" variable is NULL here so it leads to a NULL dereference. It looks like we actually intended to free "middle_domain".
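This is the classic "free the wrong object on the error path" pattern: the constructor that failed returned NULL, so the cleanup must target the object that does exist. A sketch of the corrected branch (the constructor call is an illustrative placeholder; only irq_domain_remove() and the error handling mirror the fix below):

  msi_domain = create_pci_msi_domain(node, middle_domain);  /* hypothetical helper */
  if (!msi_domain) {
          pr_err("Failed to create MSI domain\n");
          /* msi_domain is NULL here; release the domain that was created. */
          irq_domain_remove(middle_domain);
          return -ENOMEM;
  }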
Fixes: e6b78f2c3e14 ('irqchip: Add the Alpine MSIX interrupt controller') Signed-off-by: Dan Carpenter Cc: Jason Cooper Cc: Marc Zyngier Cc: Antoine Tenart Cc: kernel-janitors@vger.kernel.org Cc: Tsahee Zidenberg Link: http://lkml.kernel.org/r/20160311081442.GE31887@mwanda Signed-off-by: Thomas Gleixner diff --git a/drivers/irqchip/irq-alpine-msi.c b/drivers/irqchip/irq-alpine-msi.c index f871272..2538425 100644 --- a/drivers/irqchip/irq-alpine-msi.c +++ b/drivers/irqchip/irq-alpine-msi.c @@ -220,7 +220,7 @@ static int alpine_msix_init_domains(struct alpine_msix_data *priv, middle_domain); if (!msi_domain) { pr_err("Failed to create MSI domain\n"); - irq_domain_remove(msi_domain); + irq_domain_remove(middle_domain); return -ENOMEM; } -- cgit v0.10.2 From d7d5a43c0d16760f25d892bf9329848167a8b8a4 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Tue, 8 Mar 2016 16:59:57 +0100 Subject: ARM: mvebu: fix overlap of Crypto SRAM with PCIe memory window When the Crypto SRAM mappings were added to the Device Tree files describing the Armada XP boards in commit c466d997bb16 ("ARM: mvebu: define crypto SRAM ranges for all armada-xp boards"), the fact that those mappings were overlaping with the PCIe memory aperture was overlooked. Due to this, we currently have for all Armada XP platforms a situation that looks like this: Memory mapping on Armada XP boards with internal registers at 0xf1000000: - 0x00000000 -> 0xf0000000 3.75G RAM - 0xf0000000 -> 0xf1000000 16M NOR flashes (AXP GP / AXP DB) - 0xf1000000 -> 0xf1100000 1M internal registers - 0xf8000000 -> 0xffe0000 126M PCIe memory aperture - 0xf8100000 -> 0xf8110000 64KB Crypto SRAM #0 => OVERLAPS WITH PCIE ! - 0xf8110000 -> 0xf8120000 64KB Crypto SRAM #1 => OVERLAPS WITH PCIE ! - 0xffe00000 -> 0xfff00000 1M PCIe I/O aperture - 0xfff0000 -> 0xffffffff 1M BootROM The overlap means that when PCIe devices are added, depending on their memory window needs, they might or might not be mapped into the physical address space. Indeed, they will not be mapped if the area allocated in the PCIe memory aperture by the PCI core overlaps with one of the Crypto SRAM. Typically, a Intel IGB PCIe NIC that needs 8MB of PCIe memory will see its PCIe memory window allocated from 0xf80000000 for 8MB, which overlaps with the Crypto SRAM windows. Due to this, the PCIe window is not created, and any attempt to access the PCIe window makes the kernel explode: [ 3.302213] igb: Copyright (c) 2007-2014 Intel Corporation. [ 3.307841] pci 0000:00:09.0: enabling device (0140 -> 0143) [ 3.313539] mvebu_mbus: cannot add window '4:f8', conflicts with another window [ 3.320870] mvebu-pcie soc:pcie-controller: Could not create MBus window at [mem 0xf8000000-0xf87fffff]: -22 [ 3.330811] Unhandled fault: external abort on non-linefetch (0x1008) at 0xf08c0018 This problem does not occur on Armada 370 boards, because we use the following memory mapping (for boards that have internal registers at 0xf1000000): - 0x00000000 -> 0xf0000000 3.75G RAM - 0xf0000000 -> 0xf1000000 16M NOR flashes (AXP GP / AXP DB) - 0xf1000000 -> 0xf1100000 1M internal registers - 0xf1100000 -> 0xf1110000 64KB Crypto SRAM #0 => OK ! - 0xf8000000 -> 0xffe0000 126M PCIe memory - 0xffe00000 -> 0xfff00000 1M PCIe I/O - 0xfff0000 -> 0xffffffff 1M BootROM Obviously, the solution is to align the location of the Crypto SRAM mappings of Armada XP to be similar with the ones on Armada 370, i.e have them between the "internal registers" area and the beginning of the PCIe aperture. 
However, we have a special case with the OpenBlocks AX3-4 platform, which has a 128 MB NOR flash. Currently, this NOR flash is mapped from 0xf0000000 to 0xf8000000. This is possible because on OpenBlocks AX3-4, the internal registers are not at 0xf1000000. And this explains why the Crypto SRAM mappings were not configured at the same place on Armada XP. Hence, the solution is two-fold: (1) Move the NOR flash mapping on Armada XP OpenBlocks AX3-4 from 0xe8000000 to 0xf0000000. This frees the 0xf0000000 -> 0xf80000000 space. (2) Move the Crypto SRAM mappings on Armada XP to be similar to Armada 370 (except of course that Armada XP has two Crypto SRAM and not one). After this patch, the memory mapping on Armada XP boards with registers at 0xf1 is: - 0x00000000 -> 0xf0000000 3.75G RAM - 0xf0000000 -> 0xf1000000 16M NOR flashes (AXP GP / AXP DB) - 0xf1000000 -> 0xf1100000 1M internal registers - 0xf1100000 -> 0xf1110000 64KB Crypto SRAM #0 - 0xf1110000 -> 0xf1120000 64KB Crypto SRAM #1 - 0xf8000000 -> 0xffe0000 126M PCIe memory - 0xffe00000 -> 0xfff00000 1M PCIe I/O - 0xfff0000 -> 0xffffffff 1M BootROM And the memory mapping for the special case of the OpenBlocks AX3-4 (internal registers at 0xd0000000, NOR of 128 MB): - 0x00000000 -> 0xc0000000 3G RAM - 0xd0000000 -> 0xd1000000 1M internal registers - 0xe800000 -> 0xf0000000 128M NOR flash - 0xf1100000 -> 0xf1110000 64KB Crypto SRAM #0 - 0xf1110000 -> 0xf1120000 64KB Crypto SRAM #1 - 0xf8000000 -> 0xffe0000 126M PCIe memory - 0xffe00000 -> 0xfff00000 1M PCIe I/O - 0xfff0000 -> 0xffffffff 1M BootROM Fixes: c466d997bb16 ("ARM: mvebu: define crypto SRAM ranges for all armada-xp boards") Reported-by: Phil Sutter Cc: Phil Sutter Cc: Signed-off-by: Thomas Petazzoni Acked-by: Gregory CLEMENT Signed-off-by: Olof Johansson diff --git a/arch/arm/boot/dts/armada-xp-axpwifiap.dts b/arch/arm/boot/dts/armada-xp-axpwifiap.dts index 23fc670..5c21b23 100644 --- a/arch/arm/boot/dts/armada-xp-axpwifiap.dts +++ b/arch/arm/boot/dts/armada-xp-axpwifiap.dts @@ -70,8 +70,8 @@ soc { ranges = ; + MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000 + MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>; pcie-controller { status = "okay"; diff --git a/arch/arm/boot/dts/armada-xp-db.dts b/arch/arm/boot/dts/armada-xp-db.dts index f774101..ebe1d26 100644 --- a/arch/arm/boot/dts/armada-xp-db.dts +++ b/arch/arm/boot/dts/armada-xp-db.dts @@ -76,8 +76,8 @@ ranges = ; + MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000 + MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>; devbus-bootcs { status = "okay"; diff --git a/arch/arm/boot/dts/armada-xp-gp.dts b/arch/arm/boot/dts/armada-xp-gp.dts index 4878d73..5730b87 100644 --- a/arch/arm/boot/dts/armada-xp-gp.dts +++ b/arch/arm/boot/dts/armada-xp-gp.dts @@ -95,8 +95,8 @@ ranges = ; + MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000 + MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>; devbus-bootcs { status = "okay"; diff --git a/arch/arm/boot/dts/armada-xp-lenovo-ix4-300d.dts b/arch/arm/boot/dts/armada-xp-lenovo-ix4-300d.dts index fb9e1bb..8af463f 100644 --- a/arch/arm/boot/dts/armada-xp-lenovo-ix4-300d.dts +++ b/arch/arm/boot/dts/armada-xp-lenovo-ix4-300d.dts @@ -65,8 +65,8 @@ soc { ranges = ; + MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000 + MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>; pcie-controller { status = "okay"; diff --git a/arch/arm/boot/dts/armada-xp-linksys-mamba.dts b/arch/arm/boot/dts/armada-xp-linksys-mamba.dts index 6e9820e..b89e6cf 100644 --- a/arch/arm/boot/dts/armada-xp-linksys-mamba.dts +++ b/arch/arm/boot/dts/armada-xp-linksys-mamba.dts @@ -70,8 +70,8 @@ soc 
{ ranges = ; + MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000 + MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>; pcie-controller { status = "okay"; diff --git a/arch/arm/boot/dts/armada-xp-matrix.dts b/arch/arm/boot/dts/armada-xp-matrix.dts index 6ab3383..6522b04 100644 --- a/arch/arm/boot/dts/armada-xp-matrix.dts +++ b/arch/arm/boot/dts/armada-xp-matrix.dts @@ -68,8 +68,8 @@ soc { ranges = ; + MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000 + MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>; internal-regs { serial@12000 { diff --git a/arch/arm/boot/dts/armada-xp-netgear-rn2120.dts b/arch/arm/boot/dts/armada-xp-netgear-rn2120.dts index 62175a8..d19f44c 100644 --- a/arch/arm/boot/dts/armada-xp-netgear-rn2120.dts +++ b/arch/arm/boot/dts/armada-xp-netgear-rn2120.dts @@ -64,8 +64,8 @@ soc { ranges = ; + MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000 + MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>; pcie-controller { status = "okay"; diff --git a/arch/arm/boot/dts/armada-xp-openblocks-ax3-4.dts b/arch/arm/boot/dts/armada-xp-openblocks-ax3-4.dts index a5db177..853bd39 100644 --- a/arch/arm/boot/dts/armada-xp-openblocks-ax3-4.dts +++ b/arch/arm/boot/dts/armada-xp-openblocks-ax3-4.dts @@ -65,9 +65,9 @@ soc { ranges = ; + MBUS_ID(0x01, 0x2f) 0 0 0xe8000000 0x8000000 + MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000 + MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>; devbus-bootcs { status = "okay"; diff --git a/arch/arm/boot/dts/armada-xp-synology-ds414.dts b/arch/arm/boot/dts/armada-xp-synology-ds414.dts index 2391b11..d17dab0 100644 --- a/arch/arm/boot/dts/armada-xp-synology-ds414.dts +++ b/arch/arm/boot/dts/armada-xp-synology-ds414.dts @@ -78,8 +78,8 @@ soc { ranges = ; + MBUS_ID(0x09, 0x09) 0 0 0xf1100000 0x10000 + MBUS_ID(0x09, 0x05) 0 0 0xf1110000 0x10000>; pcie-controller { status = "okay"; -- cgit v0.10.2 From 7640131032db9118a78af715ac77ba2debeeb17c Mon Sep 17 00:00:00 2001 From: Matthew Dawson Date: Fri, 11 Mar 2016 13:08:07 -0800 Subject: mm/mempool: avoid KASAN marking mempool poison checks as use-after-free When removing an element from the mempool, mark it as unpoisoned in KASAN before verifying its contents for SLUB/SLAB debugging. Otherwise KASAN will flag the reads checking the element use-after-free writes as use-after-free reads. Signed-off-by: Matthew Dawson Acked-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mempool.c b/mm/mempool.c index 004d42b..7924f4f 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -135,8 +135,8 @@ static void *remove_element(mempool_t *pool) void *element = pool->elements[--pool->curr_nr]; BUG_ON(pool->curr_nr < 0); - check_element(pool, element); kasan_unpoison_element(pool, element); + check_element(pool, element); return element; } -- cgit v0.10.2 From 6e6867093de35141f0a76b66ac13f9f2e2c8e77a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 11 Mar 2016 12:32:06 +0100 Subject: x86/fpu: Fix eager-FPU handling on legacy FPU machines i486 derived cores like Intel Quark support only the very old, legacy x87 FPU (FSAVE/FRSTOR, CPUID bit FXSR is not set), and our FPU code wasn't handling the saving and restoring there properly in the 'eagerfpu' case. So after we made eagerfpu the default for all CPU types: 58122bf1d856 x86/fpu: Default eagerfpu=on on all CPUs these old FPU designs broke. First, Andy Shevchenko reported a splat: WARNING: CPU: 0 PID: 823 at arch/x86/include/asm/fpu/internal.h:163 fpu__clear+0x8c/0x160 which was us trying to execute FXRSTOR on those machines even though they don't support it. 
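Because the diff is hard to read inline, the resulting restore path is worth spelling out: it is a three-way dispatch where XSAVE-capable CPUs use XRSTOR, CPUs with FXSR use FXRSTOR, and everything older falls back to the legacy FRSTOR format. A sketch mirroring the shape of the fix below (not the file verbatim):

  static inline void sketch_copy_init_fpstate_to_fpregs(void)
  {
          if (use_xsave())                                /* XRSTOR  */
                  copy_kernel_to_xregs(&init_fpstate.xsave, -1);
          else if (static_cpu_has(X86_FEATURE_FXSR))      /* FXRSTOR */
                  copy_kernel_to_fxregs(&init_fpstate.fxsave);
          else                                            /* FRSTOR  */
                  copy_kernel_to_fregs(&init_fpstate.fsave);
  }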
After taking care of that, Bryan O'Donoghue reported that a simple FPU test still failed because we weren't initializing the FPU state properly on those machines. Take care of all that. Reported-and-tested-by: Bryan O'Donoghue Reported-by: Andy Shevchenko Signed-off-by: Borislav Petkov Acked-by: Linus Torvalds Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Thomas Gleixner Cc: Yu-cheng Link: http://lkml.kernel.org/r/20160311113206.GD4312@pd.tnic Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index d25097c..d5804ad 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -409,8 +409,10 @@ static inline void copy_init_fpstate_to_fpregs(void) { if (use_xsave()) copy_kernel_to_xregs(&init_fpstate.xsave, -1); - else + else if (static_cpu_has(X86_FEATURE_FXSR)) copy_kernel_to_fxregs(&init_fpstate.fxsave); + else + copy_kernel_to_fregs(&init_fpstate.fsave); } /* diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 9ee7e30..bd08fb7 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -134,7 +134,7 @@ static void __init fpu__init_system_generic(void) * Set up the legacy init FPU context. (xstate init might overwrite this * with a more modern format, if the CPU supports it.) */ - fpstate_init_fxstate(&init_fpstate.fxsave); + fpstate_init(&init_fpstate); fpu__init_system_mxcsr(); } -- cgit v0.10.2 From 452308de61056a539352a9306c46716d7af8a1f1 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 11 Mar 2016 11:19:23 +0000 Subject: x86/efi: Fix boot crash by always mapping boot service regions into new EFI page tables Some machines have EFI regions in page zero (physical address 0x00000000) and historically that region has been added to the e820 map via trim_bios_range(), and ultimately mapped into the kernel page tables. It was not mapped via efi_map_regions() as one would expect. Alexis reports that with the new separate EFI page tables some boot services regions, such as page zero, are not mapped. This triggers an oops during the SetVirtualAddressMap() runtime call. For the EFI boot services quirk on x86 we need to memblock_reserve() boot services regions until after SetVirtualAddressMap(). Doing that while respecting the ownership of regions that may have already been reserved by the kernel was the motivation behind this commit: 7d68dc3f1003 ("x86, efi: Do not reserve boot services regions within reserved areas") That patch was merged at a time when the EFI runtime virtual mappings were inserted into the kernel page tables as described above, and the trick of setting ->numpages (and hence the region size) to zero to track regions that should not be freed in efi_free_boot_services() meant that we never mapped those regions in efi_map_regions(). Instead we were relying solely on the existing kernel mappings. Now that we have separate page tables we need to make sure the EFI boot services regions are mapped correctly, even if someone else has already called memblock_reserve(). Instead of stashing a tag in ->numpages, set the EFI_MEMORY_RUNTIME bit of ->attribute. Since it generally makes no sense to mark a boot services region as required at runtime, it's pretty much guaranteed the firmware will not have already set this bit. 
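The resulting pairing between the reserve and free sides can be condensed into a per-region sketch (summarizing the diff below, not the file verbatim): reserve time decides ownership and tags what must never be freed, free time honours the tag.

  /* efi_reserve_boot_services() side: */
  static void sketch_reserve_one(efi_memory_desc_t *md, u64 start, u64 size)
  {
          if (!memblock_is_region_reserved(start, size)) {
                  memblock_reserve(start, size);
                  if (can_free_region(start, size))
                          return;                 /* ours: free it later */
          }
          md->attribute |= EFI_MEMORY_RUNTIME;    /* not ours: never free it */
  }

  /* efi_free_boot_services() side: */
  static void sketch_free_one(efi_memory_desc_t *md, u64 start, u64 size)
  {
          if (md->attribute & EFI_MEMORY_RUNTIME)
                  return;                         /* someone else owns it */
          free_bootmem_late(start, size);
  }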
For the record, the specific circumstances under which Alexis triggered this bug was that an EFI runtime driver on his machine was responding to the EVT_SIGNAL_VIRTUAL_ADDRESS_CHANGE event during SetVirtualAddressMap(). The event handler for this driver looks like this, sub rsp,0x28 lea rdx,[rip+0x2445] # 0xaa948720 mov ecx,0x4 call func_aa9447c0 ; call to ConvertPointer(4, & 0xaa948720) mov r11,QWORD PTR [rip+0x2434] # 0xaa948720 xor eax,eax mov BYTE PTR [r11+0x1],0x1 add rsp,0x28 ret Which is pretty typical code for an EVT_SIGNAL_VIRTUAL_ADDRESS_CHANGE handler. The "mov r11, QWORD PTR [rip+0x2424]" was the faulting instruction because ConvertPointer() was being called to convert the address 0x0000000000000000, which when converted is left unchanged and remains 0x0000000000000000. The output of the oops trace gave the impression of a standard NULL pointer dereference bug, but because we're accessing physical addresses during ConvertPointer(), it wasn't. EFI boot services code is stored at that address on Alexis' machine. Reported-by: Alexis Murzeau Signed-off-by: Matt Fleming Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Ben Hutchings Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Matthew Garrett Cc: Peter Zijlstra Cc: Raphael Hertzog Cc: Roger Shimizu Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/1457695163-29632-2-git-send-email-matt@codeblueprint.co.uk Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=815125 Signed-off-by: Ingo Molnar diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 2d66db8..ed30e79 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -131,6 +131,27 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size) EXPORT_SYMBOL_GPL(efi_query_variable_store); /* + * Helper function for efi_reserve_boot_services() to figure out if we + * can free regions in efi_free_boot_services(). + * + * Use this function to ensure we do not free regions owned by somebody + * else. We must only reserve (and then free) regions: + * + * - Not within any part of the kernel + * - Not the BIOS reserved area (E820_RESERVED, E820_NVS, etc) + */ +static bool can_free_region(u64 start, u64 size) +{ + if (start + size > __pa_symbol(_text) && start <= __pa_symbol(_end)) + return false; + + if (!e820_all_mapped(start, start+size, E820_RAM)) + return false; + + return true; +} + +/* * The UEFI specification makes it clear that the operating system is free to do * whatever it wants with boot services code after ExitBootServices() has been * called. Ignoring this recommendation a significant bunch of EFI implementations @@ -147,26 +168,50 @@ void __init efi_reserve_boot_services(void) efi_memory_desc_t *md = p; u64 start = md->phys_addr; u64 size = md->num_pages << EFI_PAGE_SHIFT; + bool already_reserved; if (md->type != EFI_BOOT_SERVICES_CODE && md->type != EFI_BOOT_SERVICES_DATA) continue; - /* Only reserve where possible: - * - Not within any already allocated areas - * - Not over any memory area (really needed, if above?) 
- * - Not within any part of the kernel - * - Not the bios reserved area - */ - if ((start + size > __pa_symbol(_text) - && start <= __pa_symbol(_end)) || - !e820_all_mapped(start, start+size, E820_RAM) || - memblock_is_region_reserved(start, size)) { - /* Could not reserve, skip it */ - md->num_pages = 0; - memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n", - start, start+size-1); - } else + + already_reserved = memblock_is_region_reserved(start, size); + + /* + * Because the following memblock_reserve() is paired + * with free_bootmem_late() for this region in + * efi_free_boot_services(), we must be extremely + * careful not to reserve, and subsequently free, + * critical regions of memory (like the kernel image) or + * those regions that somebody else has already + * reserved. + * + * A good example of a critical region that must not be + * freed is page zero (first 4Kb of memory), which may + * contain boot services code/data but is marked + * E820_RESERVED by trim_bios_range(). + */ + if (!already_reserved) { memblock_reserve(start, size); + + /* + * If we are the first to reserve the region, no + * one else cares about it. We own it and can + * free it later. + */ + if (can_free_region(start, size)) + continue; + } + + /* + * We don't own the region. We must not free it. + * + * Setting this bit for a boot services region really + * doesn't make sense as far as the firmware is + * concerned, but it does provide us with a way to tag + * those regions that must not be paired with + * free_bootmem_late(). + */ + md->attribute |= EFI_MEMORY_RUNTIME; } } @@ -183,8 +228,8 @@ void __init efi_free_boot_services(void) md->type != EFI_BOOT_SERVICES_DATA) continue; - /* Could not reserve boot area */ - if (!size) + /* Do not free, someone else owns it: */ + if (md->attribute & EFI_MEMORY_RUNTIME) continue; free_bootmem_late(start, size); -- cgit v0.10.2 From d05004944206cbbf1c453e179768163731c7c6f1 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 10 Mar 2016 19:38:18 -0800 Subject: x86/cpufeature: Enable new AVX-512 features A few new AVX-512 instruction groups/features are added in cpufeatures.h for enuermation: AVX512DQ, AVX512BW, and AVX512VL. Clear the flags in fpu__xstate_clear_all_cpu_caps(). The specification for latest AVX-512 including the features can be found at: https://software.intel.com/sites/default/files/managed/07/b7/319433-023.pdf Note, I didn't enable the flags in KVM. Hopefully the KVM guys can pick up the flags and enable them in KVM. Signed-off-by: Fenghua Yu Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: Denys Vlasenko Cc: Gleb Natapov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Quentin Casasnovas Cc: Ravi V Shankar Cc: Thomas Gleixner Cc: kvm@vger.kernel.org Link: http://lkml.kernel.org/r/1457667498-37357-1-git-send-email-fenghua.yu@intel.com [ Added more detailed feature descriptions. 
] Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index d11a3aa..9e0567f 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -220,6 +220,7 @@ #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ #define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ @@ -230,6 +231,8 @@ #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ #define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ #define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ +#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ +#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ /* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ #define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d425cda5..6e8354f 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -51,6 +51,9 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_AVX512PF); setup_clear_cpu_cap(X86_FEATURE_AVX512ER); setup_clear_cpu_cap(X86_FEATURE_AVX512CD); + setup_clear_cpu_cap(X86_FEATURE_AVX512DQ); + setup_clear_cpu_cap(X86_FEATURE_AVX512BW); + setup_clear_cpu_cap(X86_FEATURE_AVX512VL); setup_clear_cpu_cap(X86_FEATURE_MPX); setup_clear_cpu_cap(X86_FEATURE_XGETBV1); } -- cgit v0.10.2 From d10ef6f9380b8853c4b48eb104268fccfdc0b0c5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Mar 2016 10:36:13 +0100 Subject: cpu/hotplug: Document states better Requested-by: Peter Zijlstra Signed-off-by: Thomas Gleixner diff --git a/kernel/cpu.c b/kernel/cpu.c index bcee286..6ea42e8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1172,6 +1172,10 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown = NULL, .cant_stop = true, }, + /* + * Preparatory and dead notifiers. Will be replaced once the notifiers + * are converted to states. + */ [CPUHP_NOTIFY_PREPARE] = { .name = "notify:prepare", .startup = notify_prepare, @@ -1179,12 +1183,17 @@ static struct cpuhp_step cpuhp_bp_states[] = { .skip_onerr = true, .cant_stop = true, }, + /* Kicks the plugged cpu into life */ [CPUHP_BRINGUP_CPU] = { .name = "cpu:bringup", .startup = bringup_cpu, .teardown = NULL, .cant_stop = true, }, + /* + * Handled on controll processor until the plugged processor manages + * this itself. + */ [CPUHP_TEARDOWN_CPU] = { .name = "cpu:teardown", .startup = NULL, @@ -1197,6 +1206,23 @@ static struct cpuhp_step cpuhp_bp_states[] = { /* Application processor state steps */ static struct cpuhp_step cpuhp_ap_states[] = { #ifdef CONFIG_SMP + /* Final state before CPU kills itself */ + [CPUHP_AP_IDLE_DEAD] = { + .name = "idle:dead", + }, + /* + * Last state before CPU enters the idle loop to die. Transient state + * for synchronization. + */ + [CPUHP_AP_OFFLINE] = { + .name = "ap:offline", + .cant_stop = true, + }, + /* + * Low level startup/teardown notifiers. Run with interrupts + * disabled. 
Will be removed once the notifiers are converted to + * states. + */ [CPUHP_AP_NOTIFY_STARTING] = { .name = "notify:starting", .startup = notify_starting, @@ -1204,17 +1230,32 @@ static struct cpuhp_step cpuhp_ap_states[] = { .skip_onerr = true, .cant_stop = true, }, + /* Entry state on starting. Interrupts enabled from here on. Transient + * state for synchronsization */ + [CPUHP_AP_ONLINE] = { + .name = "ap:online", + }, + /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot:threads", .startup = smpboot_unpark_threads, .teardown = NULL, }, + /* + * Online/down_prepare notifiers. Will be removed once the notifiers + * are converted to states. + */ [CPUHP_AP_NOTIFY_ONLINE] = { .name = "notify:online", .startup = notify_online, .teardown = notify_down_prepare, }, #endif + /* + * The dynamically registered state space is here + */ + + /* CPU is fully up and running. */ [CPUHP_ONLINE] = { .name = "online", .startup = NULL, @@ -1232,7 +1273,11 @@ static int cpuhp_cb_check(enum cpuhp_state state) static bool cpuhp_is_ap_state(enum cpuhp_state state) { - return state > CPUHP_BRINGUP_CPU; + /* + * The extra check for CPUHP_TEARDOWN_CPU is only for documentation + * purposes as that state is handled explicitely in cpu_down. + */ + return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; } static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) -- cgit v0.10.2 From 90d0f0f11588ec692c12f9009089b398be395184 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 12 Mar 2016 22:56:19 +0800 Subject: block: don't optimize for non-cloned bio in bio_get_last_bvec() For !BIO_CLONED bio, we can use .bi_vcnt safely, but it doesn't mean we can just simply return .bi_io_vec[.bi_vcnt - 1] because the start postion may have been moved in the middle of the bvec, such as splitting in the middle of bvec. Fixes: 7bcd79ac50d9(block: bio: introduce helpers to get the 1st and last bvec) Cc: stable@vger.kernel.org Reported-by: Kent Overstreet Signed-off-by: Ming Lei Signed-off-by: Jens Axboe diff --git a/include/linux/bio.h b/include/linux/bio.h index cb68888..88bc64f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -320,11 +320,6 @@ static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) struct bvec_iter iter = bio->bi_iter; int idx; - if (!bio_flagged(bio, BIO_CLONED)) { - *bv = bio->bi_io_vec[bio->bi_vcnt - 1]; - return; - } - if (unlikely(!bio_multiple_segments(bio))) { *bv = bio_iovec(bio); return; -- cgit v0.10.2 From ba9e72c2290fcdeeff5f0105590a03e0f341b059 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Tue, 8 Mar 2016 00:06:36 +0200 Subject: MIPS: Fix build with DEBUG_ZBOOT and MACH_JZ4780 Ingenic SoC declares ZBOOT support, but debug definitions are missing for MACH_JZ4780 resulting in a build failure when DEBUG_ZBOOT is set. The UART addresses are same as with JZ4740, so fix by covering JZ4780 with those as well. 
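Rendered as plain source rather than a diff, the fix is simply a wider preprocessor guard so that JZ4780 reuses the JZ4740 UART base for the zboot debug output (post-patch state of the guard; the header providing JZ4740_UART0_BASE_ADDR is the one included, elided, in the diff below):

  /* Both Ingenic SoCs expose the debug UART at the JZ4740 address. */
  #if defined(CONFIG_MACH_JZ4740) || defined(CONFIG_MACH_JZ4780)
  #define PORT(offset) (CKSEG1ADDR(JZ4740_UART0_BASE_ADDR) + (4 * offset))
  #endif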
Signed-off-by: Aaro Koskinen Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/12830/ Signed-off-by: Ralf Baechle diff --git a/arch/mips/boot/compressed/uart-16550.c b/arch/mips/boot/compressed/uart-16550.c index 408799a..f752114 100644 --- a/arch/mips/boot/compressed/uart-16550.c +++ b/arch/mips/boot/compressed/uart-16550.c @@ -17,7 +17,7 @@ #define PORT(offset) (CKSEG1ADDR(AR7_REGS_UART0) + (4 * offset)) #endif -#ifdef CONFIG_MACH_JZ4740 +#if defined(CONFIG_MACH_JZ4740) || defined(CONFIG_MACH_JZ4780) #include #define PORT(offset) (CKSEG1ADDR(JZ4740_UART0_BASE_ADDR) + (4 * offset)) #endif -- cgit v0.10.2 From 4b7b1ef2c2f83d702272555e8adb839a50ba0f8e Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 8 Mar 2016 16:47:53 +0000 Subject: ld-version: Fix awk regex compile failure The ld-version.sh script fails on some versions of awk with the following error, resulting in build failures for MIPS: awk: scripts/ld-version.sh: line 4: regular expression compile failed (missing '(') This is due to the regular expression ".*)", meant to strip off the beginning of the ld version string up to the close bracket, however brackets have a meaning in regular expressions, so lets escape it so that awk doesn't expect a corresponding open bracket. Fixes: ccbef1674a15 ("Kbuild, lto: add ld-version and ld-ifversion ...") Reported-by: Geert Uytterhoeven Signed-off-by: James Hogan Tested-by: Michael S. Tsirkin Acked-by: Michael S. Tsirkin Tested-by: Sudip Mukherjee Cc: Michal Marek Cc: Andi Kleen Cc: Geert Uytterhoeven Cc: linux-mips@linux-mips.org Cc: linux-kbuild@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # 4.4.x- Patchwork: https://patchwork.linux-mips.org/patch/12838/ Signed-off-by: Ralf Baechle diff --git a/scripts/ld-version.sh b/scripts/ld-version.sh index d154f08..7bfe9fa 100755 --- a/scripts/ld-version.sh +++ b/scripts/ld-version.sh @@ -1,7 +1,7 @@ #!/usr/bin/awk -f # extract linker version number from stdin and turn into single number { - gsub(".*)", ""); + gsub(".*\\)", ""); gsub(".*version ", ""); gsub("-.*", ""); split($1,a, "."); -- cgit v0.10.2 From 7a50e4688dabb8005df39b2b992d76629b8af8aa Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Sun, 6 Mar 2016 22:28:56 +0100 Subject: MIPS: Fix build error when SMP is used without GIC The MIPS_GIC_IPI should only be selected when MIPS_GIC is also selected, otherwise it results in a compile error. smp-gic.c uses some functions from include/linux/irqchip/mips-gic.h like plat_ipi_call_int_xlate() which are only added to the header file when MIPS_GIC is set. The Lantiq SoC does not use the GIC, but supports SMP. 
The calls to the functions from smp-gic.c are already protected by some #ifdefs. The first part of this was introduced in commit 72e20142b2bf ("MIPS: Move GIC IPI functions out of smp-cmp.c") Signed-off-by: Hauke Mehrtens Cc: Paul Burton Cc: stable@vger.kernel.org # v3.15+ Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/12774/ Signed-off-by: Ralf Baechle diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 74a3db9..d3da79d 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2169,7 +2169,7 @@ config MIPS_MT_SMP select CPU_MIPSR2_IRQ_VI select CPU_MIPSR2_IRQ_EI select SYNC_R4K - select MIPS_GIC_IPI + select MIPS_GIC_IPI if MIPS_GIC select MIPS_MT select SMP select SMP_UP @@ -2267,7 +2267,7 @@ config MIPS_VPE_APSP_API_MT config MIPS_CMP bool "MIPS CMP framework support (DEPRECATED)" depends on SYS_SUPPORTS_MIPS_CMP && !CPU_MIPSR6 - select MIPS_GIC_IPI + select MIPS_GIC_IPI if MIPS_GIC select SMP select SYNC_R4K select SYS_SUPPORTS_SMP @@ -2287,7 +2287,7 @@ config MIPS_CPS select MIPS_CM select MIPS_CPC select MIPS_CPS_PM if HOTPLUG_CPU - select MIPS_GIC_IPI + select MIPS_GIC_IPI if MIPS_GIC select SMP select SYNC_R4K if (CEVT_R4K || CSRC_R4K) select SYS_SUPPORTS_HOTPLUG_CPU @@ -2306,6 +2306,7 @@ config MIPS_CPS_PM bool config MIPS_GIC_IPI + depends on MIPS_GIC bool config MIPS_CM -- cgit v0.10.2 From d825c06bfe8b885b797f917ad47365d0e9c21fbb Mon Sep 17 00:00:00 2001 From: James Hogan Date: Fri, 4 Mar 2016 10:10:51 +0000 Subject: MIPS: smp.c: Fix uninitialised temp_foreign_map When calculate_cpu_foreign_map() recalculates the cpu_foreign_map cpumask it uses the local variable temp_foreign_map without initialising it to zero. Since the calculation only ever sets bits in this cpumask any existing bits at that memory location will remain set and find their way into cpu_foreign_map too. This could potentially lead to cache operations suboptimally doing smp calls to multiple VPEs in the same core, even though the VPEs share primary caches. Therefore initialise temp_foreign_map using cpumask_clear() before use. Fixes: cccf34e9411c ("MIPS: c-r4k: Fix cache flushing for MT cores") Signed-off-by: James Hogan Cc: Paul Burton Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/12759/ Signed-off-by: Ralf Baechle diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index bd4385a..2b521e0 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -121,6 +121,7 @@ static inline void calculate_cpu_foreign_map(void) cpumask_t temp_foreign_map; /* Re-calculate the mask */ + cpumask_clear(&temp_foreign_map); for_each_online_cpu(i) { core_present = 0; for_each_cpu(k, &temp_foreign_map) -- cgit v0.10.2 From b562e44f507e863c6792946e4e1b1449fbbac85d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 13 Mar 2016 21:28:54 -0700 Subject: Linux 4.5 diff --git a/Makefile b/Makefile index 2d519d2..7b3ecdc 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 5 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = NAME = Blurry Fish Butt # *DOCUMENTATION* -- cgit v0.10.2
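As a closing note on the temp_foreign_map fix above: an on-stack cpumask_t starts with indeterminate bits, so any routine that only ever sets bits in it must clear it first. A minimal sketch of the pattern (is_foreign() is a hypothetical stand-in for the core/cache check done in calculate_cpu_foreign_map()):

  static void sketch_build_mask(struct cpumask *result,
                                bool (*is_foreign)(int cpu))
  {
          cpumask_t temp;                 /* on-stack: contents indeterminate */
          int cpu;

          cpumask_clear(&temp);           /* without this, stale bits leak into result */
          for_each_online_cpu(cpu) {
                  if (is_foreign(cpu))
                          cpumask_set_cpu(cpu, &temp);
          }
          cpumask_copy(result, &temp);
  }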